From 16d990a15491cf76cd6eef0846e1b4100e63261a Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Wed, 15 Apr 2026 17:26:55 +0800 Subject: [PATCH 001/377] KVM: s390: pci: fix GAIT table indexing due to double-scaling pointer arithmetic kvm_s390_pci_aif_enable(), kvm_s390_pci_aif_disable(), and aen_host_forward() index the GAIT by manually multiplying the index with sizeof(struct zpci_gaite). Since aift->gait is already a struct zpci_gaite pointer, this double-scales the offset, accessing element aisb*16 instead of aisb. This causes out-of-bounds accesses when aisb >= 32 (with ZPCI_NR_DEVICES=512) Fix by removing the erroneous sizeof multiplication. Fixes: 3c5a1b6f0a18 ("KVM: s390: pci: provide routines for enabling/disabling interrupt forwarding") Fixes: 73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications") Reported-by: Yuhao Jiang Cc: stable@vger.kernel.org Signed-off-by: Junrui Luo Reviewed-by: Christian Borntraeger Reviewed-by: Matthew Rosato Tested-by: Matthew Rosato Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 +-- arch/s390/kvm/pci.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7cb8ce833b62..f48f25c7dc8f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3307,8 +3307,7 @@ static void aen_host_forward(unsigned long si) struct zpci_gaite *gaite; struct kvm *kvm; - gaite = (struct zpci_gaite *)aift->gait + - (si * sizeof(struct zpci_gaite)); + gaite = aift->gait + si; if (gaite->count == 0) return; if (gaite->aisb != 0) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 86d93e8dddae..eed45af1a92d 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -290,8 +290,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, phys_to_virt(fib->fmt0.aibv)); spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; /* If assist not requested, host will get all alerts */ if (assist) @@ -357,8 +356,7 @@ static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) if (zdev->kzdev->fib.fmt0.aibv == 0) goto out; spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; isc = gaite->gisc; gaite->count--; if (gaite->count == 0) { From b0bf14546bcefa4ea49f5efcd7db2a99f0cabde9 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 3 Apr 2026 11:20:55 -0400 Subject: [PATCH 002/377] nfsd: fix GET_DIR_DELEGATION when VFS leases are disabled When leases are disabled on the server, running xfstest generic/309 leads to an error because GET_DIR_DELEGATION returns EINVAL. nfsd_get_dir_deleg() can fail in several ways: like memory allocation and unable to get a lease because either leases are disable or it's already held. Currently only the condition "already held" is translated to returning directory-delegation-is-unavailable error. However, other failure conditions are likely temporary and thus should result in the same kind of error. Fixes: 8b99f6a8c116 ("nfsd: wire up GET_DIR_DELEGATION handling") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 85e94c30285a..2797da8cc950 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2535,10 +2535,6 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, dd = nfsd_get_dir_deleg(cstate, gdd, nf); nfsd_file_put(nf); if (IS_ERR(dd)) { - int err = PTR_ERR(dd); - - if (err != -EAGAIN) - return nfserrno(err); gdd->gddrnf_status = GDD4_UNAVAIL; return nfs_ok; } From bfb4dc533d0abaca07013dd71e6b5c6f182232b3 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Fri, 10 Apr 2026 18:11:06 +0800 Subject: [PATCH 003/377] xfs: remove the meaningless XFS_ALLOC_FLAG_FREEING In xfs_refcount_finish_one(), there's no need to pass XFS_ALLOC_FLAG_FREEING to xfs_alloc_read_agf(). So remove it. Signed-off-by: Jinliang Zheng Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_refcount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 40c7f0ff6cf3..0ec6ccd8b4dc 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1414,8 +1414,7 @@ xfs_refcount_finish_one( if (rcur == NULL) { struct xfs_perag *pag = to_perag(ri->ri_group); - error = xfs_alloc_read_agf(pag, tp, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; From 00dd8d7ec5253c6273023a0fd6dc08683e0bdfef Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:13 +0100 Subject: [PATCH 004/377] xfs: zero entire directory data block header region at init xfs_dir3_data_init currently zeroes only the xfs_dir3_blk_hdr portion of the directory data block header, then manually initializes the bestfree entries in a loop. This leaves the pad field in xfs_dir3_data_hdr uninitialized and requires explicit zeroing of each bestfree slot. Zero the entire header region (geo->data_entry_offset bytes) unconditionally before setting individual fields. This covers all current and future header fields, all padding (implicit and explicit), and the bestfree array, so the manual zeroing loop for bestfree can be removed. Suggested-by: Dave Chinner Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2_data.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 80ba94f51e5c..35ff119aa84b 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -728,7 +728,6 @@ xfs_dir3_data_init( struct xfs_dir2_data_unused *dup; struct xfs_dir2_data_free *bf; int error; - int i; /* * Get the buffer set up for the block. @@ -741,13 +740,16 @@ xfs_dir3_data_init( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* - * Initialize the header. + * Initialize the whole directory header region to zero + * so that all padding, bestfree entries, and any + * future header fields are clean. */ hdr = bp->b_addr; + memset(hdr, 0, geo->data_entry_offset); + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); @@ -759,10 +761,6 @@ xfs_dir3_data_init( bf = xfs_dir2_data_bestfree_p(mp, hdr); bf[0].offset = cpu_to_be16(geo->data_entry_offset); bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset); - for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - bf[i].length = 0; - bf[i].offset = 0; - } /* * Set up an unused entry for the block's body. From 8fbb1877dfa5e26bda1baf8cc6abd3f805098486 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:14 +0100 Subject: [PATCH 005/377] xfs: zero directory data block padding on write verification Old kernels did not zero the pad field in xfs_dir3_data_hdr when initializing directory data blocks, so existing filesystems may have non-zero padding on disk. Zero the pad field in xfs_dir3_data_write_verify alongside the existing LSN and checksum updates. The pad field is pure alignment padding with no runtime meaning, so zeroing it during write verification is safe and has no additional I/O cost. This lets filesystems gradually self-heal stale non-zero padding as directories are modified, without requiring an explicit repair pass. Suggested-by: Dave Chinner Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2_data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 35ff119aa84b..aecbab61014c 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -382,6 +382,7 @@ xfs_dir3_data_write_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_dir3_data_hdr *datahdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_dir3_data_verify(bp); @@ -396,6 +397,11 @@ xfs_dir3_data_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Zero padding that may be stale from old kernels. + */ + datahdr3->pad = 0; + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } From 939919ccddfcc379bb82b1f90f732d9a5cb32cc8 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:15 +0100 Subject: [PATCH 006/377] xfs: check directory data block header padding in scrub Add the missing scrub check for the pad field in directory data block headers. Old kernels may have written non-zero padding without issue, and the write path now self-heals stale padding on modification. Flag non-zero padding as an optimization opportunity (preen) rather than corruption. Add xchk_fblock_set_preen helper for reporting file fork block issues that could be optimized. The trace event xchk_fblock_preen already exists. Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/common.c | 11 +++++++++++ fs/xfs/scrub/common.h | 2 ++ fs/xfs/scrub/dir.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 20e63069088b..3d40cb0b2496 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -251,6 +251,17 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record a block indexed by a file fork that could be optimized. */ +void +xchk_fblock_set_preen( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, whichfork, offset, __return_address); +} + /* Record something being wrong with the filesystem primary superblock. */ void xchk_set_corrupt( diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index f2ecc68538f0..b494d747c008 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,6 +25,8 @@ bool xchk_fblock_xref_process_error(struct xfs_scrub *sc, void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_fblock_set_preen(struct xfs_scrub *sc, + int whichfork, xfs_fileoff_t offset); void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index e09724cd3725..09715a4aa154 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -492,7 +492,12 @@ xchk_directory_data_bestfree( goto out; xchk_buffer_recheck(sc, bp); - /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (xfs_has_crc(sc->mp)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad) + xchk_fblock_set_preen(sc, XFS_DATA_FORK, lblk); + } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out_buf; From 592975da8c3ca87b043077e6eafa37665eae7936 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 15 Apr 2026 09:45:14 +1000 Subject: [PATCH 007/377] xfs: fix memory leak on error in xfs_alloc_zone_info() Currently, the 0th index of the zi_used_bucket_bitmap array is not freed on error due to the pre-decrement then evaluate semantic of the while loop used in xfs_alloc_zone_info(). Fix it by allowing for the i == 0 case to be covered. Fixes: 080d01c41d44 ("xfs: implement zoned garbage collection") Cc: stable@vger.kernel.org # v6.15 Reviewed-by: Damien Le Moal Reviewed-by: Carlos Maiolino Signed-off-by: Wilfred Mallawa Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index a851b98143c0..c64f9ab743a6 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1217,7 +1217,7 @@ xfs_alloc_zone_info( return zi; out_free_bitmaps: - while (--i > 0) + while (--i >= 0) kvfree(zi->zi_used_bucket_bitmap[i]); kfree(zi); return NULL; From af47a4be6a90c8bfc874f9994ac9c15813b9718b Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Fri, 17 Apr 2026 12:16:30 +1000 Subject: [PATCH 008/377] xfs: fix memory leak for data allocated by xfs_zone_gc_data_alloc() In xfs_zone_gc_mount(), on error, a struct xfs_zone_gc_data allocated with xfs_zone_gc_data_alloc() is freed with kfree(), however, this doesn't free the underlying folios or the rmap_irecs. Use xfs_zone_gc_data_free() to correctly free this memory. Fixes: 080d01c41d44 ("xfs: implement zoned garbage collection") Cc: stable@vger.kernel.org # v6.15 Signed-off-by: Wilfred Mallawa Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index fedcc47048af..c8a1d5c0332c 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -1221,7 +1221,7 @@ xfs_zone_gc_mount( if (data->oz) xfs_open_zone_put(data->oz); out_free_gc_data: - kfree(data); + xfs_zone_gc_data_free(data); return error; } From fca20fcb76a20655daf18738f4a88c638a6bb64c Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Fri, 10 Apr 2026 18:06:15 +0100 Subject: [PATCH 009/377] xfs: check da node block pad field during scrub The da node block header (xfs_da3_node_hdr) contains a __pad32 field that should always be zero. Add a check for this during directory and attribute btree scrubbing. Since old kernels may have written non-zero padding without issues, flag this as an optimization opportunity (preen) rather than corruption. Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/dabtree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 1a71d36898b1..c2d6ad59d03e 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -454,7 +454,12 @@ xchk_da_btree_block( } } - /* XXX: Check hdr3.pad32 once we know how to fix it. */ + if (xfs_has_crc(ip->i_mount)) { + struct xfs_da3_node_hdr *nodehdr3 = blk->bp->b_addr; + + if (nodehdr3->__pad32) + xchk_da_set_preen(ds, level); + } break; default: xchk_da_set_corrupt(ds, level); From 0cfe660559e857d7c00ab86c73e4510ce069086f Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Fri, 24 Apr 2026 15:39:00 -0400 Subject: [PATCH 010/377] KVM: s390: pci: Fix aisb calculation The current implementation of aisb calculation will erroneously index via an unsigned long * as well as multiply by 8B for every 64-bits in the offset; only one or the other is required. This throws off aisb calculations once the number of devices exceeds 64, and can result in out-of-bounds access as well as failure to indicate summary bits associated with those devices in guests. Fix this by converting to a physical address before applying the offset, as is already done in arch/s390/pci/pci_irq.c. Fixes: 3c5a1b6f0a18 ("KVM: s390: pci: provide routines for enabling/disabling interrupt forwarding") Signed-off-by: Matthew Rosato Reviewed-by: Niklas Schnelle Signed-off-by: Christian Borntraeger --- arch/s390/kvm/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index eed45af1a92d..5b075c38998e 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -166,7 +166,7 @@ static int kvm_zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.noi = airq_iv_end(zdev->aibv); fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); fib.fmt0.aibvo = 0; - fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; fib.gd = zdev->gisa; @@ -308,7 +308,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, /* Update guest FIB for re-issue */ fib->fmt0.aisbo = zdev->aisb & 63; - fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib->fmt0.isc = gisc; /* Save some guest fib values in the host for later use */ From 87e63466c9fc30c3d95b8741c3df1f1ff01d7f23 Mon Sep 17 00:00:00 2001 From: Ravi Singh Date: Wed, 22 Apr 2026 07:39:59 +0000 Subject: [PATCH 011/377] xfs: flush delalloc blocks on ENOSPC in xfs_trans_alloc_icreate xfs_trans_alloc_icreate() can fail with ENOSPC when delalloc reservations have consumed most of the available block count (fdblocks). xfs_trans_alloc() already retries internally with xfs_blockgc_flush_all(), but that only trims post-EOF speculative preallocation and may not free enough space for the transaction reservation. Add a retry with xfs_flush_inodes() when xfs_trans_alloc() returns ENOSPC. This forces writeback of all dirty inodes via sync_inodes_sb(), converting delalloc reservations to real allocations and freeing the over-reserved portion back to fdblocks. This fixes all callers of xfs_trans_alloc_icreate() and removes the existing caller-level retry from xfs_create(), which is now handled centrally. Signed-off-by: Ravi Singh Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 6 ------ fs/xfs/xfs_trans.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index beaa26ec62da..9978ac1422fc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -699,12 +699,6 @@ xfs_create( */ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, &tp); - if (error == -ENOSPC) { - /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(mp); - error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, - resblks, &tp); - } if (error) goto out_parent; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bcc470f56e46..148cc32449c1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1199,10 +1199,21 @@ xfs_trans_alloc_icreate( { struct xfs_trans *tp; bool retried = false; + bool flushed = false; int error; retry: error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error == -ENOSPC && !flushed) { + /* + * Flush all delalloc blocks to reclaim space from speculative + * preallocation. This is similar to the quota retry below + * but targets FS-wide ENOSPC. + */ + xfs_flush_inodes(mp); + flushed = true; + goto retry; + } if (error) return error; From a6715d7ec472a476db17787697a4abda62962284 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Fri, 10 Apr 2026 01:16:05 +0000 Subject: [PATCH 012/377] kho: skip KHO for crash kernel kho_fill_kimage() unconditionally populates the kimage with KHO metadata for every kexec image type. When the image is a crash kernel, this can be problematic as the crash kernel can run in a small reserved region and the KHO scratch areas can sit outside it. The crash kernel then faults during kho_memory_init() when it tries phys_to_virt() on the KHO FDT address: Unable to handle kernel paging request at virtual address xxxxxxxx ... fdt_offset_ptr+... fdt_check_node_offset_+... fdt_first_property_offset+... fdt_get_property_namelen_+... fdt_getprop+... kho_memory_init+... mm_core_init+... start_kernel+... kho_locate_mem_hole() already skips KHO logic for KEXEC_TYPE_CRASH images, but kho_fill_kimage() was missing the same guard. As kho_fill_kimage() is the single point that populates image->kho.fdt and image->kho.scratch, fixing it here is sufficient for both arm64 and x86 as the FDT and boot_params path are bailing out when these fields are unset. Fixes: d7255959b69a ("kho: allow kexec load before KHO finalization") Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Link: https://patch.msgid.link/20260410011609.1103-1-epetron@amazon.de Signed-off-by: Mike Rapoport (Microsoft) --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 94762de1fe5f..4fde8325c49f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1702,7 +1702,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_enable || image->type == KEXEC_TYPE_CRASH) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); From 0fb1daf0b78d0e23b63b6b65de56d4a3fd83bc14 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 15 Apr 2026 06:23:00 +0100 Subject: [PATCH 013/377] mm/memfd_luo: report error when restoring a folio fails mid-loop memfd_luo_retrieve_folios() initialises err to -EIO, but the per-iteration calls to mem_cgroup_charge(), shmem_add_to_page_cache() and shmem_inode_acct_blocks() reuse and overwrite err. Once any iteration completes successfully, err becomes zero. If a later iteration's kho_restore_folio() returns NULL, the failure path jumps to put_folios without resetting err, so the function returns 0. The caller memfd_luo_retrieve() then takes the success path, sets args->file and reports the restore as successful, leaving userspace with a partially populated memfd and no indication that anything went wrong. Set err to -EIO in the kho_restore_folio() failure branch so the error is propagated to the caller. Signed-off-by: David Carlier Reviewed-by: Pratyush Yadav Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Link: https://patch.msgid.link/20260415052300.362539-1-devnexen@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- mm/memfd_luo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index b02b503c750d..35d1247281e0 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -427,6 +427,7 @@ static int memfd_luo_retrieve_folios(struct file *file, if (!folio) { pr_err("Unable to restore folio at physical address: %llx\n", phys); + err = -EIO; goto put_folios; } index = pfolio->index; From 278dd0487907112de8e34e1a97ac6145a8081523 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Fri, 10 Apr 2026 21:53:54 +0200 Subject: [PATCH 014/377] HID: sony: fix incorrect force-feedback check in sony_suspend() This commit fixes the incorrect force-feedback check in sony_suspend(), without this the check will always be true due to checking a constant define that is never 0. Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index b5e724676c1d..23db406092ef 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2456,11 +2456,10 @@ static void sony_remove(struct hid_device *hdev) static int sony_suspend(struct hid_device *hdev, pm_message_t message) { #ifdef CONFIG_SONY_FF + struct sony_sc *sc = hid_get_drvdata(hdev); /* On suspend stop any running force-feedback events */ - if (SONY_FF_SUPPORT) { - struct sony_sc *sc = hid_get_drvdata(hdev); - + if (sc->quirks & SONY_FF_SUPPORT) { sc->left = sc->right = 0; sony_send_output_report(sc); } From 80c4bbb2b38513e9c3d84805fa61a0ee16d79c45 Mon Sep 17 00:00:00 2001 From: Michael Zaidman Date: Sat, 11 Apr 2026 09:24:37 +0300 Subject: [PATCH 015/377] HID: ft260: validate i2c input report length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two checks to ft260_raw_event() to prevent out-of-bounds reads from malicious or malfunctioning devices: First, reject reports shorter than the 2-byte header (report ID + length fields). Without this, even accessing xfer->length on a 1-byte report is an OOB read. Second, validate xfer->length against the actual data capacity of the received HID report. Each I2C data report ID (0xD0 through 0xDE) defines a different report size in the HID descriptor, so the available payload varies per report. A corrupted length field could cause memcpy to read beyond the report buffer. Reported-by: Sebastián Josué Alba Vives Signed-off-by: Michael Zaidman Signed-off-by: Jiri Kosina --- drivers/hid/hid-ft260.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-ft260.c b/drivers/hid/hid-ft260.c index 333341e80b0e..70e2eedb465a 100644 --- a/drivers/hid/hid-ft260.c +++ b/drivers/hid/hid-ft260.c @@ -1068,10 +1068,22 @@ static int ft260_raw_event(struct hid_device *hdev, struct hid_report *report, struct ft260_device *dev = hid_get_drvdata(hdev); struct ft260_i2c_input_report *xfer = (void *)data; + if (size < offsetof(struct ft260_i2c_input_report, data)) { + hid_err(hdev, "short report %d\n", size); + return -1; + } + if (xfer->report >= FT260_I2C_REPORT_MIN && xfer->report <= FT260_I2C_REPORT_MAX) { - ft260_dbg("i2c resp: rep %#02x len %d\n", xfer->report, - xfer->length); + ft260_dbg("i2c resp: rep %#02x len %d size %d\n", + xfer->report, xfer->length, size); + + if (xfer->length > size - + offsetof(struct ft260_i2c_input_report, data)) { + hid_err(hdev, "report %#02x: length %d exceeds HID report size\n", + xfer->report, xfer->length); + return -1; + } if ((dev->read_buf == NULL) || (xfer->length > dev->read_len - dev->read_idx)) { From 0f2b8466fb744a8b3313a9c1e2008f8cd53b2db7 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sat, 11 Apr 2026 17:32:48 +0200 Subject: [PATCH 016/377] HID: sony: remove unneeded WARN_ON() in sony_leds_init() This commit removes the unneeded WARN_ON() macro usage in sony_leds_init(), this is unneeded because the sony_leds_init() function call is already gated behind a SONY_LED_SUPPORT check in sony_input_configured() Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 23db406092ef..6a860b9ef677 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1640,9 +1640,6 @@ static int sony_leds_init(struct sony_sc *sc) u8 max_brightness[MAX_LEDS] = { [0 ... (MAX_LEDS - 1)] = 1 }; u8 use_hw_blink[MAX_LEDS] = { 0 }; - if (WARN_ON(!(sc->quirks & SONY_LED_SUPPORT))) - return -EINVAL; - if (sc->quirks & BUZZ_CONTROLLER) { sc->led_count = 4; use_color_names = 0; From a4170b63eda999d20ad6dc39ddc3ce5c1ac619e6 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sun, 12 Apr 2026 03:08:06 +0200 Subject: [PATCH 017/377] HID: sony: add missing size validation for SMK-Link remotes This commit adds the missing size validation for SMK-Link remotes in sony_raw_event(), this prevents a malicious device from allowing hid-sony to read out of bounds of the provided buffer. I do not own these devices so the size check only forces that the buffer is large enough for nsg_mrxu_parse_report(). Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 6a860b9ef677..13fe7a3e57d7 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1169,10 +1169,9 @@ static int sony_raw_event(struct hid_device *hdev, struct hid_report *report, sixaxis_parse_report(sc, rd, size); } else if ((sc->quirks & MOTION_CONTROLLER_BT) && rd[0] == 0x01 && size == 49) { sixaxis_parse_report(sc, rd, size); - } else if ((sc->quirks & NAVIGATION_CONTROLLER) && rd[0] == 0x01 && - size == 49) { + } else if ((sc->quirks & NAVIGATION_CONTROLLER) && rd[0] == 0x01 && size == 49) { sixaxis_parse_report(sc, rd, size); - } else if ((sc->quirks & NSG_MRXU_REMOTE) && rd[0] == 0x02) { + } else if ((sc->quirks & NSG_MRXU_REMOTE) && rd[0] == 0x02 && size >= 12) { nsg_mrxu_parse_report(sc, rd, size); return 1; } else if ((sc->quirks & RB4_GUITAR_PS4_USB) && rd[0] == 0x01 && size == 64) { From 12bd440b66ed8968afffc46928233967b5b79b98 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sun, 12 Apr 2026 03:12:03 +0200 Subject: [PATCH 018/377] HID: sony: add missing size validation for Rock Band 3 Pro instruments This commit adds the missing size validation for Rock Band 3 PS3 Pro instruments in sony_raw_event(), this prevents a malicious device from allowing hid-sony to read out of bounds of the provided buffer. Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 13fe7a3e57d7..315343415e8f 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1188,7 +1188,7 @@ static int sony_raw_event(struct hid_device *hdev, struct hid_report *report, /* Rock Band 3 PS3 Pro instruments set rd[24] to 0xE0 when they're * sending full reports, and 0x02 when only sending navigation. */ - if ((sc->quirks & RB3_PRO_INSTRUMENT) && rd[24] == 0x02) { + if ((sc->quirks & RB3_PRO_INSTRUMENT) && size >= 25 && rd[24] == 0x02) { /* Only attempt to enable full report every 8 seconds */ if (time_after(jiffies, sc->rb3_pro_poke_jiffies)) { sc->rb3_pro_poke_jiffies = jiffies + secs_to_jiffies(8); From 55ce1858848132ed074fe907f00b5ce1ccab0ce1 Mon Sep 17 00:00:00 2001 From: Damien Dejean Date: Tue, 14 Apr 2026 13:38:58 +0000 Subject: [PATCH 019/377] HID: elan: Add support for ELAN SB974D touchpad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Elan SB974D touchpad uses ELAN_MT_I2C format to send HID reports. Add an entry to match for the device and parse its vendor specific format. Signed-off-by: Damien Dejean Signed-off-by: Kornel Dulęba Signed-off-by: Jiri Kosina --- drivers/hid/hid-elan.c | 1 + drivers/hid/hid-ids.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-elan.c b/drivers/hid/hid-elan.c index 76d93fc48f6a..0190ad567ce4 100644 --- a/drivers/hid/hid-elan.c +++ b/drivers/hid/hid-elan.c @@ -513,6 +513,7 @@ static const struct hid_device_id elan_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_HP_X2_10_COVER), .driver_data = ELAN_HAS_LED }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_TOSHIBA_CLICK_L9W) }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_SB974D) }, { } }; MODULE_DEVICE_TABLE(hid, elan_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0cf63742315b..8cfec7dced66 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -455,6 +455,7 @@ #define USB_DEVICE_ID_EDIFIER_QR30 0xa101 /* EDIFIER Hal0 2.0 SE */ #define USB_VENDOR_ID_ELAN 0x04f3 +#define USB_DEVICE_ID_SB974D 0x0400 #define USB_DEVICE_ID_TOSHIBA_CLICK_L9W 0x0401 #define USB_DEVICE_ID_HP_X2 0x074d #define USB_DEVICE_ID_HP_X2_10_COVER 0x0755 From 3524900cc571bd922a1a6b6a0eb0c2705cdb3559 Mon Sep 17 00:00:00 2001 From: Matthew Schwartz Date: Mon, 20 Apr 2026 11:15:22 -0700 Subject: [PATCH 020/377] HID: hid-lenovo-go-s: restore OS_TYPE after resume from s2idle The controller MCU does not persist OS_TYPE across power cycles. During s2idle resume, the USB device may be power-cycled, causing the OS_TYPE setting to revert to the default Windows value. Add a reset_resume callback so that this is correctly restored after resume. Fixes: a23f3497bf208c59ad ("HID: hid-lenovo-go-s: Add Lenovo Legion Go S Series HID Driver") Reviewed-by: Derek J. Clark Signed-off-by: Matthew Schwartz Signed-off-by: Jiri Kosina --- drivers/hid/hid-lenovo-go-s.c | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/hid/hid-lenovo-go-s.c b/drivers/hid/hid-lenovo-go-s.c index 01c7bdd4fbe0..ff1782a75191 100644 --- a/drivers/hid/hid-lenovo-go-s.c +++ b/drivers/hid/hid-lenovo-go-s.c @@ -1369,6 +1369,14 @@ static void cfg_setup(struct work_struct *work) "Failed to retrieve IMU Manufacturer: %i\n", ret); return; } + + ret = mcu_property_out(drvdata.hdev, GET_GAMEPAD_CFG, FEATURE_OS_MODE, + NULL, 0); + if (ret) { + dev_err(&drvdata.hdev->dev, + "Failed to retrieve OS Mode: %i\n", ret); + return; + } } static int hid_gos_cfg_probe(struct hid_device *hdev, @@ -1427,6 +1435,27 @@ static void hid_gos_cfg_remove(struct hid_device *hdev) hid_set_drvdata(hdev, NULL); } +static int hid_gos_cfg_reset_resume(struct hid_device *hdev) +{ + u8 os_mode = drvdata.os_mode; + int ret; + + ret = mcu_property_out(drvdata.hdev, SET_GAMEPAD_CFG, + FEATURE_OS_MODE, &os_mode, 1); + if (ret < 0) + return ret; + + ret = mcu_property_out(drvdata.hdev, GET_GAMEPAD_CFG, + FEATURE_OS_MODE, NULL, 0); + if (ret < 0) + return ret; + + if (drvdata.os_mode != os_mode) + return -ENODEV; + + return 0; +} + static int hid_gos_probe(struct hid_device *hdev, const struct hid_device_id *id) { @@ -1481,6 +1510,20 @@ static void hid_gos_remove(struct hid_device *hdev) } } +static int hid_gos_reset_resume(struct hid_device *hdev) +{ + int ep = get_endpoint_address(hdev); + + switch (ep) { + case GO_S_CFG_INTF_IN: + return hid_gos_cfg_reset_resume(hdev); + default: + break; + } + + return 0; +} + static const struct hid_device_id hid_gos_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_QHE, USB_DEVICE_ID_LENOVO_LEGION_GO_S_XINPUT) }, @@ -1496,6 +1539,7 @@ static struct hid_driver hid_lenovo_go_s = { .probe = hid_gos_probe, .remove = hid_gos_remove, .raw_event = hid_gos_raw_event, + .reset_resume = hid_gos_reset_resume, }; module_hid_driver(hid_lenovo_go_s); From ae4ac077332ea3341a0f4c0973556c6b7ac5b7a1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Apr 2026 10:10:02 +0300 Subject: [PATCH 021/377] HID: intel-thc-hid: Intel-quickspi: Fix some error codes If we have a partial read that is supposed to be treated as failure but in this code we forgot to set the error code. Return -EINVAL. Fixes: 9d8d51735a3a ("HID: intel-thc-hid: intel-quickspi: Add HIDSPI protocol implementation") Signed-off-by: Dan Carpenter Reviewed-by: Even Xu Reviewed-by: Mark Pearson Signed-off-by: Jiri Kosina --- drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c index 16f780bc879b..cb19057f1191 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c @@ -94,7 +94,7 @@ static int quickspi_get_device_descriptor(struct quickspi_device *qsdev) dev_err_once(qsdev->dev, "Read DEVICE_DESCRIPTOR failed, ret = %d\n", ret); dev_err_once(qsdev->dev, "DEVICE_DESCRIPTOR expected len = %u, actual read = %u\n", input_len, read_len); - return ret; + return ret ?: -EINVAL; } input_rep_type = ((struct input_report_body_header *)read_buf)->input_report_type; @@ -318,7 +318,7 @@ int reset_tic(struct quickspi_device *qsdev) dev_err_once(qsdev->dev, "Read RESET_RESPONSE body failed, ret = %d\n", ret); dev_err_once(qsdev->dev, "RESET_RESPONSE body expected len = %u, actual = %u\n", read_len, actual_read_len); - return ret; + return ret ?: -EINVAL; } input_rep_type = FIELD_GET(HIDSPI_IN_REP_BDY_HDR_REP_TYPE, reset_response); From 487359284509a6745e14b8c0518768bc277809b0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 28 Apr 2026 10:33:16 +0200 Subject: [PATCH 022/377] HID: uclogic: Fix regression of input name assignment The previous fix for adding the devm_kasprintf() return check in the commit bd07f751208b ("HID: uclogic: Add NULL check in uclogic_input_configured()") changed the condition of hi->input->name assignment, and it resulted in missing the proper input device name when no custom suffix is defined. Restore the conditional to the original content to address the regression. Fixes: bd07f751208b ("HID: uclogic: Add NULL check in uclogic_input_configured()") Signed-off-by: Takashi Iwai Signed-off-by: Jiri Kosina --- drivers/hid/hid-uclogic-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-uclogic-core.c b/drivers/hid/hid-uclogic-core.c index bd7f93e96e4e..b73f09d26688 100644 --- a/drivers/hid/hid-uclogic-core.c +++ b/drivers/hid/hid-uclogic-core.c @@ -184,7 +184,9 @@ static int uclogic_input_configured(struct hid_device *hdev, suffix = "System Control"; break; } - } else { + } + + if (suffix) { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); if (!hi->input->name) From f2abc305aa93f5b12d5c929d7a9c1cf7d7fee8af Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 29 Apr 2026 20:38:17 -0600 Subject: [PATCH 023/377] riscv: Define __riscv_copy_{,vec_}{words,bytes}_unaligned() using SYM_TYPED_FUNC_START After commit 67bdd7b01387 ("riscv: Split out measure_cycles() for reuse") and commit c03ad15f7cf6 ("riscv: Reuse measure_cycles() in check_vector_unaligned_access()"), there are CFI failure when booting kernels with CONFIG_CFI=y: CFI failure at measure_cycles+0x38/0xe0 (target: __riscv_copy_words_unaligned+0x0/0x50; expected type: ...) CFI failure at measure_cycles+0x38/0xe0 (target: __riscv_copy_vec_words_unaligned+0x0/0x24; expected type: ...) The __riscv_copy_*_unaligned() functions are now called indirectly but they are not defined with SYM_TYPED_FUNC_START, which is required for assembly functions called indirectly from C to pass CFI checking. Switch to SYM_TYPED_FUNC_START to clear up the CFI failures. Fixes: 67bdd7b01387 ("riscv: Split out measure_cycles() for reuse") Fixes: c03ad15f7cf6 ("riscv: Reuse measure_cycles() in check_vector_unaligned_access()") Signed-off-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Reviewed-by: Nam Cao Link: https://patch.msgid.link/20260406-measure_cycles-cfi-failure-v1-1-03e0234ae02f@kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/copy-unaligned.S | 5 +++-- arch/riscv/kernel/vec-copy-unaligned.S | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S index 2b3d9398c113..90f3549621f7 100644 --- a/arch/riscv/kernel/copy-unaligned.S +++ b/arch/riscv/kernel/copy-unaligned.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2023 Rivos Inc. */ +#include #include #include @@ -9,7 +10,7 @@ /* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of 8 * SZREG */ -SYM_FUNC_START(__riscv_copy_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_words_unaligned) andi a4, a2, ~((8*SZREG)-1) beqz a4, 2f add a3, a1, a4 @@ -41,7 +42,7 @@ SYM_FUNC_END(__riscv_copy_words_unaligned) /* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. */ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S index 7ce4de6f6e69..361039f7b944 100644 --- a/arch/riscv/kernel/vec-copy-unaligned.S +++ b/arch/riscv/kernel/vec-copy-unaligned.S @@ -2,6 +2,7 @@ /* Copyright (C) 2024 Rivos Inc. */ #include +#include #include #include @@ -16,7 +17,7 @@ /* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of WORD_EEW */ -SYM_FUNC_START(__riscv_copy_vec_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_words_unaligned) andi a4, a2, ~(WORD_EEW-1) beqz a4, 2f add a3, a1, a4 @@ -38,7 +39,7 @@ SYM_FUNC_END(__riscv_copy_vec_words_unaligned) /* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. */ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 From d272b8d2dd132de8579e3f79a77bc6ae58214a93 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 30 Apr 2026 12:53:50 +0800 Subject: [PATCH 024/377] riscv: cpufeature: Drop this_hwcap clear in T-Head vector workaround The variable this_hwcap is initialized to 0 for each loop, it is not necessary to do the bit clearance since this_hwcap is still 0 at this point, clearing the source_isa is enough here. Signed-off-by: Hui Wang Link: https://patch.msgid.link/20260430045350.22213-1-hui.wang@canonical.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/cpufeature.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 1734f9a4c2fd..3dc4c0d31550 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -896,10 +896,8 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) * CPU cores with the ratified spec will contain non-zero * marchid. */ - if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) { - this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) clear_bit(RISCV_ISA_EXT_v, source_isa); - } riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); From a2e5b58811c7bb7d63f366f586cc7317f20e62e7 Mon Sep 17 00:00:00 2001 From: Avi Radinsky Date: Wed, 29 Apr 2026 18:35:23 -0400 Subject: [PATCH 025/377] Documentation: riscv: cmodx: fix typos Fix typos in the dynamic ftrace section: atmoic -> atomic (twice), pacthable -> patchable, derect -> directed. Signed-off-by: Avi Radinsky Acked-by: Randy Dunlap Link: https://patch.msgid.link/391d16fb-5f11-45fa-8f3b-1debe095695e@tennr.com Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/cmodx.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/arch/riscv/cmodx.rst b/Documentation/arch/riscv/cmodx.rst index 40ba53bed5df..cbfa812a11b4 100644 --- a/Documentation/arch/riscv/cmodx.rst +++ b/Documentation/arch/riscv/cmodx.rst @@ -21,13 +21,13 @@ call at each patchable function entry, and patches it dynamically at runtime to enable or disable the redirection. In the case of RISC-V, 2 instructions, AUIPC + JALR, are required to compose a function call. However, it is impossible to patch 2 instructions and expect that a concurrent read-side executes them -without a race condition. This series makes atmoic code patching possible in +without a race condition. This series makes atomic code patching possible in RISC-V ftrace. Kernel preemption makes things even worse as it allows the old state to persist across the patching process with stop_machine(). In order to get rid of stop_machine() and run dynamic ftrace with full kernel preemption, we partially initialize each patchable function entry at boot-time, -setting the first instruction to AUIPC, and the second to NOP. Now, atmoic +setting the first instruction to AUIPC, and the second to NOP. Now, atomic patching is possible because the kernel only has to update one instruction. According to Ziccif, as long as an instruction is naturally aligned, the ISA guarantee an atomic update. @@ -36,8 +36,8 @@ By fixing down the first instruction, AUIPC, the range of the ftrace trampoline is limited to +-2K from the predetermined target, ftrace_caller, due to the lack of immediate encoding space in RISC-V. To address the issue, we introduce CALL_OPS, where an 8B naturally align metadata is added in front of each -pacthable function. The metadata is resolved at the first trampoline, then the -execution can be derect to another custom trampoline. +patchable function. The metadata is resolved at the first trampoline, then the +execution can be directed to another custom trampoline. CMODX in the User Space ----------------------- From 4d2b03699460b8fd5df34408a03a84a1a7ff8aa1 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 9 Apr 2026 09:11:39 +0000 Subject: [PATCH 026/377] riscv: errata: Fix bitwise vs logical AND in MIPS errata patching The condition checking whether a specific errata needs patching uses logical AND (&&) instead of bitwise AND (&). Since logical AND only checks that both operands are non-zero, this causes all errata patches to be applied whenever any single errata is detected, rather than only applying the matching one. The SiFive errata implementation correctly uses bitwise AND for the same check. Fixes: 0b0ca959d206 ("riscv: errata: Fix the PAUSE Opcode for MIPS P8700") Signed-off-by: Michael Neuling Assisted-by: Cursor:claude-4.6-opus-high-thinking Link: https://patch.msgid.link/20260409091143.1348853-2-mikey@neuling.org [pjw@kernel.org: fixed checkpatch warning] Signed-off-by: Paul Walmsley --- arch/riscv/errata/mips/errata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c index e984a8152208..2c3dc2259e93 100644 --- a/arch/riscv/errata/mips/errata.c +++ b/arch/riscv/errata/mips/errata.c @@ -57,7 +57,7 @@ void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, } tmp = (1U << alt->patch_id); - if (cpu_req_errata && tmp) { + if (cpu_req_errata & tmp) { mutex_lock(&text_mutex); patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), alt->alt_len); From 6ebcbb53fc9bc30843054ed99fd60b8e542628f4 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 1 May 2026 06:23:20 +0000 Subject: [PATCH 027/377] riscv: Fix register corruption from uninitialized cregs on error compat_riscv_gpr_set() calls cregs_to_regs() unconditionally, even when user_regset_copyin() fails. Since cregs is an uninitialized stack variable, a copyin failure causes uninitialized stack data to be written into the target task's pt_regs, corrupting its register state and potentially leaking kernel stack contents. compat_restore_sigcontext() has the same issue: it calls cregs_to_regs() even when __copy_from_user() fails, leading to the same corruption of the signal-returning task's register state on error. Only call cregs_to_regs() when the user copy succeeds. Fixes: 4608c159594f ("riscv: compat: ptrace: Add compat_arch_ptrace implement") Fixes: 7383ee05314b ("riscv: compat: signal: Add rt_frame implementation") Signed-off-by: Michael Neuling Assisted-by: Cursor:claude-4.6-opus-high-thinking Link: https://patch.msgid.link/20260501062320.2339562-1-mikey@neuling.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/compat_signal.c | 2 ++ arch/riscv/kernel/ptrace.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/compat_signal.c b/arch/riscv/kernel/compat_signal.c index 6ec4e34255a9..cf3eb33a11e4 100644 --- a/arch/riscv/kernel/compat_signal.c +++ b/arch/riscv/kernel/compat_signal.c @@ -107,6 +107,8 @@ static long compat_restore_sigcontext(struct pt_regs *regs, /* sc_regs is structured the same as the start of pt_regs */ err = __copy_from_user(&cregs, &sc->sc_regs, sizeof(sc->sc_regs)); + if (unlikely(err)) + return err; cregs_to_regs(&cregs, regs); diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 93de2e7a3074..793bcee46182 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -577,8 +577,8 @@ static int compat_riscv_gpr_set(struct task_struct *target, struct compat_user_regs_struct cregs; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &cregs, 0, -1); - - cregs_to_regs(&cregs, task_pt_regs(target)); + if (!ret) + cregs_to_regs(&cregs, task_pt_regs(target)); return ret; } From db909bd7986c10da074917af3dae83a60fa65093 Mon Sep 17 00:00:00 2001 From: "Guo Ren (Alibaba DAMO Academy)" Date: Sun, 25 Jan 2026 00:52:12 -0500 Subject: [PATCH 028/377] riscv: mm: Fixup no5lvl failure when vaddr is invalid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike no4lvl, no5lvl still continues to detect satp, which requires va=pa mapping. When pa=0x800000000000, no5lvl would fail in Sv48 mode due to an illegal VA value of 0x800000000000. So, prevent detecting the satp flow for no5lvl, when vaddr is invalid. Add the is_vaddr_valid() function for checking. Fixes: 26e7aacb83df ("riscv: Allow to downgrade paging mode from the command line") Cc: Alexandre Ghiti Cc: Björn Töpel Signed-off-by: Guo Ren (Alibaba DAMO Academy) Tested-by: Fangyu Yu Link: https://patch.msgid.link/20260125055212.433163-1-guoren@kernel.org [pjw@kernel.org: cleaned up commit message] Signed-off-by: Paul Walmsley --- arch/riscv/mm/init.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index decd7df40fa4..fa8d2f6f554b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -792,6 +792,27 @@ static void __init set_mmap_rnd_bits_max(void) mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; } +static bool __init is_vaddr_valid(unsigned long va) +{ + unsigned long up = 0; + + switch (satp_mode) { + case SATP_MODE_39: + up = 1UL << 38; + break; + case SATP_MODE_48: + up = 1UL << 47; + break; + case SATP_MODE_57: + up = 1UL << 56; + break; + default: + return false; + } + + return (va < up) || (va >= (ULONG_MAX - up + 1)); +} + /* * There is a simple way to determine if 4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode @@ -833,6 +854,9 @@ static __init void set_satp_mode(uintptr_t dtb_pa) set_satp_mode_pmd + PMD_SIZE, PMD_SIZE, PAGE_KERNEL_EXEC); retry: + if (!is_vaddr_valid(set_satp_mode_pmd)) + goto out; + create_pgd_mapping(early_pg_dir, set_satp_mode_pmd, pgtable_l5_enabled ? @@ -855,6 +879,7 @@ static __init void set_satp_mode(uintptr_t dtb_pa) disable_pgtable_l4(); } +out: memset(early_pg_dir, 0, PAGE_SIZE); memset(early_p4d, 0, PAGE_SIZE); memset(early_pud, 0, PAGE_SIZE); From 0799e5943611006b346b8813c7daf7dd5aa26bfd Mon Sep 17 00:00:00 2001 From: Lyes Bourennani Date: Wed, 22 Apr 2026 00:20:22 +0200 Subject: [PATCH 029/377] batman-adv: fix integer overflow on buff_pos Fixing an integer overflow present in batadv_iv_ogm_send_to_if. The size check is done using the int type in batadv_iv_ogm_aggr_packet whereas the buff_pos variable uses the s16 type. This could lead to an out-of-bound read. Cc: stable@vger.kernel.org Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Lyes Bourennani Signed-off-by: Alexis Pinson Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f28e9cbf8ad5..618d1889c04e 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -335,7 +335,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; - s16 buff_pos; + int buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; From 3243543592425beec83d453793e9d27caa0d8e66 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Mon, 27 Apr 2026 14:43:33 +0800 Subject: [PATCH 030/377] batman-adv: reject new tp_meter sessions during teardown Prevent tp_meter from starting new sender or receiver sessions after mesh_state has left BATADV_MESH_ACTIVE. Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Sven Eckelmann --- net/batman-adv/tp_meter.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 2e42f6b348c8..d9a80e459c2e 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -947,6 +947,13 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /* look for an already existing test towards this node */ spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_DST_UNREACHABLE, + dst, bat_priv, session_cookie); + return; + } + tp_vars = batadv_tp_list_find(bat_priv, dst); if (tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); @@ -1329,9 +1336,12 @@ static struct batadv_tp_vars * batadv_tp_init_recv(struct batadv_priv *bat_priv, const struct batadv_icmp_tp_packet *icmp) { - struct batadv_tp_vars *tp_vars; + struct batadv_tp_vars *tp_vars = NULL; spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out_unlock; + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, icmp->session); if (tp_vars) @@ -1464,6 +1474,9 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_icmp_tp_packet *icmp; + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out; + icmp = (struct batadv_icmp_tp_packet *)skb->data; switch (icmp->subtype) { @@ -1478,6 +1491,8 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) "Received unknown TP Metric packet type %u\n", icmp->subtype); } + +out: consume_skb(skb); } From 3d3cf6a7314aca4df0a6dde28ce784a2a30d0166 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Mon, 27 Apr 2026 14:43:34 +0800 Subject: [PATCH 031/377] batman-adv: stop tp_meter sessions during mesh teardown TP meter sessions remain linked on bat_priv->tp_list after the netlink request has already finished. When the mesh interface is removed, batadv_mesh_free() currently tears down the mesh without first draining these sessions. A running sender thread or a late incoming tp_meter packet can then keep processing against a mesh instance which is already shutting down. Synchronize tp_meter with the mesh lifetime by stopping all active sessions from batadv_mesh_free() and waiting for sender threads to exit before teardown continues. Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Sven Eckelmann --- net/batman-adv/main.c | 1 + net/batman-adv/tp_meter.c | 94 +++++++++++++++++++++++++++++++-------- net/batman-adv/tp_meter.h | 1 + net/batman-adv/types.h | 4 ++ 4 files changed, 82 insertions(+), 18 deletions(-) diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3a35aadd8b41..a4d33ee0fda5 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -249,6 +249,7 @@ void batadv_mesh_free(struct net_device *mesh_iface) atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); + batadv_tp_stop_all(bat_priv); batadv_gw_node_free(bat_priv); diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index d9a80e459c2e..58ca59a2799e 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -365,23 +366,38 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer - * @bat_priv: the bat priv with all the mesh interface information - * @tp_vars: the private data of the current TP meter session to cleanup + * batadv_tp_list_detach() - remove tp session from mesh session list once + * @tp_vars: the private data of the current TP meter session */ -static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, - struct batadv_tp_vars *tp_vars) +static void batadv_tp_list_detach(struct batadv_tp_vars *tp_vars) { - cancel_delayed_work(&tp_vars->finish_work); + bool detached = false; spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); + if (!hlist_unhashed(&tp_vars->list)) { + hlist_del_init_rcu(&tp_vars->list); + detached = true; + } spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + if (!detached) + return; + + atomic_dec(&tp_vars->bat_priv->tp_num); + /* drop list reference */ batadv_tp_vars_put(tp_vars); +} - atomic_dec(&tp_vars->bat_priv->tp_num); +/** + * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work_sync(&tp_vars->finish_work); + + batadv_tp_list_detach(tp_vars); /* kill the timer and remove its reference */ timer_delete_sync(&tp_vars->timer); @@ -886,7 +902,8 @@ static int batadv_tp_send(void *arg) batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); batadv_tp_vars_put(tp_vars); @@ -918,7 +935,8 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) batadv_tp_vars_put(tp_vars); /* cleanup of failed tp meter variables */ - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); return; } @@ -1024,6 +1042,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars->start_time = jiffies; init_waitqueue_head(&tp_vars->more_bytes); + init_completion(&tp_vars->finished); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); @@ -1126,14 +1145,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) "Shutting down for inactivity (more than %dms) from %pM\n", BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); - spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); - spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); - - /* drop list reference */ - batadv_tp_vars_put(tp_vars); - - atomic_dec(&bat_priv->tp_num); + batadv_tp_list_detach(tp_vars); spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { @@ -1496,6 +1508,52 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) consume_skb(skb); } +/** + * batadv_tp_stop_all() - stop all currently running tp meter sessions + * @bat_priv: the bat priv with all the mesh interface information + */ +void batadv_tp_stop_all(struct batadv_priv *bat_priv) +{ + struct batadv_tp_vars *tp_vars[BATADV_TP_MAX_NUM]; + struct batadv_tp_vars *tp_var; + size_t count = 0; + size_t i; + + spin_lock_bh(&bat_priv->tp_list_lock); + hlist_for_each_entry(tp_var, &bat_priv->tp_list, list) { + if (WARN_ON_ONCE(count >= BATADV_TP_MAX_NUM)) + break; + + if (!kref_get_unless_zero(&tp_var->refcount)) + continue; + + tp_vars[count++] = tp_var; + } + spin_unlock_bh(&bat_priv->tp_list_lock); + + for (i = 0; i < count; i++) { + tp_var = tp_vars[i]; + + switch (tp_var->role) { + case BATADV_TP_SENDER: + batadv_tp_sender_shutdown(tp_var, + BATADV_TP_REASON_CANCEL); + wake_up(&tp_var->more_bytes); + wait_for_completion(&tp_var->finished); + break; + case BATADV_TP_RECEIVER: + batadv_tp_list_detach(tp_var); + if (timer_shutdown_sync(&tp_var->timer)) + batadv_tp_vars_put(tp_var); + break; + } + + batadv_tp_vars_put(tp_var); + } + + synchronize_net(); +} + /** * batadv_tp_meter_init() - initialize global tp_meter structures */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index f0046d366eac..4e97cd10cd02 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -17,6 +17,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, u8 return_value); +void batadv_tp_stop_all(struct batadv_priv *bat_priv); void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8fc5fe0e9b05..daa06f421154 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1328,6 +1329,9 @@ struct batadv_tp_vars { /** @finish_work: work item for the finishing procedure */ struct delayed_work finish_work; + /** @finished: completion signaled when a sender thread exits */ + struct completion finished; + /** @test_length: test length in milliseconds */ u32 test_length; From d581fc99d3b958cb6e363104e9aab57f36aee6f3 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 23 Apr 2026 13:56:47 +0100 Subject: [PATCH 032/377] mm/memfd_luo: reject memfds whose page count exceeds UINT_MAX memfd_luo_preserve_folios() declares max_folios as unsigned int and computes it from the inode size, then passes it to memfd_pin_folios() which itself caps max_folios at unsigned int. For files whose base-page count exceeds UINT_MAX (larger than 16 TiB with 4 KiB pages), the assignment truncates silently: only a prefix of the file gets pinned and preserved, while memfd_luo_preserve() still records the full inode size in ser->size. On retrieve the inode is restored to the full size but only the preserved prefix repopulates the page cache, so the tail comes back as holes and user data is silently lost across the live update. Reject such files at preserve time with -EFBIG rather than chunk the pin loop, which would also require enlarging the preserved folios array well beyond what is practical. Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Signed-off-by: David Carlier Reviewed-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Link: https://patch.msgid.link/20260423125648.152113-1-devnexen@gmail.com Signed-off-by: Pasha Tatashin --- mm/memfd_luo.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 35d1247281e0..94ae113f68f6 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -259,7 +259,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) struct inode *inode = file_inode(args->file); struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; - u64 nr_folios; + u64 nr_folios, inode_size; int err = 0, seals; inode_lock(inode); @@ -285,7 +285,18 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) } ser->pos = args->file->f_pos; - ser->size = i_size_read(inode); + inode_size = i_size_read(inode); + + /* + * memfd_pin_folios() caps at UINT_MAX folios; refuse larger + * files to avoid silently preserving only a prefix. + */ + if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) { + err = -EFBIG; + goto err_free_ser; + } + + ser->size = inode_size; ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, From 7b0b68b2b95606e65594958686833e53423f58f2 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 23 Apr 2026 13:56:48 +0100 Subject: [PATCH 033/377] mm/memfd_luo: document preservation of file seals Commit 8a552d68a86e ("mm: memfd_luo: preserve file seals") started preserving file seals across live update and restoring them via memfd_add_seals() on retrieve, but the DOC header was not updated and still listed seals under "Non-Preserved Properties" as being unsealed on restore. Move the Seals entry to the "Preserved Properties" section and describe the actual behavior, including the MEMFD_LUO_ALL_SEALS restriction that both preserve and retrieve enforce. Fixes: 8a552d68a86e ("mm: memfd_luo: preserve file seals") Signed-off-by: David Carlier Reviewed-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Link: https://patch.msgid.link/20260423125648.152113-2-devnexen@gmail.com Signed-off-by: Pasha Tatashin --- mm/memfd_luo.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 94ae113f68f6..59de210bee5f 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -50,6 +50,11 @@ * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property * is maintained. * + * Seals + * File seals set on the memfd are preserved and re-applied on restore. + * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may + * be present; preservation fails with ``-EOPNOTSUPP`` otherwise. + * * Non-Preserved Properties * ======================== * @@ -61,10 +66,6 @@ * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set * again after restore via ``fcntl()``. - * - * Seals - * File seals are not preserved. The file is unsealed on restore and if - * needed, must be sealed again via ``fcntl()``. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt From 17e4c68ff35090d8cb743e3c82c09f92fda1ebda Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:53 +0800 Subject: [PATCH 034/377] kunit: config: Enable KUNIT_DEBUGFS by default The KUNIT_DEBUGFS option is currently enabled based on the value of KUNIT_ALL_TESTS, but it really doesn't have anything to do with the set of enabled tests, so just enable it by default anyway. In particular, this shouldn't be only visible if KUNIT_ALL_TESTS is set, which is quite confusing. Link: https://lore.kernel.org/r/20260425034155.53913-1-david@davidgow.net Fixes: beaed42c427d ("kunit: default KUNIT_* fragments to KUNIT_ALL_TESTS") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index 498cc51e493d..f80ca3aeedb0 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -16,8 +16,8 @@ menuconfig KUNIT if KUNIT config KUNIT_DEBUGFS - bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" if !KUNIT_ALL_TESTS - default KUNIT_ALL_TESTS + bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + default y help Enable debugfs representation for kunit. Currently this consists of /sys/kernel/debug/kunit//results files for each From 8f80b5b227ef9ea422080487715c841856339aed Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:54 +0800 Subject: [PATCH 035/377] kunit: config: KUNIT_DEBUGFS should depend on DEBUG_FS CONFIG_KUNIT_DEBUGFS is totally useless without debugfs, so it should depend on CONFIG_DEBUG_FS. Link: https://lore.kernel.org/r/20260425034155.53913-2-david@davidgow.net Fixes: e2219db280e3 ("kunit: add debugfs /sys/kernel/debug/kunit//results display") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index f80ca3aeedb0..94ff8e4089bf 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -17,6 +17,7 @@ if KUNIT config KUNIT_DEBUGFS bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + depends on DEBUG_FS default y help Enable debugfs representation for kunit. Currently this consists From 09ae540e1d5c02210795911bf5459282d7af04e9 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Thu, 23 Apr 2026 02:33:49 +0500 Subject: [PATCH 036/377] rhashtable: drop ht->mutex in rhashtable_free_and_destroy() rhashtable_free_and_destroy() is a single-shot teardown routine: cancel_work_sync() has already quiesced the deferred rehash worker, and the function's documented contract requires the caller to guarantee no other concurrent access to the rhashtable. Under those conditions ht->mutex is not protecting anything -- taking it is a leftover from the original teardown path. That leftover is actively harmful: it closes a circular lock-class dependency with fs_reclaim. The deferred rehash worker takes ht->mutex and then allocates GFP_KERNEL memory in bucket_table_alloc(), establishing &ht->mutex -> fs_reclaim After commit b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") introduced simple_xattr_ht_free(), which calls rhashtable_free_and_destroy(), the simple_xattrs teardown became reachable from evict() under the dcache shrinker. The subsequent per-subsystem adaptations made the reverse edge concrete in three independent code paths: * commit 52b364fed6e1 ("shmem: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 5bd97f5c5f24 ("kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 50704c391fbf ("pidfs: adapt to rhashtable-based simple_xattrs") Any of the three closes the cycle fs_reclaim -> &ht->mutex which lockdep reports as follows. This particular splat was observed organically on a workstation kernel built from vfs-7.1-rc1.xattr at ~35h uptime under normal mixed workload, with CONFIG_PROVE_LOCKING=y. The path happens to go through kernfs: WARNING: possible circular locking dependency detected 7.0.0-faeab166167f-with-fixes-v1+ #191 Tainted: G U kswapd0/243 is trying to acquire lock: ffff8882e475c0f8 (&ht->mutex){+.+.}-{4:4}, at: rhashtable_free_and_destroy+0x36/0x740 but task is already holding lock: ffffffffa8ad1d00 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x995/0x1600 the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 fs_reclaim_acquire+0xd9/0x130 __kvmalloc_node_noprof+0xcd/0xb40 bucket_table_alloc.isra.0+0x5a/0x440 rhashtable_rehash_alloc+0x4e/0xd0 rht_deferred_worker+0x14b/0x440 process_one_work+0x8fd/0x16a0 worker_thread+0x601/0xff0 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 -> #0 (&ht->mutex){+.+.}-{4:4}: check_prev_add+0xdb/0xce0 validate_chain+0x554/0x780 __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 __mutex_lock+0x1b2/0x2550 rhashtable_free_and_destroy+0x36/0x740 kernfs_put.part.0+0x119/0x570 evict+0x3b6/0x9c0 __dentry_kill+0x181/0x540 shrink_dentry_list+0x135/0x440 prune_dcache_sb+0xdb/0x150 super_cache_scan+0x2ff/0x520 do_shrink_slab+0x35a/0xee0 shrink_slab_memcg+0x457/0x950 shrink_slab+0x43b/0x550 shrink_one+0x31a/0x6f0 shrink_many+0x31e/0xc80 shrink_node+0xeb3/0x14a0 balance_pgdat+0x8ed/0x1600 kswapd+0x2f3/0x530 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&ht->mutex); lock(fs_reclaim); lock(&ht->mutex); Note that lockdep tracks lock classes, not instances: the two &ht->mutex sites are on different rhashtable objects (the deferred worker was triggered by some unrelated rhashtable growth), but because rhashtable_init() uses a single static lockdep key for all rhashtables, this is a real class-level cycle. Once reported, lockdep disables itself for the remainder of the boot, masking any subsequent locking bugs. Drop the mutex. After cancel_work_sync() the rehash worker is quiesced and, per this function's contract, no other concurrent access is possible; the tables are therefore owned exclusively by this function and can be walked without any lock held. Switch the table walks from rht_dereference() (which requires ht->mutex to be held under CONFIG_PROVE_RCU) to rcu_dereference_raw(), which has no lockdep annotation. rht_ptr_exclusive() already uses rcu_dereference_protected(p, 1) and needs no change. This is the only place in lib/rhashtable.c where &ht->mutex is acquired from a path reachable under fs_reclaim; the deferred worker is the only other site and it is the forward edge. Removing the acquisition here therefore eliminates the class cycle for all three subsystems that use simple_xattrs, not just the one in the splat above. No locking-semantics change is introduced for correct users; incorrect users would already be racing with rehash worker completion regardless of the mutex. Synthetic reproduction of the splat within a few-minute window was unsuccessful across several attempts (tmpfs and kernfs zombies via cgroupfs with open-fd-through-rmdir, with and without swap, up to ~60k reclaim-path executions of simple_xattr_ht_free() in a single run), consistent with the rare coincidence-of-edges profile of the bug: the forward edge is already registered in /proc/lockdep on any idle system via rht_deferred_worker, but the reverse edge requires evict() to complete kernfs_put()'s final release inside the fs_reclaim critical section, which in my attempts was ordered against rather than interleaved with the worker. Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Signed-off-by: Mikhail Gavrilov Signed-off-by: Herbert Xu --- lib/rhashtable.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 7a67ef5b67b6..426d4e381f13 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1166,6 +1166,11 @@ static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. + * + * After cancel_work_sync() has returned, the deferred rehash worker is + * quiesced and, per the contract above, no other concurrent access to the + * rhashtable is possible. The tables are therefore owned exclusively by + * this function and can be walked without ht->mutex held. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), @@ -1177,8 +1182,15 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, irq_work_sync(&ht->run_irq_work); cancel_work_sync(&ht->run_work); - mutex_lock(&ht->mutex); - tbl = rht_dereference(ht->tbl, ht); + /* + * Do NOT take ht->mutex here. The rehash worker establishes + * ht->mutex -> fs_reclaim via GFP_KERNEL bucket allocation under + * the mutex; callers on the reclaim path (e.g. simple_xattr_ht_free() + * from evict() under the dcache shrinker for shmem/kernfs/pidfs + * inodes) would otherwise close a circular dependency + * fs_reclaim -> ht->mutex. + */ + tbl = rcu_dereference_raw(ht->tbl); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { @@ -1187,22 +1199,21 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL; + rcu_dereference_raw(pos->next) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL) + rcu_dereference_raw(pos->next) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } - next_tbl = rht_dereference(tbl->future_tbl, ht); + next_tbl = rcu_dereference_raw(tbl->future_tbl); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } - mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); From dad0d91cc2c3e6b6fb285ccfe7ddf71525797198 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:18 +0200 Subject: [PATCH 037/377] mm/slab: Add kvfree_atomic() helper kvmalloc() now supports non-sleeping GFP flags, including the vmalloc fallback path. This means it may return vmalloc memory even for GFP_ATOMIC and GFP_NOWAIT allocations. Freeing such memory with kvfree() may then end up calling vfree(), which is not safe for non-sleeping contexts. Introduce kvfree_atomic() helper for such cases. It mirrors kvfree(), but uses vfree_atomic() for vmalloced memory. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka (SUSE) Acked-by: Harry Yoo (Oracle) Signed-off-by: Herbert Xu --- include/linux/slab.h | 3 +++ mm/slub.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95..2b5ab488e96b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); diff --git a/mm/slub.c b/mm/slub.c index 0baa906f39ab..8f9004536729 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6882,6 +6882,22 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_atomic() - Free memory. + * @addr: Pointer to allocated memory. + * + * Same as kvfree(), but uses vfree_atomic() for vmalloc + * backed memory. Must not be called from NMI context. + */ +void kvfree_atomic(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree_atomic(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree_atomic); + /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. From d1fa83ecac31093a550534a79a33bc7f4ba8fc10 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:19 +0200 Subject: [PATCH 038/377] rhashtable: Add bucket_table_free_atomic() helper rhashtable_insert_rehash() allocates a new bucket table with GFP_ATOMIC, as it is called from an RCU read-side critical section. If rhashtable_rehash_attach() then fails, the new table is freed via kvfree(). This is unsafe, since kvfree() may fall back to vfree() for vmalloc-backed allocations, which can sleep and trigger: BUG: sleeping function called from invalid context Add bucket_table_free_atomic(), which uses kvfree_atomic() so the table can be freed safely from non-sleeping context. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Herbert Xu --- lib/rhashtable.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 426d4e381f13..04b3a808fca9 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -114,6 +114,14 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_atomic(const struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree_atomic(tbl); +} + static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); @@ -496,7 +504,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { - bucket_table_free(new_tbl); + bucket_table_free_atomic(new_tbl); if (err == -EEXIST) err = 0; } else From 1f7305d87aa23db2579df222eba504a333c2c978 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 May 2026 17:52:03 +0100 Subject: [PATCH 039/377] KVM: arm64: Work around C1-Pro erratum 4193714 for protected guests C1-Pro cores with SME have an erratum where TLBI+DSB does not complete all outstanding SME accesses. Instead a DSB needs to be executed on the affected CPUs. The implication is that pages cannot be unmapped from the host Stage 2 and then provided to a protected guest or to the hypervisor. Host SME accesses may still complete after this point. This erratum breaks pKVM's guarantees, and the workaround is hard to implement as EL2 and EL1 share a security state meaning EL1 can mask IPIs sent by EL2, leading to interrupt blackouts. Instead, do this in EL3. This has the advantage of a separate security state, meaning lower EL cannot mask the IPI. It is also simpler for EL3 to know about CPUs that are off or in PSCI's CPU_SUSPEND. Add the needed hook to host_stage2_set_owner_metadata_locked(). This covers the cases where the host loses access to a page: __pkvm_host_donate_guest() __pkvm_guest_unshare_host() host_stage2_set_owner_locked() when owner_id == PKVM_ID_HYP Since pKVM relies on the firmware call for correctness, check for the firmware counterpart during protected KVM initialisation and fail the pKVM initialisation if it is missing. Signed-off-by: James Morse Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Mark Rutland Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Cc: Vincent Donnefort Cc: Lorenzo Pieralisi Cc: Sudeep Holla Link: https://patch.msgid.link/20260505165205.2690919-1-catalin.marinas@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 21 +++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 23 ++++++++++++++++++++++- include/linux/arm-smccc.h | 6 ++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8..34c9950884d5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -4,6 +4,7 @@ * Author: Christoffer Dall */ +#include #include #include #include @@ -2638,6 +2639,22 @@ static int init_pkvm_host_sve_state(void) return 0; } +static int pkvm_check_sme_dvmsync_fw_call(void) +{ + struct arm_smccc_res res; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_4193714)) + return 0; + + arm_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + if (res.a0) { + kvm_err("pKVM requires firmware support for C1-Pro erratum 4193714\n"); + return -ENODEV; + } + + return 0; +} + /* * Finalizes the initialization of hyp mode, once everything else is initialized * and the initialziation process cannot fail. @@ -2838,6 +2855,10 @@ static int __init init_hyp_mode(void) if (err) goto out_err; + err = pkvm_check_sme_dvmsync_fw_call(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927c..a3050e2b65b1 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -5,6 +5,7 @@ */ #include + #include #include #include @@ -14,6 +15,7 @@ #include +#include #include #include #include @@ -29,6 +31,19 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static void pkvm_sme_dvmsync_fw_call(void) +{ + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) { + struct arm_smccc_res res; + + /* + * Ignore the return value. Probing for the workaround + * availability took place in init_hyp_mode(). + */ + hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + } +} + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -574,8 +589,14 @@ static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, addr, size, &host_s2_pool, KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); - if (!ret) + if (!ret) { + /* + * After stage2 maintenance has happened, but before the page + * owner has changed. + */ + pkvm_sme_dvmsync_fw_call(); __host_update_page_state(addr, size, PKVM_NOPAGE); + } return ret; } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d01..e7195750d21b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ From 8d9b9d985ad3a81c751a6b97edaf1d3c0780af7c Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 5 May 2026 15:47:35 +0100 Subject: [PATCH 040/377] KVM: arm64: nv: Consider the DS bit when translating TCR_EL2 When running an nVHE L1, TCR_EL2 is mapped to TCR_EL1. Writes to the register are trapped and written to TCR_EL1 after a translation. Booting an nVHE L1 with 52-bit VA isn't working because the translation was ignoring the DS bit set by the guest, hence causing repeating level 0 faults. Add it in the translation function. Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260505144735.1496530-1-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 091544e6af44..dc2957662ff2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -23,6 +23,7 @@ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ + ((tcr & TCR_EL2_DS) ? TCR_DS : 0) | ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | From 9be19df816dea9eb7dfe1661b3690bed6a2cb146 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 5 May 2026 10:49:13 +0100 Subject: [PATCH 041/377] KVM: arm64: Handle permission faults with guest_memfd gmem_abort() calls kvm_pgtable_stage2_map() to make changes to stage 2. It does this for both relaxing permissions on an existing mapping and to install a missing mapping. kvm_pgtable_stage2_map() doesn't make changes to stage 2 if there is an existing, valid entry and the new entry modifies only the permissions. This is checked in: kvm_pgtable_stage2_map() stage2_map_walk_leaf() stage2_map_walker_try_leaf() stage2_pte_needs_update() and if only the permissions differ, kvm_pgtable_stage2_map() returns -EAGAIN and KVM returns to the guest to replay the instruction. The assumption is that a concurrent fault on a different VCPU already mapped the faulting IPA, and replaying the instruction will either succeed, or cause a permission fault, which should be handled with kvm_pgtable_stage2_relax_perms(). gmem_abort(), on a read or write fault on a system without DIC (instruction cache invalidation required for data to instruction coherence), installs a valid entry with read and write permissions, but without executable permissions. On an execution fault on the same page, gmem_abort() attempts to relax the permissions to allow execution, but calls kvm_pgtable_stage2_map() to change the existing, valid, entry. kvm_pgtable_stage2_map() returns -EAGAIN and KVM resumes execution from the faulting instruction, which leads to an infinite loop of permission faults on the same instruction. Allow the guest to make progress by using kvm_pgtable_stage2_relax_perms() to relax permissions. Fixes: a7b57e099592 ("KVM: arm64: Handle guest_memfd-backed guest page faults") Signed-off-by: Alexandru Elisei Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260505094913.75317-1-alexandru.elisei@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d089c107d9b7..4da9281312eb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1576,21 +1576,24 @@ struct kvm_s2_fault_desc { static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) { bool write_fault, exec_fault; + bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; unsigned long mmu_seq; struct page *page; struct kvm *kvm = s2fd->vcpu->kvm; - void *memcache; + void *memcache = NULL; kvm_pfn_t pfn; gfn_t gfn; int ret; - memcache = get_mmu_memcache(s2fd->vcpu); - ret = topup_mmu_memcache(s2fd->vcpu, memcache); - if (ret) - return ret; + if (!perm_fault) { + memcache = get_mmu_memcache(s2fd->vcpu); + ret = topup_mmu_memcache(s2fd->vcpu, memcache); + if (ret) + return ret; + } if (s2fd->nested) gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; @@ -1631,9 +1634,19 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) goto out_unlock; } - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, - __pfn_to_phys(pfn), prot, - memcache, flags); + if (perm_fault) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa, + prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); From fc240715fc5003538ff530e3cfb985e7769b7171 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 4 May 2026 13:28:08 +0200 Subject: [PATCH 042/377] KVM: selftests: arm64: Fix steal_time test after UAPI refactoring Fix the following failure to the steal_time test on arm64 by making the timer address known to the guest. ==== Test Assertion Failure ==== steal_time.c:229: !ret pid=18514 tid=18514 errno=22 - Invalid argument 1 0x000000000040252f: check_steal_time_uapi at steal_time.c:229 (discriminator 20) 2 (inlined by) main at steal_time.c:537 (discriminator 20) 3 0x0000ffffa23d621b: ?? ??:0 4 0x0000ffffa23d62fb: ?? ??:0 5 0x0000000000402b6f: _start at ??:? KVM_SET_DEVICE_ATTR failed, rc: -1 errno: 22 (Invalid argument) Fixes: 40351ed924dd ("KVM: selftests: Refactor UAPI tests into dedicated function") Signed-off-by: Sebastian Ott Link: https://patch.msgid.link/20260504112808.21276-1-sebott@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/steal_time.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 7df2bc8eec02..76fcdd1fd3cb 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -220,6 +220,8 @@ static void check_steal_time_uapi(void) }; vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, 1, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, 1); st_ipa = (ulong)ST_GPA_BASE | 1; ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); From 9a624ea3f26f40c76bd2c7f77cde30659d42efbd Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Thu, 30 Apr 2026 10:37:24 +0000 Subject: [PATCH 043/377] KVM: arm64: Remove potential UB on nvhe tracing clock update Sashiko(locally) reports possiblity of division by zero and out-of-bounds bitwise shift in trace_clock_update(). Although the clock update is untrusted, we should at least have some basic checks to avoid undefined behaviours. Reviewed-by: Vincent Donnefort Signed-off-by: Mostafa Saleh Link: https://patch.msgid.link/20260430103724.2151625-1-smostafa@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c index 32fc4313fe43..a7fc61976fd0 100644 --- a/arch/arm64/kvm/hyp/nvhe/clock.c +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -35,6 +35,9 @@ void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) struct clock_data *clock = &trace_clock_data; u64 bank = clock->cur ^ 1; + if (!mult || shift >= 64) + return; + clock->data[bank].mult = mult; clock->data[bank].shift = shift; clock->data[bank].epoch_ns = epoch_ns; From 54ea22273eef67196f238263bc79f27da04d2114 Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Tue, 28 Apr 2026 18:05:12 +0200 Subject: [PATCH 044/377] MAINTAINERS: Add Steffen as reviewer for KVM/arm64 KVM/arm64 and KVM/s390 will eventually share some code. Add me as a cross-reviewer from the s390 team to arm64 to help to keep both architectures in sync. Signed-off-by: Steffen Eiden Link: https://patch.msgid.link/20260428160527.1378085-16-seiden@linux.ibm.com [maz: rephrase commit message to use future tense, since this is merged ahead of the code] Signed-off-by: Marc Zyngier --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 882214b0e7db..3fe4ea59199c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14052,6 +14052,7 @@ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier M: Oliver Upton R: Joey Gouly +R: Steffen Eiden R: Suzuki K Poulose R: Zenghui Yu L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) From 91892231ae5e638326e7eaa0174de86fac9aa5fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A1mon=20van=20Raaij?= Date: Wed, 6 May 2026 20:31:18 +0200 Subject: [PATCH 045/377] ALSA: hda/realtek: Add codec SSID quirk for Lenovo Yoga Pro 9 16IMH9 (17aa:38d5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Lenovo Yoga Pro 9 16IMH9 units carry codec SSID 17aa:38d5 instead of 17aa:38d6, which was added in commit 56722cfbb78d ("ALSA: hda/realtek: Add codec SSID quirk for Lenovo Yoga Pro 9 16IMH9"). The corresponding firmware blob TAS2XXX38D5.bin already ships in linux-firmware, and the hardware is otherwise identical: same PCI subsystem ID 17aa:3811 shared with the Legion S7 15IMH05, same TI TAS2781 amplifiers behind ACPI HID TIAS2781, same ALC287_FIXUP_TAS2781_I2C requirement. Add a second HDA_CODEC_QUIRK entry directly above the existing 17aa:38d6 entry so both variants resolve to the correct fixup. Reported and verified on hardware by GitHub user 0xEthamin. Link: https://github.com/ramonvanraaij/yoga9-tas2781-hda/issues/1 Signed-off-by: Rámon van Raaij Link: https://patch.msgid.link/20260506183118.patch1-ramon@vanraaij.eu Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 11d0ea8ed859..55bb98e2e55a 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7678,6 +7678,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { /* Yoga Pro 9 16IMH9 shares PCI SSID 17aa:3811 with Legion S7 15IMH05; * use codec SSID to distinguish them */ + HDA_CODEC_QUIRK(0x17aa, 0x38d5, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), HDA_CODEC_QUIRK(0x17aa, 0x38d6, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3811, "Legion S7 15IMH05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), From d6854daa67be623860f4e1873fd3d3c275aba4ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Thu, 7 May 2026 00:40:51 -0300 Subject: [PATCH 046/377] ALSA: usb-audio: Bound MIDI endpoint descriptor scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit snd_usbmidi_get_ms_info() validates the internal MIDIStreaming endpoint descriptor size before using baAssocJackID[], but the descriptor walker can still return a class-specific endpoint descriptor whose bLength exceeds the remaining bytes in the endpoint-extra scan. That leaves later flexible-array reads bounded by bLength, but not by the remaining bytes in the endpoint-extra scan. Stop walking when bLength is zero or extends past the remaining endpoint-extra scan. Fixes: 5c6cd7021a05 ("ALSA: usb-audio: Fix case when USB MIDI interface has more than one extra endpoint descriptor") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260507-usb-midi-endpoint-scan-bounds-v1-1-329d7348160e@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 0a5b8941ebda..d87e3f357cf7 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1951,15 +1951,17 @@ static struct usb_ms_endpoint_descriptor *find_usb_ms_endpoint_descriptor( while (extralen > 3) { struct usb_ms_endpoint_descriptor *ms_ep = (struct usb_ms_endpoint_descriptor *)extra; + int length = ms_ep->bLength; - if (ms_ep->bLength > 3 && + if (!length || length > extralen) + break; + + if (length > 3 && ms_ep->bDescriptorType == USB_DT_CS_ENDPOINT && ms_ep->bDescriptorSubtype == UAC_MS_GENERAL) return ms_ep; - if (!extra[0]) - break; - extralen -= extra[0]; - extra += extra[0]; + extralen -= length; + extra += length; } return NULL; } From 918be519c7876329e1b6e2ea1c59f0b75e792dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Thu, 7 May 2026 00:40:52 -0300 Subject: [PATCH 047/377] ALSA: usb-audio: Bound MIDI 2.0 endpoint descriptor scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The USB MIDI 2.0 endpoint parser has the same descriptor walking pattern as the legacy MIDI parser. It validates bLength against bNumGrpTrmBlock before reading baAssoGrpTrmBlkID[], but not against the remaining bytes in the endpoint-extra scan. A malformed device can therefore make later baAssoGrpTrmBlkID[] reads consume bytes past the walked descriptor. Reject zero-length and overlong descriptors while walking endpoint extras. Fixes: ff49d1df79ae ("ALSA: usb-audio: USB MIDI 2.0 UMP support") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260507-usb-midi-endpoint-scan-bounds-v1-2-329d7348160e@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/midi2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/usb/midi2.c b/sound/usb/midi2.c index 2785600d2312..04aeb9052f13 100644 --- a/sound/usb/midi2.c +++ b/sound/usb/midi2.c @@ -496,15 +496,17 @@ static void *find_usb_ms_endpoint_descriptor(struct usb_host_endpoint *hostep, while (extralen > 3) { struct usb_ms_endpoint_descriptor *ms_ep = (struct usb_ms_endpoint_descriptor *)extra; + int length = ms_ep->bLength; - if (ms_ep->bLength > 3 && + if (!length || length > extralen) + break; + + if (length > 3 && ms_ep->bDescriptorType == USB_DT_CS_ENDPOINT && ms_ep->bDescriptorSubtype == subtype) return ms_ep; - if (!extra[0]) - break; - extralen -= extra[0]; - extra += extra[0]; + extralen -= length; + extra += length; } return NULL; } From 72d52bac023b376b73277804b24315ea2a49ad1e Mon Sep 17 00:00:00 2001 From: Ayaan Mirza Baig Date: Sat, 18 Apr 2026 00:46:13 +0000 Subject: [PATCH 048/377] platform/x86: samsung-galaxybook: Refactor camera lens cover input device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the camera_lens_cover_switch input device to a generic input device which can be used for multiple input events. Move input device allocation and registration into a dedicated galaxybook_input_init() helper which is called early in probe so that the device is available to all features. No functional change. Signed-off-by: Ayaan Mirza Baig Link: https://patch.msgid.link/20260418004613.93981-2-ayaanmirzabaig85@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/samsung-galaxybook.c | 46 ++++++++++++----------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/platform/x86/samsung-galaxybook.c b/drivers/platform/x86/samsung-galaxybook.c index 755cb82bdb60..a51ad6b03164 100644 --- a/drivers/platform/x86/samsung-galaxybook.c +++ b/drivers/platform/x86/samsung-galaxybook.c @@ -53,7 +53,7 @@ struct samsung_galaxybook { void *i8042_filter_ptr; struct work_struct block_recording_hotkey_work; - struct input_dev *camera_lens_cover_switch; + struct input_dev *input; struct acpi_battery_hook battery_hook; @@ -859,13 +859,28 @@ static int block_recording_acpi_set(struct samsung_galaxybook *galaxybook, const if (err) return err; - input_report_switch(galaxybook->camera_lens_cover_switch, + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, value ? 1 : 0); - input_sync(galaxybook->camera_lens_cover_switch); + input_sync(galaxybook->input); return 0; } +static int galaxybook_input_init(struct samsung_galaxybook *galaxybook) +{ + galaxybook->input = devm_input_allocate_device(&galaxybook->platform->dev); + if (!galaxybook->input) + return -ENOMEM; + + galaxybook->input->name = "Samsung Galaxy Book Camera Lens Cover"; + galaxybook->input->phys = DRIVER_NAME "/input0"; + galaxybook->input->id.bustype = BUS_HOST; + + input_set_capability(galaxybook->input, EV_SW, SW_CAMERA_LENS_COVER); + + return input_register_device(galaxybook->input); +} + static int galaxybook_block_recording_init(struct samsung_galaxybook *galaxybook) { bool value; @@ -887,24 +902,8 @@ static int galaxybook_block_recording_init(struct samsung_galaxybook *galaxybook return GB_NOT_SUPPORTED; } - galaxybook->camera_lens_cover_switch = - devm_input_allocate_device(&galaxybook->platform->dev); - if (!galaxybook->camera_lens_cover_switch) - return -ENOMEM; - - galaxybook->camera_lens_cover_switch->name = "Samsung Galaxy Book Camera Lens Cover"; - galaxybook->camera_lens_cover_switch->phys = DRIVER_NAME "/input0"; - galaxybook->camera_lens_cover_switch->id.bustype = BUS_HOST; - - input_set_capability(galaxybook->camera_lens_cover_switch, EV_SW, SW_CAMERA_LENS_COVER); - - err = input_register_device(galaxybook->camera_lens_cover_switch); - if (err) - return err; - - input_report_switch(galaxybook->camera_lens_cover_switch, - SW_CAMERA_LENS_COVER, value ? 1 : 0); - input_sync(galaxybook->camera_lens_cover_switch); + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, value ? 1 : 0); + input_sync(galaxybook->input); return 0; } @@ -1392,6 +1391,11 @@ static int galaxybook_probe(struct platform_device *pdev) return dev_err_probe(&galaxybook->platform->dev, err, "failed to initialize kbd_backlight\n"); + err = galaxybook_input_init(galaxybook); + if (err) + return dev_err_probe(&galaxybook->platform->dev, err, + "failed to initialize input device\n"); + err = galaxybook_fw_attrs_init(galaxybook); if (err) return dev_err_probe(&galaxybook->platform->dev, err, From 90dc96c61be35a1f81b56dfc5dcb80d50debbf89 Mon Sep 17 00:00:00 2001 From: Ayaan Mirza Baig Date: Sat, 18 Apr 2026 00:46:14 +0000 Subject: [PATCH 049/377] platform/x86: samsung-galaxybook: Handle ACPI hotkey notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Samsung Galaxy Book 5 (SAM0430), the keyboard backlight, microphone mute, and camera block hotkeys do not generate i8042 scancodes. Instead they arrive as ACPI notifications 0x7d, 0x6e, and 0x6f respectively, all of which previously fell through to the default "unknown" warning in galaxybook_acpi_notify(). Add handling for these three events: - 0x7d (Fn+F9, keyboard backlight): schedule the existing kbd_backlight_hotkey_work which cycles brightness. - 0x6e (Fn+F10, microphone mute): emit KEY_MICMUTE via the driver's input device. - 0x6f (Fn+F11, camera block): if block_recording is active use the existing block_recording_hotkey_work; otherwise emit a toggle of SW_CAMERA_LENS_COVER via the driver's input device on models where the block_recording ACPI feature is not supported. Tested on Samsung Galaxy Book 5 (SAM0430) and Samsung Galaxy Book2 Pro (SAM0429). Signed-off-by: Ayaan Mirza Baig Co-developed-by: Joshua Grisham Signed-off-by: Joshua Grisham Link: https://patch.msgid.link/20260418004613.93981-3-ayaanmirzabaig85@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/samsung-galaxybook.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/platform/x86/samsung-galaxybook.c b/drivers/platform/x86/samsung-galaxybook.c index a51ad6b03164..6382af0b106c 100644 --- a/drivers/platform/x86/samsung-galaxybook.c +++ b/drivers/platform/x86/samsung-galaxybook.c @@ -197,6 +197,9 @@ static const guid_t performance_mode_guid = #define GB_ACPI_NOTIFY_DEVICE_ON_TABLE 0x6c #define GB_ACPI_NOTIFY_DEVICE_OFF_TABLE 0x6d #define GB_ACPI_NOTIFY_HOTKEY_PERFORMANCE_MODE 0x70 +#define GB_ACPI_NOTIFY_HOTKEY_KBD_BACKLIGHT 0x7d +#define GB_ACPI_NOTIFY_HOTKEY_MICMUTE 0x6e +#define GB_ACPI_NOTIFY_HOTKEY_CAMERA 0x6f #define GB_KEY_KBD_BACKLIGHT_KEYDOWN 0x2c #define GB_KEY_KBD_BACKLIGHT_KEYUP 0xac @@ -876,6 +879,7 @@ static int galaxybook_input_init(struct samsung_galaxybook *galaxybook) galaxybook->input->phys = DRIVER_NAME "/input0"; galaxybook->input->id.bustype = BUS_HOST; + input_set_capability(galaxybook->input, EV_KEY, KEY_MICMUTE); input_set_capability(galaxybook->input, EV_SW, SW_CAMERA_LENS_COVER); return input_register_device(galaxybook->input); @@ -1259,6 +1263,25 @@ static void galaxybook_acpi_notify(acpi_handle handle, u32 event, void *data) if (galaxybook->has_performance_mode) platform_profile_cycle(); break; + case GB_ACPI_NOTIFY_HOTKEY_KBD_BACKLIGHT: + if (galaxybook->has_kbd_backlight) + schedule_work(&galaxybook->kbd_backlight_hotkey_work); + break; + case GB_ACPI_NOTIFY_HOTKEY_MICMUTE: + input_report_key(galaxybook->input, KEY_MICMUTE, 1); + input_sync(galaxybook->input); + input_report_key(galaxybook->input, KEY_MICMUTE, 0); + input_sync(galaxybook->input); + break; + case GB_ACPI_NOTIFY_HOTKEY_CAMERA: + if (galaxybook->has_block_recording) { + schedule_work(&galaxybook->block_recording_hotkey_work); + } else { + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, + !test_bit(SW_CAMERA_LENS_COVER, galaxybook->input->sw)); + input_sync(galaxybook->input); + } + break; default: dev_warn(&galaxybook->platform->dev, "unknown ACPI notification event: 0x%x\n", event); From ad3bff944c0f4f2e913298a9664391af32f87491 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:01 -0700 Subject: [PATCH 050/377] platform/x86: intel: Move debugfs register before creating devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that the driver handling device is enumerated before registering debugfs. If the driver wants to access debugfs by calling tpmi_get_debugfs_dir(), this will return error in this case. Hence register debugfs before creating devices. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-2-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec_tpmi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 7fc6ff8d1040..a38014e81e85 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -817,10 +817,6 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) auxiliary_set_drvdata(auxdev, tpmi_info); - ret = tpmi_create_devices(tpmi_info); - if (ret) - return ret; - /* * Allow debugfs when security policy allows. Everything this debugfs * interface provides, can also be done via /dev/mem access. If @@ -830,6 +826,12 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) if (!security_locked_down(LOCKDOWN_DEV_MEM) && capable(CAP_SYS_RAWIO)) tpmi_dbgfs_register(tpmi_info); + ret = tpmi_create_devices(tpmi_info); + if (ret) { + debugfs_remove_recursive(tpmi_info->dbgfs_dir); + return ret; + } + return 0; } From 57c347a2e2473bfb5c1f1132a3209c55efbe640b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:02 -0700 Subject: [PATCH 051/377] platform/x86: intel: Add notifiers support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases a driver using services of vsec_tpmi driver requires some processing before vsec_tpmi exits. For example a children using debugfs can't use debugfs as this will be deleted by the vsec_tpmi driver. This is the case when unbind using PCI driver interface. In this case the remove callback of vsec_tpmi driver is called first, then remove callback of its children. Add support of blocking chain notifiers support. Notify on successful probe and before clean up in the remove callback. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-3-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec_tpmi.c | 19 +++++++++++++++++++ include/linux/intel_tpmi.h | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index a38014e81e85..16fd7aa41f20 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,20 @@ struct tpmi_feature_state { /* Used during auxbus device creation */ static DEFINE_IDA(intel_vsec_tpmi_ida); +static BLOCKING_NOTIFIER_HEAD(tpmi_notify_list); + +int tpmi_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_register_notifier, "INTEL_TPMI"); + +int tpmi_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_unregister_notifier, "INTEL_TPMI"); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev) { struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev); @@ -832,6 +847,8 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) return ret; } + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_INIT, auxdev); + return 0; } @@ -845,6 +862,8 @@ static void tpmi_remove(struct auxiliary_device *auxdev) { struct intel_tpmi_info *tpmi_info = auxiliary_get_drvdata(auxdev); + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_EXIT, auxdev); + debugfs_remove_recursive(tpmi_info->dbgfs_dir); } diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb..15f02422e9ca 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); From 14473e8c4e97d51eff9b2f384ae696f7a32f182b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:03 -0700 Subject: [PATCH 052/377] platform/x86/intel/tpmi/plr: Prevent fault during unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver faults when intel vsec driver is unbound from PCI driver interface. For example: echo 0000:00:03.1 > /sys/bus/pci/drivers/intel_vsec/unbind This is caused by accessing plr->dbgfs_dir after vsec_tpmi driver is removed. Here vsec_tpmi driver is the parent. On unbind, the parent device remove callback is called first which here will remove debugfs interface. Hence plr->dbgfs_dir is no longer valid. Register notifier for TPMI_CORE_EXIT and make this pointer to NULL, so that debugfs_remove_recursive() is not called with bad plr->dbgfs_dir pointer. After notifier is returned the vsec_tpmi driver will call remove debugfs by calling debugfs_remove_recursive(). Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-4-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/plr_tpmi.c | 45 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/plr_tpmi.c b/drivers/platform/x86/intel/plr_tpmi.c index 05727169f49c..8faecc311038 100644 --- a/drivers/platform/x86/intel/plr_tpmi.c +++ b/drivers/platform/x86/intel/plr_tpmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,8 @@ struct tpmi_plr { struct tpmi_plr_die *die_info; int num_dies; struct auxiliary_device *auxdev; + struct notifier_block nb; + struct mutex lock; /* Protect access to dbgfs_dir */ }; static const char * const plr_coarse_reasons[] = { @@ -255,6 +258,30 @@ static ssize_t plr_status_write(struct file *filp, const char __user *ubuf, } DEFINE_SHOW_STORE_ATTRIBUTE(plr_status); +static int intel_plr_notify(struct notifier_block *self, unsigned long action, void *data) +{ + struct tpmi_plr *plr = container_of(self, struct tpmi_plr, nb); + + if (action == TPMI_CORE_EXIT) { + guard(mutex)(&plr->lock); + plr->dbgfs_dir = NULL; + } + + return NOTIFY_DONE; +} + +static int intel_plr_register_notifier(struct notifier_block *nb) +{ + nb->notifier_call = intel_plr_notify; + nb->priority = 0; + return tpmi_register_notifier(nb); +} + +static void intel_plr_unregister_notifier(struct notifier_block *nb) +{ + tpmi_unregister_notifier(nb); +} + static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id) { struct oobmsm_plat_info *plat_info; @@ -282,10 +309,18 @@ static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxilia if (!plr) return -ENOMEM; + err = devm_mutex_init(&auxdev->dev, &plr->lock); + if (err) + return err; + + intel_plr_register_notifier(&plr->nb); + plr->die_info = devm_kcalloc(&auxdev->dev, num_resources, sizeof(*plr->die_info), GFP_KERNEL); - if (!plr->die_info) - return -ENOMEM; + if (!plr->die_info) { + err = -ENOMEM; + goto err_notify; + } plr->num_dies = num_resources; plr->dbgfs_dir = debugfs_create_dir("plr", dentry); @@ -326,6 +361,9 @@ static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxilia err: debugfs_remove_recursive(plr->dbgfs_dir); +err_notify: + intel_plr_unregister_notifier(&plr->nb); + return err; } @@ -333,6 +371,9 @@ static void intel_plr_remove(struct auxiliary_device *auxdev) { struct tpmi_plr *plr = auxiliary_get_drvdata(auxdev); + intel_plr_unregister_notifier(&plr->nb); + + guard(mutex)(&plr->lock); debugfs_remove_recursive(plr->dbgfs_dir); } From d7396a72eae795d7f968fb451237b6ac1616d712 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:44 +0100 Subject: [PATCH 053/377] KVM: arm64: Make EL2 exception entry and exit context-synchronization events SCTLR_EL2.EIS and SCTLR_EL2.EOS control whether exception entry and exit at EL2 are Context Synchronisation Events (CSEs). Per ARM DDI 0487 M.b D24.2.175 (p. D24-9754): - !FEAT_ExS: the bit is RES1, so the entry/exit is unconditionally a CSE. - FEAT_ExS: the reset value is architecturally UNKNOWN; software must set the bit to make the entry/exit a CSE. INIT_SCTLR_EL2_MMU_ON in arch/arm64/include/asm/sysreg.h sets neither bit. KVM/arm64 hot paths rely on ERET from EL2 being a CSE, and on synchronous EL1->EL2 entry being a CSE, to elide explicit ISBs after MSRs to context-switching system registers (HCR_EL2, ZCR_EL2, ptrauth keys, etc.). On FEAT_ExS hardware those reliances are not architecturally backed unless EOS=1 (and, for entry, EIS=1). Until commit 0a35bd285f43 ("arm64: Convert SCTLR_EL2 to sysreg infrastructure"), SCTLR_EL2_RES1 was a hand-rolled mask that included BIT(11) (EOS) and BIT(22) (EIS), so INIT_SCTLR_EL2_MMU_ON was setting both unconditionally. The conversion made SCTLR_EL2_RES1 auto-generated; because the sysreg tooling only models unconditionally-RES1 fields and EIS/EOS are RES1 only when FEAT_ExS is absent, the auto-generated mask is UL(0). The seven other bits dropped from the old mask (positions 4, 5, 16, 18, 23, 28, 29) are unconditionally RES1 in the E2H=0 SCTLR_EL2 layout per DDI 0487 M.b D24.2.175, so dropping them is harmless. EIS and EOS are the only bits whose semantics changed for FEAT_ExS hardware and where the kernel relies on the value being 1. Make the guarantee explicit: include SCTLR_ELx_EIS | SCTLR_ELx_EOS in INIT_SCTLR_EL2_MMU_ON so that EL2 exception entry and exit are unconditionally CSEs regardless of whether FEAT_ExS is implemented. This matches the pairing in arch/arm64/kvm/config.c which treats EIS and EOS together as RES1 under !FEAT_ExS. Fixes: 0a35bd285f43 ("arm64: Convert SCTLR_EL2 to sysreg infrastructure") Reviewed-by: Yuan Yao Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 736561480f36..7aa08d59d494 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -844,7 +844,7 @@ #define INIT_SCTLR_EL2_MMU_ON \ (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \ SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | \ - SCTLR_ELx_ITFSB | SCTLR_EL2_RES1) + SCTLR_ELx_ITFSB | SCTLR_ELx_EIS | SCTLR_ELx_EOS | SCTLR_EL2_RES1) #define INIT_SCTLR_EL2_MMU_OFF \ (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) From 300fac4cc266b7782d88602b6b6a7faf31ce6405 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:45 +0100 Subject: [PATCH 054/377] KVM: arm64: Guard against NULL vcpu on VHE hyp panic path On VHE, __hyp_call_panic() unconditionally calls __deactivate_traps(vcpu) on the vcpu pointer read from host_ctxt->__hyp_running_vcpu. That pointer is cleared after every guest exit (and is never set when no guest is running), so an unexpected EL2 exception landing in _guest_exit_panic, e.g. via the el2t*_invalid / el2h_irq_invalid vectors - reaches this function with vcpu == NULL. __deactivate_traps() then dereferences vcpu via ___deactivate_traps() -> vserror_state_is_nested() -> vcpu_has_nv() -> vcpu->arch.features, faulting inside the panic handler and obscuring the original failure. The nVHE counterpart (hyp_panic() in arch/arm64/kvm/hyp/nvhe/switch.c) already guards its vcpu-using cleanup with "if (vcpu)"; mirror that here. sysreg_restore_host_state_vhe() does not depend on vcpu and continues to run unconditionally, preserving panic forensics. The trailing panic("...VCPU:%p", vcpu) prints "(null)" safely via printk's %p handling. Fixes: 6a0259ed29bb ("KVM: arm64: Remove hyp_panic arguments") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 9db3f11a4754..1e8995add14f 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -663,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; - __deactivate_traps(vcpu); + if (vcpu) + __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", From d4d215e5b81ba5acb17752cab12c514a8062bada Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:46 +0100 Subject: [PATCH 055/377] KVM: arm64: Fix __deactivate_fgt macro parameter typo __deactivate_fgt() declares its first parameter as "htcxt" but the body references "hctxt". The parameter is unused; the macro silently captures "hctxt" from the enclosing scope. Both existing callers (__deactivate_traps_hfgxtr() and __deactivate_traps_ich_hfgxtr()) happen to define a local "struct kvm_cpu_context *hctxt", so the macro works by coincidence. A future caller without an "hctxt" local in scope, or naming it differently, would compile but bind to the wrong context. Align the parameter name with the sibling __activate_fgt() macro. The "vcpu" parameter remains unused in the body, kept for API symmetry with __activate_fgt() (which uses it). Fixes: f5a5a406b4b8 ("KVM: arm64: Propagate and handle Fine-Grained UNDEF bits") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 98b2976837b1..bf0eb5e43427 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -245,7 +245,7 @@ static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); } -#define __deactivate_fgt(htcxt, vcpu, reg) \ +#define __deactivate_fgt(hctxt, vcpu, reg) \ do { \ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ SYS_ ## reg); \ From 5130d450d1488e62e1b5310f41910a3c7320e827 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:47 +0100 Subject: [PATCH 056/377] KVM: arm64: Seed pkvm_ownership_selftest vcpu memcache The hypercall handlers call pkvm_refill_memcache() to top up the hyp_vcpu memcache before invoking __pkvm_host_{share,donate}_guest(). pkvm_ownership_selftest invokes those functions directly with a static selftest_vcpu that has an empty memcache. Seed selftest_vcpu's memcache from the prepopulated selftest pages, leaving the remainder for selftest_vm.pool. Required by the memcache-sufficiency pre-check added in the following patches. Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index e7496eb85628..eb1c10120f9f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -752,16 +752,30 @@ static struct pkvm_hyp_vcpu selftest_vcpu = { struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) { struct hyp_page *p = hyp_virt_to_page(virt); + unsigned long min_pages, seeded = 0; int i; selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + /* + * Mirror pkvm_refill_memcache() for the share/donate pre-checks; + * the selftest invokes those functions directly and would + * otherwise see an empty memcache. + */ + min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); + for (i = 0; i < pkvm_selftest_pages(); i++) { if (p[i].refcount) continue; p[i].refcount = 1; - hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + if (seeded < min_pages) { + push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, + hyp_page_to_virt(&p[i]), hyp_virt_to_phys); + seeded++; + } else { + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); From 8234409ffb656970e2f5b29e416f041419980bef Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:48 +0100 Subject: [PATCH 057/377] KVM: arm64: Pre-check vcpu memcache for host->guest share __pkvm_host_share_guest() ends with kvm_pgtable_stage2_map() to install the guest stage-2 mapping, after a forward pass that mutates the host vmemmap (sets PKVM_PAGE_SHARED_OWNED and increments host_share_guest_count) for every page in the range. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM when the stage-2 walker exhausts the caller's memcache, and the host controls the vcpu memcache via the topup interface, so an under-provisioned share request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation in the existing pre-check pass so that kvm_pgtable_stage2_map() cannot fail at the call site, using kvm_mmu_cache_min_pages() -- the same bound host EL1 uses for its own stage-2 maps. If the vcpu memcache holds fewer pages, return -ENOMEM before any state mutation. Fixes: d0bd3e6570ae ("KVM: arm64: Introduce __pkvm_host_share_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a3050e2b65b1..ffb123fc1145 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1390,6 +1390,22 @@ int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret && ret != -EHWPOISON ? ret : 0; } +/* + * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one + * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages() + * bounds the worst-case allocation: exact for the PAGE_SIZE leaf, + * conservative by one for the block. + */ +static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu)) + return -ENOMEM; + + return 0; +} + int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); @@ -1474,6 +1490,10 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu } } + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + for_each_hyp_page(page, phys, size) { set_host_state(page, PKVM_PAGE_SHARED_OWNED); page->host_share_guest_count++; From effc0a39b8e0f30670fe24f51e44329d4324e566 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:49 +0100 Subject: [PATCH 058/377] KVM: arm64: Pre-check vcpu memcache for host->guest donate __pkvm_host_donate_guest() flips the host stage-2 PTE for the donated page to a non-valid annotation via host_stage2_set_owner_metadata_locked() and then calls kvm_pgtable_stage2_map() to install the matching guest stage-2 mapping. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM even at PAGE_SIZE granularity: the donate path verifies PKVM_NOPAGE for the guest IPA before the map, so the walker must allocate fresh page-table pages from the vcpu memcache, and the host controls the vcpu memcache via the topup interface. An under-provisioned donation request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation alongside the existing __host_check_page_state_range() / __guest_check_page_state_range() pre-checks, using the helper introduced for host->guest share. If the vcpu memcache holds fewer pages than kvm_mmu_cache_min_pages(), return -ENOMEM before any state mutation. Fixes: 1e579adca177 ("KVM: arm64: Introduce __pkvm_host_donate_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-7-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ffb123fc1145..25f04629014e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1425,6 +1425,10 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (ret) goto unlock; + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + meta = host_stage2_encode_gfn_meta(vm, gfn); WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, PKVM_ID_GUEST, meta)); From 74570e12b4705ea11dcdfbfbd0a0b0fdaeff3059 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Sun, 19 Apr 2026 16:17:15 +0900 Subject: [PATCH 059/377] accel/rocket: Fix prep_bo ioctl leaking positive return from dma_resv_wait_timeout() dma_resv_wait_timeout() returns a positive 'remaining jiffies' value on success, 0 on timeout, and -errno on failure. rocket_ioctl_prep_bo() returns this 'long' result from an int-typed ioctl handler, so positive values reach userspace as bogus errors. Explicitly set ret to 0 on the success path. Fixes: 525ad89dd904 ("accel/rocket: Add IOCTLs for synchronizing memory accesses") Cc: stable@vger.kernel.org Signed-off-by: Gyeyoung Baek Reviewed-by: Tomeu Vizoso Link: https://patch.msgid.link/c0ebf83b345721701b22d8f5bc41c52c0ecf5e16.1776581974.git.gye976@gmail.com Signed-off-by: Steven Price --- drivers/accel/rocket/rocket_gem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/accel/rocket/rocket_gem.c b/drivers/accel/rocket/rocket_gem.c index b6a385d2edfc..c8084719208a 100644 --- a/drivers/accel/rocket/rocket_gem.c +++ b/drivers/accel/rocket/rocket_gem.c @@ -145,6 +145,8 @@ int rocket_ioctl_prep_bo(struct drm_device *dev, void *data, struct drm_file *fi ret = dma_resv_wait_timeout(gem_obj->resv, DMA_RESV_USAGE_WRITE, true, timeout); if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY; + else if (ret > 0) + ret = 0; shmem_obj = &to_rocket_bo(gem_obj)->base; From 459d75523b71c0ec254d153d8850d0b7008af396 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Sun, 19 Apr 2026 16:17:16 +0900 Subject: [PATCH 060/377] drm/panfrost: Fix wait_bo ioctl leaking positive return from dma_resv_wait_timeout() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dma_resv_wait_timeout() returns a positive 'remaining jiffies' value on success, 0 on timeout, and -errno on failure. panfrost_ioctl_wait_bo() returns this 'long' result from an int-typed ioctl handler, so positive values reach userspace as bogus errors. Explicitly set ret to 0 on the success path. Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver") Cc: stable@vger.kernel.org Signed-off-by: Gyeyoung Baek Reviewed-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price Link: https://patch.msgid.link/fe33f82fded7be1c18e2e0eb2db451d5a738cf39.1776581974.git.gye976@gmail.com Signed-off-by: Steven Price --- drivers/gpu/drm/panfrost/panfrost_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 711f5101aa04..074c0995ddc2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -390,6 +390,8 @@ panfrost_ioctl_wait_bo(struct drm_device *dev, void *data, true, timeout); if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY; + else if (ret > 0) + ret = 0; drm_gem_object_put(gem_obj); From a59e45221df82e8a6246c617615c1ccc12e3545d Mon Sep 17 00:00:00 2001 From: Haichen Feng <2806891994@qq.com> Date: Thu, 7 May 2026 22:02:42 +0800 Subject: [PATCH 061/377] platform/x86: hp-wmi: Add support for Victus 16-r0xxx (8BC2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP Victus 16-r0xxx (board ID: 8BC2) has the same WMI as other Victus S boards, but requires quirks for correctly switching thermal profile. Add the DMI board name to victus_s_thermal_profile_boards[] table and map it to omen_v1_thermal_params. Testing on board 8BC2 confirmed that platform profile is registered successfully and fan RPMs are readable and controllable. Signed-off-by: Haichen Feng <2806891994@qq.com> Link: https://patch.msgid.link/tencent_8E29805D8DC7B6005244C3433C62DD9DF606@qq.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 24c151289dd3..6950bec2a9d8 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -205,6 +205,10 @@ static const struct dmi_system_id victus_s_thermal_profile_boards[] __initconst .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BBE") }, .driver_data = (void *)&victus_s_thermal_params, }, + { + .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BC2") }, + .driver_data = (void *)&omen_v1_thermal_params, + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BCA") }, .driver_data = (void *)&omen_v1_thermal_params, From aa2fbece1b07954ef26488c800d126a36a8ab93e Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Tue, 28 Apr 2026 16:01:39 +0800 Subject: [PATCH 062/377] ALSA: hda: cs35l56: Put ACPI device after setting companion acpi_dev_get_first_match_dev() returns a refcounted ACPI device and callers are expected to balance it with acpi_dev_put(). When no companion is already attached, cs35l56_hda_read_acpi() looks up an ACPI device and sets it with ACPI_COMPANION_SET(), but leaves the lookup reference held. ACPI_COMPANION_SET() does not take ownership of that reference, so drop it with acpi_dev_put() after attaching the companion. Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Signed-off-by: Shuhao Fu Tested-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260428080139.GA1649104@chcpu16 --- sound/hda/codecs/side-codecs/cs35l56_hda.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index 4c8d01799931..cdbc576569ef 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -1041,6 +1041,7 @@ static int cs35l56_hda_read_acpi(struct cs35l56_hda *cs35l56, int hid, int id) return -ENODEV; } ACPI_COMPANION_SET(cs35l56->base.dev, adev); + acpi_dev_put(adev); } /* Initialize things that could be overwritten by a fixup */ From fca7401fe37f7abc6e54147ea560f37279231137 Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Tue, 28 Apr 2026 16:12:38 +0800 Subject: [PATCH 063/377] ALSA: hda: cs35l41: Put ACPI device on missing physical node acpi_dev_get_first_match_dev() returns a refcounted ACPI device and callers must balance it with acpi_dev_put(). cs35l41_hda_read_acpi() stores the returned ACPI device in cs35l41->dacpi. That reference is normally released by the later probe cleanup or the remove path, but the NULL-check on physdev exits before either of those paths can run. Drop the lookup reference before returning -ENODEV. Fixes: c34b04cc6178 ("ALSA: hda: cs35l41: Fix NULL pointer dereference in cs35l41_hda_read_acpi()") Signed-off-by: Shuhao Fu Tested-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260428081238.GA1659932@chcpu16 --- sound/hda/codecs/side-codecs/cs35l41_hda.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda.c b/sound/hda/codecs/side-codecs/cs35l41_hda.c index b64890006bb7..acfccc848f82 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda.c @@ -1896,8 +1896,10 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i cs35l41->dacpi = adev; physdev = get_device(acpi_get_first_physical_node(adev)); - if (!physdev) + if (!physdev) { + acpi_dev_put(adev); return -ENODEV; + } sub = acpi_get_subsystem_id(ACPI_HANDLE(physdev)); if (IS_ERR(sub)) From 4616a9c36be7e2e051ef53b0e8fd729da0277abf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 11:05:31 -1000 Subject: [PATCH 064/377] sched_ext: Move scx_error() out of scx_link_sched()'s lock region scx_link_sched() holds scx_sched_lock. The scx_error() calls inside take the same lock through scx_claim_exit() and deadlock. Move them out of the guard. Fixes: 6b4576b09714 ("sched_ext: Reject sub-sched attachment to a disabled parent") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3f0d8aeaed81..7d367c140a36 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5585,10 +5585,12 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { + const char *err_msg; + s32 ret = 0; + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *parent = scx_parent(sch); - s32 ret; if (parent) { /* @@ -5598,15 +5600,16 @@ static s32 scx_link_sched(struct scx_sched *sch) * parent can shoot us down. */ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { - scx_error(sch, "parent disabled"); - return -ENOENT; + err_msg = "parent disabled"; + ret = -ENOENT; + break; } ret = rhashtable_lookup_insert_fast(&scx_sched_hash, &sch->hash_node, scx_sched_hash_params); if (ret) { - scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); - return ret; + err_msg = "failed to insert into scx_sched_hash"; + break; } list_add_tail(&sch->sibling, &parent->children); @@ -5616,6 +5619,15 @@ static s32 scx_link_sched(struct scx_sched *sch) list_add_tail_rcu(&sch->all, &scx_sched_all); } + /* + * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after + * the guard above is released. + */ + if (ret) { + scx_error(sch, "%s (%d)", err_msg, ret); + return ret; + } + refresh_watchdog(); return 0; } From dde2f938d02f2c740d49bb5113dea941f941026a Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 7 May 2026 18:54:34 +0800 Subject: [PATCH 065/377] cgroup/cpuset: move PF_EXITING check before __GFP_HARDWALL in cpuset_current_node_allowed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since prepare_alloc_pages() unconditionally adds __GFP_HARDWALL for the fast path when cpusets are enabled, the __GFP_HARDWALL check in cpuset_current_node_allowed() causes the PF_EXITING escape path to be skipped on the first allocation attempt. This makes it unreachable in the common case, so dying tasks can get stuck in direct reclaim or even trigger OOM while trying to exit, despite being allowed to allocate from any node. Move the PF_EXITING check before __GFP_HARDWALL so that dying tasks can allocate memory from any node to exit quickly, even when cpusets are enabled. Also update the function comment to reflect the actual behavior of prepare_alloc_pages() and the corrected check ordering. Signed-off-by: Chen Wandun Acked-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e3a081a07c6d..a48901a0416a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4176,11 +4176,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. - * Otherwise, no. + * If the current task is PF_EXITING, yes. Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed. + * unless the task has been OOM killed or is exiting. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -4194,7 +4194,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). + * in interrupt, of course). The PF_EXITING check must therefore + * come before the __GFP_HARDWALL check, otherwise a dying task + * would be blocked on the fast path. * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -4204,6 +4206,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok + * PF_EXITING - any node ok (let dying task exit quickly) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -4223,11 +4226,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) */ if (unlikely(tsk_is_oom_victim(current))) return true; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return false; - if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return false; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); From 363a53749cc483409498e8f6e1525fe081f1d9d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 12:09:21 -1000 Subject: [PATCH 066/377] sched_ext: Drop unused scx_find_sub_sched() stub scx_find_sub_sched()'s only caller, scx_bpf_sub_dispatch(), is gated on CONFIG_EXT_SUB_SCHED. When CONFIG_EXT_SUB_SCHED=n the caller compiles out and the stub becomes dead code, tripping -Wunused-function on randconfigs. Drop the stub. Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202605080556.42PXw8U9-lkp@intel.com/ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7d367c140a36..48b4834c7027 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -297,7 +297,6 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } -static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ From fc51cba3ebae67f967120e27162e94cfb8594479 Mon Sep 17 00:00:00 2001 From: ZhengYuan Huang Date: Wed, 25 Mar 2026 08:43:39 +0800 Subject: [PATCH 067/377] btrfs: fix check_chunk_block_group_mappings() to iterate all chunk maps [BUG] A corrupted image with a chunk present in the chunk tree but whose corresponding block group item is missing from the extent tree can be mounted successfully, even though check_chunk_block_group_mappings() is supposed to catch exactly this corruption at mount time. Once mounted, running btrfs balance with a usage filter (-dusage=N or -dusage=min..max) triggers a null-ptr-deref: KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] RIP: 0010:chunk_usage_filter fs/btrfs/volumes.c:3874 [inline] RIP: 0010:should_balance_chunk fs/btrfs/volumes.c:4018 [inline] RIP: 0010:__btrfs_balance fs/btrfs/volumes.c:4172 [inline] RIP: 0010:btrfs_balance+0x2024/0x42b0 fs/btrfs/volumes.c:4604 [CAUSE] The crash occurs because __btrfs_balance() iterates the on-disk chunk tree, finds the orphaned chunk, calls chunk_usage_filter() (or chunk_usage_range_filter()), which queries the in-memory block group cache via btrfs_lookup_block_group(). Since no block group was ever inserted for this chunk, the lookup returns NULL, and the subsequent dereference of cache->used crashes. check_chunk_block_group_mappings() uses btrfs_find_chunk_map() to iterate the in-memory chunk map (fs_info->mapping_tree): map = btrfs_find_chunk_map(fs_info, start, 1); With @start = 0 and @length = 1, btrfs_find_chunk_map() looks for a chunk map that *contains* the logical address 0. If no chunk contains logical address 0, btrfs_find_chunk_map(fs_info, 0, 1) returns NULL immediately and the loop breaks after the very first iteration, having checked zero chunks. The entire verification function is therefore a no-op, and the corrupted image passes the mount-time check undetected. [FIX] Replace the btrfs_find_chunk_map() based loop with a direct in-order walk of fs_info->mapping_tree using rb_first_cached() + rb_next(). This guarantees that every chunk map in the tree is visited regardless of the logical addresses involved. No lock is taken around the traversal. This function is called during mount from btrfs_read_block_groups(), which is invoked from open_ctree() before any background threads (cleaner, transaction kthread, etc.) are started. There are therefore no concurrent writers that could modify mapping_tree at this point. An analogous lockless direct traversal of mapping_tree already exists in fill_dummy_bgs() in the same file. Since we walk the rb-tree directly via rb_entry() without going through btrfs_find_chunk_map(), no reference is taken on each map entry, so the btrfs_free_chunk_map() calls are also removed. Signed-off-by: ZhengYuan Huang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e6f5a17a13e3..b611c64119db 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2412,29 +2412,25 @@ static struct btrfs_block_group *btrfs_create_block_group( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - u64 start = 0; + struct rb_node *node; int ret = 0; - while (1) { + /* + * This is called during mount from btrfs_read_block_groups(), before + * any background threads are started, so no concurrent writers can + * modify the mapping_tree. No lock is needed here. + */ + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - /* - * btrfs_find_chunk_map() will return the first chunk map - * intersecting the range, so setting @length to 1 is enough to - * get the first chunk. - */ - map = btrfs_find_chunk_map(fs_info, start, 1); - if (!map) - break; - + map = rb_entry(node, struct btrfs_chunk_map, rb_node); bg = btrfs_lookup_block_group(fs_info, map->start); if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); ret = -EUCLEAN; - btrfs_free_chunk_map(map); break; } if (unlikely(bg->start != map->start || bg->length != map->chunk_len || @@ -2447,12 +2443,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; From 4822703b150fc25f7bdb8cf266a482619881a97e Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Wed, 29 Apr 2026 00:10:25 -0700 Subject: [PATCH 068/377] btrfs: always pass __GFP_NOWARN from add_ra_bio_pages() A build workload newly prints order-0 allocation failures on 7.1-rc1: sh: page allocation failure: order:0 mode:0x14084a(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_IO|__GFP_KSWAPD_RECLAIM| __GFP_COMP|__GFP_HARDWALL) CPU: 27 UID: 1000 PID: 855540 Comm: sh Not tainted 7.1.0-rc1-llvm-00058-gdca922e019dd #1 PREEMPTLAZY Call Trace: dump_stack_lvl+0x50/0x70 warn_alloc+0xeb/0x100 __alloc_pages_slowpath+0x567/0x5a0 ? filemap_get_entry+0x11a/0x140 __alloc_frozen_pages_noprof+0x249/0x2d0 alloc_pages_mpol+0xe4/0x180 folio_alloc_noprof+0x80/0xa0 add_ra_bio_pages+0x13c/0x4b0 btrfs_submit_compressed_read+0x229/0x300 submit_one_bio+0x9e/0xe0 btrfs_readahead+0x185/0x1a0 [...] (lldb) source list -a add_ra_bio_pages+0x13c .../vmlinux.unstripped add_ra_bio_pages + 316 at .../fs/btrfs/compression.c:454:8 451 452 folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), 453 0, NULL); -> 454 if (!folio) 455 break; I can reproduce this consistently by running a memory hog concurrently with a buffered writer on a machine with a very large amount of swap. Commit 7ae37b2c94ed ("btrfs: prevent direct reclaim during compressed readahead") clearly intended to suppress these warnings. But because the mask set in the address_space with mapping_set_gfp_mask() doesn't include __GFP_NOWARN, mapping_gfp_constraint() removes it from constraint_gfp before it is passed to filemap_alloc_folio(). Fix by refactoring the code to add __GFP_NOWARN after the call to mapping_gfp_constraint(). Fixes: 7ae37b2c94ed ("btrfs: prevent direct reclaim during compressed readahead") Signed-off-by: Calvin Owens Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c5783ac1b646..e2ef01a59d04 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -407,22 +407,18 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - /* - * Avoid direct reclaim when the caller does not allow it. Since - * add_ra_bio_pages() is always speculative, suppress allocation warnings - * in either case. - */ + /* Avoid direct reclaim when the caller does not allow it. */ + constraint_gfp = ~__GFP_FS; + cache_gfp = GFP_NOFS | __GFP_NOWARN; if (!direct_reclaim) { - constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - } else { - constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; - cache_gfp = GFP_NOFS | __GFP_NOWARN; + constraint_gfp &= ~__GFP_DIRECT_RECLAIM; + cache_gfp &= ~__GFP_DIRECT_RECLAIM; } while (cur < compressed_end) { pgoff_t page_end; pgoff_t pg_index = cur >> PAGE_SHIFT; + gfp_t masked_constraint_gfp; u32 add_size; if (pg_index > end_index) @@ -449,8 +445,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), - 0, NULL); + /* + * Since add_ra_bio_pages() is always speculative, suppress + * allocation warnings. + */ + masked_constraint_gfp = mapping_gfp_constraint(mapping, constraint_gfp); + masked_constraint_gfp |= __GFP_NOWARN; + + folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL); if (!folio) break; From c73370c677646e86fc4b1780fb07027bdf847375 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 28 Apr 2026 16:58:56 +0100 Subject: [PATCH 069/377] btrfs: tracepoints: fix sleep while in atomic context in btrfs_sync_file() The trace event btrfs_sync_file() is called in an atomic context (all trace events are) and its call to dput(), which is needed due to the call to dget_parent(), can sleep, triggering a kernel splat. This can be reproduced by enabling the trace event and running btrfs/056 from fstests for example. The splat shown in dmesg is the following: [53.919] BUG: sleeping function called from invalid context at fs/dcache.c:970 [53.947] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 32773, name: xfs_io [53.988] preempt_count: 2, expected: 0 [53.967] RCU nest depth: 0, expected: 0 [53.943] Preemption disabled at: [53.944] [<0000000000000000>] 0x0 [54.078] CPU: 0 UID: 0 PID: 32773 Comm: xfs_io Tainted: G W 7.1.0-rc1-btrfs-next-232+ #1 PREEMPT(full) [54.070] Tainted: [W]=WARN [54.071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [54.072] Call Trace: [54.074] [54.076] dump_stack_lvl+0x56/0x80 [54.079] __might_resched.cold+0xd6/0x10f [54.072] dput.part.0+0x24/0x110 [54.078] trace_event_raw_event_btrfs_sync_file+0x75/0x140 [btrfs] [54.089] btrfs_sync_file+0x1ed/0x530 [btrfs] [54.087] ? __handle_mm_fault+0x8ae/0xed0 [54.089] btrfs_do_write_iter+0x172/0x210 [btrfs] [54.091] vfs_write+0x21f/0x450 [54.094] __x64_sys_pwrite64+0x8d/0xc0 [54.096] ? do_user_addr_fault+0x20c/0x670 [54.099] do_syscall_64+0x60/0xf20 [54.092] ? clear_bhb_loop+0x60/0xb0 [54.094] entry_SYSCALL_64_after_hwframe+0x76/0x7e So stop using dget_parent() and dput() and access the parent dentry directly as dentry->d_parent. This is also what ext4 is doing in its equivalent trace event ext4_sync_file_enter(). Fixes: a85b46db143f ("btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file()") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ad7a2d76c1d..ec1df8b94517 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -771,10 +771,8 @@ TRACE_EVENT(btrfs_sync_file, TP_fast_assign( struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct dentry *parent = dget_parent(dentry); - struct inode *parent_inode = d_inode(parent); + struct inode *parent_inode = d_inode(dentry->d_parent); - dput(parent); TP_fast_assign_fsid(btrfs_sb(inode->i_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->parent = btrfs_ino(BTRFS_I(parent_inode)); From 4066c55e109475a06d18a1f127c939d551211956 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 30 Apr 2026 10:37:22 +0930 Subject: [PATCH 070/377] btrfs: only release the dirty pages io tree after successful writes [WARNING] With extra warning on dirty extent buffers at umount (aka, the next patch in the series), test case generic/388 can trigger the following warning about dirty extent buffers at unmount time: BTRFS critical (device dm-2 state E): emergency shutdown BTRFS error (device dm-2 state E): error while writing out transaction: -30 BTRFS warning (device dm-2 state E): Skipping commit of aborted transaction. BTRFS error (device dm-2 state EA): Transaction 9 aborted (error -30) BTRFS: error (device dm-2 state EA) in cleanup_transaction:2068: errno=-30 Readonly filesystem BTRFS info (device dm-2 state EA): forced readonly BTRFS info (device dm-2 state EA): last unmount of filesystem 4fbf2e15-f941-49a0-bc7c-716315d2777c ------------[ cut here ]------------ WARNING: disk-io.c:3311 at invalidate_and_check_btree_folios+0xfd/0x1ca [btrfs], CPU#8: umount/914368 CPU: 8 UID: 0 PID: 914368 Comm: umount Tainted: G OE 7.1.0-rc1-custom+ #372 PREEMPT(full) 2de38db8d1deae71fde295430a0ff3ab98ccf596 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:invalidate_and_check_btree_folios+0xfd/0x1ca [btrfs] Call Trace: close_ctree+0x52e/0x574 [btrfs d2f0b1cd330d1287e7a9919d112eadfc0e914efd] generic_shutdown_super+0x89/0x1a0 kill_anon_super+0x16/0x40 btrfs_kill_super+0x16/0x20 [btrfs d2f0b1cd330d1287e7a9919d112eadfc0e914efd] deactivate_locked_super+0x2d/0xb0 cleanup_mnt+0xdc/0x140 task_work_run+0x5a/0xa0 exit_to_user_mode_loop+0x123/0x4b0 do_syscall_64+0x243/0x7c0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ---[ end trace 0000000000000000 ]--- BTRFS warning (device dm-2 state EA): unable to release extent buffer 30539776 owner 9 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30621696 owner 257 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30638080 owner 258 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30654464 owner 7 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30703616 owner 2 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30720000 owner 10 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30736384 owner 4 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30752768 owner 11 gen 9 refs 2 flags 0x7 I'm using a stripped down version, which seems to trigger the warning more reliably: _fsstress_pid="" workload() { dmesg -C mkfs.btrfs -f -K $dev > /dev/null echo 1 > /sys/kernel/debug/clear_warn_once mount $dev $mnt $fsstress -w -n 1024 -p 4 -d $mnt & _fsstress_pid=$! sleep 0 $godown $mnt pkill --echo -PIPE fsstress > /dev/null wait $_fsstress_pid unset _fsstress_pid umount $mnt if dmesg | grep -q "WARNING"; then fail fi } for (( i = 0; i < $runtime; i++ )); do echo "=== $i/$runtime ===" workload done [CAUSE] Inside btrfs_write_and_wait_transaction(), we first try to write all dirty ebs, then wait for them to finish. After that we call btrfs_extent_io_tree_release() to free all extent states from dirty_pages io tree. However if we hit an error from btrfs_write_marked_extent(), then we still call btrfs_extent_io_tree_release() to clear that dirty_pages io tree, which may contain dirty records that we haven't yet submitted. Furthermore, the later transaction cleanup path will utilize that dirty_pages io tree to properly cleanup those dirty ebs, but since it's already empty, no dirty ebs are properly cleaned up, thus will later trigger the warnings inside invalidate_btree_folios(). [FIX] Normally such dirty ebs won't cause problems, as when the iput() is called on the btree inode, the dirty ebs will be forcibly written back, and since the fs is already in an error status, such writeback will not reach disk and finish immediately. But it's still better to get rid of such dirty ebs, if we ended up with dirty ebs but the fs is not in an error status, then such writeback at iput() time will be too late, as all workers are already stopped but writeback will utilize workers, which will lead to NULL pointer dereferences. Instead of unconditionally calling btrfs_extent_io_tree_release(), only call it if btrfs_write_and_wait_transaction() finished successfully, so that @dirty_pages extent io tree is kept untouched for transaction cleanup. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/transaction.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8a11be02eeb9..c0a30bb213d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4686,6 +4686,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } + btrfs_extent_io_tree_release(dirty_pages); } static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 248adb785051..194f581b36f3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1293,14 +1293,13 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); - if (ret) return ret; - else if (ret2) + if (ret2) return ret2; - else - return 0; + + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); + return 0; } /* From c562ba61fc5e11798720acc1b172862158f1fa0b Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Fri, 1 May 2026 10:41:56 +0800 Subject: [PATCH 071/377] btrfs: fix incorrect i_size after remount caused by KEEP_SIZE prealloc gap When fallocate() with FALLOC_FL_KEEP_SIZE preallocates an extent past the current i_size, the file_extent_tree of the inode is updated to cover that range. However, on the next mount, btrfs_read_locked_inode() only re-populates file_extent_tree with [0, round_up(i_size, sectorsize)), losing the marks that belonged to the KEEP_SIZE prealloc extent beyond i_size. Later, when a non-KEEP_SIZE fallocate() extends i_size into / past that old prealloc extent, the reservation loop in btrfs_fallocate() skips already-prealloc segments and does not call into the path that marks the file_extent_tree, so a gap remains inside the file_extent_tree across [old_aligned_i_size, start_of_new_alloc). Then __btrfs_prealloc_file_range() calls btrfs_inode_safe_disk_i_size_write(), which uses find_contiguous_extent_bit() starting at offset 0 to derive disk_i_size. The walk stops at the gap, so disk_i_size ends up smaller than i_size and gets persisted. After the next mount, the file shows the wrong (smaller) size. The following reproducer triggers the problem: $ cat test.sh MNT=/mnt/sdi DEV=/dev/sdi mkdir -p $MNT mkfs.btrfs -f -O ^no-holes $DEV mount $DEV $MNT touch $MNT/file1 # KEEP_SIZE prealloc beyond i_size (i_size stays 0) fallocate -n -o 4M -l 4M $MNT/file1 umount $MNT mount $DEV $MNT # non-KEEP_SIZE fallocate that overlaps the previous prealloc tail # and extends past it fallocate -o 7M -l 2M $MNT/file1 ls -lh $MNT/file1 umount $MNT mount $DEV $MNT ls -lh $MNT/file1 umount $MNT Running the reproducer gives the following result: $ ./test.sh (...) -rw-rw-r-- 1 root root 9.0M May 4 16:35 /mnt/sdi/file1 -rw-rw-r-- 1 root root 7.0M May 4 16:35 /mnt/sdi/file1 The size before the second mount is correct (9M), but after the remount it drops to 7M, i.e. the start of the gap inside file_extent_tree. Fix this in __btrfs_prealloc_file_range() by marking the entire range [round_down(old_i_size, sectorsize), round_up(new_i_size, sectorsize)) in file_extent_tree before updating i_size and calling btrfs_inode_safe_disk_i_size_write(). This ensures the contiguous bit search starting from 0 is not truncated by a stale gap left behind by a previous KEEP_SIZE prealloc that was not restored on inode load. The fix has no effect when the NO_HOLES feature is enabled because btrfs_inode_safe_disk_i_size_write() and btrfs_inode_set_file_extent_range() both take the fast path that directly tracks disk_i_size without consulting file_extent_tree. Fixes: 9ddc959e802b ("btrfs: use the file extent tree infrastructure") Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko [ Minor updates to the change log ] Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 906d5c21ebc4..75136a172710 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9299,10 +9299,38 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { + u64 range_start; + u64 range_end; + if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; + + /* + * Make sure the file_extent_tree covers the entire + * range [old_i_size, new_i_size) before we update + * disk_i_size. Without this, a previous KEEP_SIZE + * prealloc that extended past i_size (and was lost + * across umount/mount because file_extent_tree is + * only populated up to round_up(i_size) on inode + * load) can leave a gap inside this range. That gap + * would cause btrfs_inode_safe_disk_i_size_write() + * (via find_contiguous_extent_bit() starting at 0) + * to truncate disk_i_size to the start of the gap, + * making the persisted size smaller than i_size. + */ + range_start = round_down(inode->i_size, fs_info->sectorsize); + range_end = round_up(i_size, fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + range_start, range_end - range_start); + if (ret) { + btrfs_abort_transaction(trans, ret); + if (own_trans) + btrfs_end_transaction(trans); + break; + } + i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } From 8e72510db9fa2d41f2b06d5c01fe9020e076fee4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:13 +0200 Subject: [PATCH 072/377] netfilter: x_tables: allow initial table replace without emitting audit log message At the moment we emit the audit log a bit too early, which makes it necessary to also emit an unregister log in case we have to unwind errors after possible hook register failure. Followup patch will be slightly simpler if we can delay the register message until after the hooks have been wired up. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2c67c2e6b132..bb0cb3959551 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1472,11 +1472,9 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); -struct xt_table_info * -xt_replace_table(struct xt_table *table, - unsigned int num_counters, - struct xt_table_info *newinfo, - int *error) +static struct xt_table_info * +do_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; @@ -1531,10 +1529,23 @@ xt_replace_table(struct xt_table *table, } } - audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE, - GFP_KERNEL); + return private; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + + private = do_replace_table(table, num_counters, newinfo, error); + if (private) + audit_log_nfcfg(table->name, table->af, private->number, + !private->number ? AUDIT_XT_OP_REGISTER : + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); From b62eb8dcf2c47d4d676a434efbd57c4f776f7829 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:14 +0200 Subject: [PATCH 073/377] netfilter: x_tables: allocate hook ops while under mutex arp/ip(6)t_register_table() add the table to the per-netns list via xt_register_table() before allocating the per-netns hook ops copy via kmemdup_array(). This leaves a window where the table is visible in the list with ops=NULL. If the pernet exit happens runs concurrently the pre_exit callback finds the table via xt_find_table() and passes the NULL ops pointer to nf_unregister_net_hooks(), causing a NULL dereference: general protection fault in nf_unregister_net_hooks+0xbc/0x150 RIP: nf_unregister_net_hooks (net/netfilter/core.c:613) Call Trace: ipt_unregister_table_pre_exit iptable_mangle_net_pre_exit ops_pre_exit_list cleanup_net Fix by moving the ops allocation into the xtables core so the table is never in the list without valid ops. Also ensure the table is no longer processing packets before its torn down on error unwind. nf_register_net_hooks might have published at least one hook; call synchronize_rcu() if there was an error. audit log register message gets deferred until all operations have passed, this avoids need to emit another ureg message in case of error unwinding. Based on earlier patch by Tristan Madani. Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Fixes: ee177a54413a ("netfilter: ip6_tables: pass table pointer via nf_hook_ops") Fixes: ae689334225f ("netfilter: ip_tables: pass table pointer via nf_hook_ops") Link: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + net/ipv4/netfilter/arp_tables.c | 35 +++------------------ net/ipv4/netfilter/ip_tables.c | 41 +++--------------------- net/ipv6/netfilter/ip6_tables.c | 38 +++-------------------- net/netfilter/x_tables.c | 50 +++++++++++++++++++++++++----- 5 files changed, 55 insertions(+), 110 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118..cb4b694dd9e4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,6 +305,7 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97ead883e4a1..c02e46a0271a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1522,13 +1522,11 @@ int arpt_register_table(struct net *net, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1543,7 +1541,7 @@ int arpt_register_table(struct net *net, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; @@ -1553,31 +1551,6 @@ int arpt_register_table(struct net *net, return PTR_ERR(new_table); } - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __arpt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 23c8deff8095..488c5945ebb2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1724,13 +1724,11 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1745,7 +1743,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; @@ -1755,37 +1753,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - /* No template? No need to do anything. This is used by 'nat' table, it registers - * with the nat core instead of the netfilter core. - */ - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ipt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d585ac3c1113..dbe7c7acd702 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1733,13 +1733,11 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1754,7 +1752,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; @@ -1764,34 +1762,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ip6t_unregister_table(net, new_table); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bb0cb3959551..06f27bea9eed 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1542,7 +1542,6 @@ xt_replace_table(struct xt_table *table, unsigned int num_counters, private = do_replace_table(table, num_counters, newinfo, error); if (private) audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : AUDIT_XT_OP_REPLACE, GFP_KERNEL); @@ -1552,20 +1551,32 @@ EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t, *table = NULL; + struct nf_hook_ops *ops = NULL; struct xt_table_info *private; - struct xt_table *t, *table; - int ret; + unsigned int num_ops; + int ret = -EINVAL; + + num_ops = hweight32(input_table->valid_hooks); + if (num_ops == 0) + goto out; + + ret = -ENOMEM; + if (template_ops) { + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); + if (!ops) + goto out; + } /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); - if (!table) { - ret = -ENOMEM; + if (!table) goto out; - } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ @@ -1579,7 +1590,7 @@ struct xt_table *xt_register_table(struct net *net, /* Simplifies replace_table code. */ table->private = bootstrap; - if (!xt_replace_table(table, 0, newinfo, &ret)) + if (!do_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; @@ -1588,14 +1599,37 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; + if (ops) { + int i; + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) { + mutex_unlock(&xt[table->af].mutex); + /* nf_register_net_hooks() might have published a + * base chain before internal error unwind. + */ + synchronize_rcu(); + goto out; + } + + table->ops = ops; + } + + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REGISTER, GFP_KERNEL); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); - kfree(table); out: + kfree(table); + kfree(ops); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); From 527d6931473b75d90e38942aae6537d1a527f1fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:15 +0200 Subject: [PATCH 074/377] netfilter: x_tables: add and use xt_unregister_table_pre_exit Remove the copypasted variants of _pre_exit and add one single function in the xtables core. ebtables is not compatible with x_tables and therefore unchanged. This is a preparation patch to reduce noise in the followup bug fixes. Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_arp/arp_tables.h | 1 - include/linux/netfilter_ipv4/ip_tables.h | 1 - include/linux/netfilter_ipv6/ip6_tables.h | 1 - net/ipv4/netfilter/arp_tables.c | 9 ------- net/ipv4/netfilter/arptable_filter.c | 2 +- net/ipv4/netfilter/ip_tables.c | 9 ------- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_nat.c | 1 + net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 9 ------- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_nat.c | 1 + net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- net/netfilter/x_tables.c | 29 +++++++++++++++++++++++ 19 files changed, 41 insertions(+), 39 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index cb4b694dd9e4..74486714ae20 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -309,6 +309,7 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa4..05631a25e622 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4d..13593391d605 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c76..c6d5b927830d 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c02e46a0271a..bd348b7bad2c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1554,15 +1554,6 @@ int arpt_register_table(struct net *net, return ret; } -void arpt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} -EXPORT_SYMBOL(arpt_unregister_table_pre_exit); - void arpt_unregister_table(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 78cd5ee24448..393d9a8c7739 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 488c5945ebb2..864489928fb5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1756,14 +1756,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } -void ipt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ipt_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); @@ -1854,7 +1846,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 3ab908b74795..b2fbd9651d61 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 385d945d8ebe..a99e61996197 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 625a1ca13b1b..8fc4912e790d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,6 +129,7 @@ static int iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { ipt_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); } static void __net_exit iptable_nat_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0e7f53964d0a..42511721e538 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index d885443cb267..4646bf6d7d2b 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dbe7c7acd702..edf50bc7787e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1765,14 +1765,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); @@ -1864,7 +1856,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index e8992693e14a..f05a9e4b2c67 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 8dd4cd0c47bd..afa4a5703e43 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 5be723232df8..bb8aa3fc42b4 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,6 +131,7 @@ static int ip6table_nat_table_init(struct net *net) static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { ip6t_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); } static void __net_exit ip6table_nat_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index fc9f6754028f..32d2da81c52a 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 4df14a9bae78..3dfd8d6ea4b9 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net) static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 06f27bea9eed..9c1e896c7b03 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1650,6 +1650,35 @@ void *xt_unregister_table(struct xt_table *table) return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); + +/** + * xt_unregister_table_pre_exit - pre-shutdown unregister of a table + * @net: network namespace + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Unregisters the specified netfilter table from the given network namespace + * and also unregisters the hooks from netfilter core: no new packets will be + * processed. + */ +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + + if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; + } + } + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_table_pre_exit); #endif #ifdef CONFIG_PROC_FS From d338693d778579b676a61346849bebd892427158 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:16 +0200 Subject: [PATCH 075/377] netfilter: x_tables: unregister the templates first When the module is going away we need to zap the template first. Else there is a small race window where userspace could instantiate a new table after the pernet exit function has removed the current table. Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arptable_filter.c | 2 +- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 393d9a8c7739..382345567a60 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -82,8 +82,8 @@ static int __init arptable_filter_init(void) static void __exit arptable_filter_fini(void) { - unregister_pernet_subsys(&arptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&arptable_filter_net_ops); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index b2fbd9651d61..0dea754a9120 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -101,8 +101,8 @@ static int __init iptable_filter_init(void) static void __exit iptable_filter_fini(void) { - unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&iptable_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index a99e61996197..4d3b12492308 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -135,8 +135,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&iptable_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 42511721e538..6f7afec7954b 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -100,9 +100,9 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { + xt_unregister_template(&packet_raw); unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); - xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 4646bf6d7d2b..81175c20ccbe 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -89,9 +89,9 @@ static int __init iptable_security_init(void) static void __exit iptable_security_fini(void) { + xt_unregister_template(&security_table); unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); - xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index f05a9e4b2c67..cf561919bde8 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -100,8 +100,8 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { - unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&ip6table_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index afa4a5703e43..1a758f2bc537 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -128,8 +128,8 @@ static int __init ip6table_mangle_init(void) static void __exit ip6table_mangle_fini(void) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&ip6table_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 32d2da81c52a..923455921c1d 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -98,8 +98,8 @@ static int __init ip6table_raw_init(void) static void __exit ip6table_raw_fini(void) { - unregister_pernet_subsys(&ip6table_raw_net_ops); xt_unregister_template(&packet_raw); + unregister_pernet_subsys(&ip6table_raw_net_ops); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 3dfd8d6ea4b9..c44834d93fc7 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -88,8 +88,8 @@ static int __init ip6table_security_init(void) static void __exit ip6table_security_fini(void) { - unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); + unregister_pernet_subsys(&ip6table_security_net_ops); kfree(sectbl_ops); } From b4597d5fd7d2f8cebfffd40dffb5e003cc78964c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:17 +0200 Subject: [PATCH 076/377] netfilter: x_tables: add and use xtables_unregister_table_exit Previous change added xtables_unregister_table_pre_exit to detach the table from the packetpath and to unlink it from the active table list. In case of rmmod, userspace that is doing set/getsockopt for this table will not be able to re-instantiate the table: 1. The larval table has been removed already 2. existing instantiated table is no longer on the xt pernet table list. This adds the second stage helper: unlink the table from the dying list, free the hook ops (if any) and do the audit notification. It replaces xt_unregister_table(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/ipv4/netfilter/arp_tables.c | 9 ++-- net/ipv4/netfilter/ip_tables.c | 9 ++-- net/ipv4/netfilter/iptable_nat.c | 5 +- net/ipv6/netfilter/ip6_tables.c | 9 ++-- net/ipv6/netfilter/ip6table_nat.c | 5 +- net/netfilter/x_tables.c | 81 +++++++++++++++++++++++------- 7 files changed, 83 insertions(+), 37 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 74486714ae20..5a1c5c336fa4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -308,8 +308,8 @@ struct xt_table *xt_register_table(struct net *net, const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index bd348b7bad2c..ad2259678c78 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len static void __arpt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; + void *loc_cpu_entry; struct arpt_entry *iter; - private = xt_unregister_table(table); - /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) @@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int arpt_register_table(struct net *net, @@ -1556,7 +1555,7 @@ int arpt_register_table(struct net *net, void arpt_unregister_table(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 864489928fb5..5cbdb0815857 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ipt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ipt_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1718,6 +1716,7 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ipt_register_table(struct net *net, const struct xt_table *table, @@ -1758,7 +1757,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, void ipt_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 8fc4912e790d..a0df72554025 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net) } ret = ipt_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); + synchronize_rcu(); ipt_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index edf50bc7787e..9d9c3763f2f5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ip6t_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1727,6 +1725,7 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ip6t_register_table(struct net *net, const struct xt_table *table, @@ -1767,7 +1766,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, void ip6t_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index bb8aa3fc42b4..c2394e2c94b5 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net) } ret = ip6t_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); + synchronize_rcu(); ip6t_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9c1e896c7b03..4e6708c23922 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; + + /* stash area used during netns exit */ + struct list_head dead_tables[NFPROTO_NUMPROTO]; }; struct compat_delta { @@ -1634,23 +1637,6 @@ struct xt_table *xt_register_table(struct net *net, } EXPORT_SYMBOL_GPL(xt_register_table); -void *xt_unregister_table(struct xt_table *table) -{ - struct xt_table_info *private; - - mutex_lock(&xt[table->af].mutex); - private = table->private; - list_del(&table->list); - mutex_unlock(&xt[table->af].mutex); - audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); - kfree(table->ops); - kfree(table); - - return private; -} -EXPORT_SYMBOL_GPL(xt_unregister_table); - /** * xt_unregister_table_pre_exit - pre-shutdown unregister of a table * @net: network namespace @@ -1660,6 +1646,14 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); * Unregisters the specified netfilter table from the given network namespace * and also unregisters the hooks from netfilter core: no new packets will be * processed. + * + * This must be called prior to xt_unregister_table_exit() from the pernet + * .pre_exit callback. After this call, the table is no longer visible to + * the get/setsockopt path. In case of rmmod, module exit path must have + * called xt_unregister_template() prior to unregistering pernet ops to + * prevent re-instantiation of the table. + * + * See also: xt_unregister_table_exit() */ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) { @@ -1669,6 +1663,7 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &xt_net->dead_tables[af]); mutex_unlock(&xt[af].mutex); if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ @@ -1679,6 +1674,50 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_table_pre_exit); + +/** + * xt_unregister_table_exit - remove a table during namespace teardown + * @net: the network namespace from which to unregister the table + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Completes the unregister process for a table. This must be called from + * the pernet ops .exit callback. This is the second stage after + * xt_unregister_table_pre_exit(). + * + * pair with xt_unregister_table_pre_exit() during namespace shutdown. + * + * Return: the unregistered table or NULL if the table was never + * instantiated. The caller needs to kfree() the table after it + * has removed the family specific matches/targets. + */ +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *table; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(table, &xt_net->dead_tables[af], list) { + struct nf_hook_ops *ops = NULL; + + if (strcmp(table->name, name) != 0) + continue; + + list_del(&table->list); + + audit_log_nfcfg(table->name, table->af, table->private->number, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + swap(table->ops, ops); + mutex_unlock(&xt[af].mutex); + + kfree(ops); + return table; + } + mutex_unlock(&xt[af].mutex); + + return NULL; +} +EXPORT_SYMBOL_GPL(xt_unregister_table_exit); #endif #ifdef CONFIG_PROC_FS @@ -2125,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { INIT_LIST_HEAD(&xt_net->tables[i]); + INIT_LIST_HEAD(&xt_net->dead_tables[i]); + } return 0; } @@ -2135,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i])); + } } static struct pernet_operations xt_net_ops = { From b7f0544d86d439cb946515d2ef6a0a75e8626710 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:18 +0200 Subject: [PATCH 077/377] netfilter: ebtables: move to two-stage removal scheme Like previous patches for x_tables, follow same pattern in ebtables. We can't reuse xt helpers: ebt_table struct layout is incompatible. table->ops assignment is now done while still holding the ebt mutex to make sure we never expose partially-filled table struct. Fixes: 87663c39f898 ("netfilter: ebtables: do not hook tables by default") Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 2 +- net/bridge/netfilter/ebtable_filter.c | 2 +- net/bridge/netfilter/ebtable_nat.c | 2 +- net/bridge/netfilter/ebtables.c | 60 +++++++++++++++++---------- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 741360219552..e6f9e343b41f 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -128,8 +128,8 @@ static int __init ebtable_broute_init(void) static void __exit ebtable_broute_fini(void) { - unregister_pernet_subsys(&broute_net_ops); ebt_unregister_template(&broute_table); + unregister_pernet_subsys(&broute_net_ops); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index dacd81b12e62..02b6501c15a5 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -109,8 +109,8 @@ static int __init ebtable_filter_init(void) static void __exit ebtable_filter_fini(void) { - unregister_pernet_subsys(&frame_filter_net_ops); ebt_unregister_template(&frame_filter); + unregister_pernet_subsys(&frame_filter_net_ops); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0f2a8c6118d4..9985a82555c4 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -109,8 +109,8 @@ static int __init ebtable_nat_init(void) static void __exit ebtable_nat_fini(void) { - unregister_pernet_subsys(&frame_nat_net_ops); ebt_unregister_template(&frame_nat); + unregister_pernet_subsys(&frame_nat_net_ops); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index aea3e19875c6..3578ffbc14ae 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -42,6 +42,7 @@ struct ebt_pernet { struct list_head tables; + struct list_head dead_tables; }; struct ebt_template { @@ -1162,11 +1163,6 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) @@ -1267,13 +1263,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, for (i = 0; i < num_ops; i++) ops[i].priv = table; - list_add(&table->list, &ebt_net->tables); - mutex_unlock(&ebt_mutex); - table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); - if (ret) + if (ret) { + synchronize_rcu(); __ebt_unregister_table(net, table); + } else { + list_add(&table->list, &ebt_net->tables); + } + mutex_unlock(&ebt_mutex); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1339,7 +1337,7 @@ void ebt_unregister_template(const struct ebt_table *t) } EXPORT_SYMBOL(ebt_unregister_template); -static struct ebt_table *__ebt_find_table(struct net *net, const char *name) +void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; @@ -1348,30 +1346,36 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &ebt_net->dead_tables); mutex_unlock(&ebt_mutex); - return t; + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; } } mutex_unlock(&ebt_mutex); - return NULL; -} - -void ebt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct ebt_table *table = __ebt_find_table(net, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); void ebt_unregister_table(struct net *net, const char *name) { - struct ebt_table *table = __ebt_find_table(net, name); + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_table *t; - if (table) - __ebt_unregister_table(net, table); + mutex_lock(&ebt_mutex); + + list_for_each_entry(t, &ebt_net->dead_tables, list) { + if (strcmp(t->name, name) == 0) { + list_del(&t->list); + audit_log_nfcfg(t->name, AF_BRIDGE, t->private->nentries, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + __ebt_unregister_table(net, t); + mutex_unlock(&ebt_mutex); + return; + } + } + + mutex_unlock(&ebt_mutex); } /* userspace just supplied us with counters */ @@ -2556,11 +2560,21 @@ static int __net_init ebt_pernet_init(struct net *net) struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); INIT_LIST_HEAD(&ebt_net->tables); + INIT_LIST_HEAD(&ebt_net->dead_tables); return 0; } +static void __net_exit ebt_pernet_exit(struct net *net) +{ + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + WARN_ON_ONCE(!list_empty(&ebt_net->tables)); + WARN_ON_ONCE(!list_empty(&ebt_net->dead_tables)); +} + static struct pernet_operations ebt_net_ops = { .init = ebt_pernet_init, + .exit = ebt_pernet_exit, .id = &ebt_pernet_id, .size = sizeof(struct ebt_pernet), }; From 92c603fa07bc0d6a17345de3ad7954730b8de44b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:19 +0200 Subject: [PATCH 078/377] netfilter: ebtables: close dangling table module init race sashiko reported for a related patch: In modules like iptable_raw.c, [..], if register_pernet_subsys() fails, the rollback might call kfree(rawtable_ops) before [..] During this window, could a concurrent userspace process find the globally visible template, trigger table_init(), [..] The table init functions must always register the template last. Otherwise, set/getsockopt can instantiate a table in a namespace while the required pernet ops (contain the destructor) isn't available. This change is also required in x_tables, handled in followup change. Fixes: 87663c39f898 ("netfilter: ebtables: do not hook tables by default") Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 12 +++++------- net/bridge/netfilter/ebtable_filter.c | 12 +++++------- net/bridge/netfilter/ebtable_nat.c | 10 ++++------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index e6f9e343b41f..f05c79f215ea 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -112,18 +112,16 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret = ebt_register_template(&broute_table, broute_table_init); + int ret = register_pernet_subsys(&broute_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&broute_net_ops); - if (ret) { - ebt_unregister_template(&broute_table); - return ret; - } + ret = ebt_register_template(&broute_table, broute_table_init); + if (ret) + unregister_pernet_subsys(&broute_net_ops); - return 0; + return ret; } static void __exit ebtable_broute_fini(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 02b6501c15a5..0fc03b07e62a 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,18 +93,16 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + int ret = register_pernet_subsys(&frame_filter_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret) { - ebt_unregister_template(&frame_filter); - return ret; - } + ret = ebt_register_template(&frame_filter, frame_filter_table_init); + if (ret) + unregister_pernet_subsys(&frame_filter_net_ops); - return 0; + return ret; } static void __exit ebtable_filter_fini(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 9985a82555c4..8a10375d8909 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,16 +93,14 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + int ret = register_pernet_subsys(&frame_nat_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret) { - ebt_unregister_template(&frame_nat); - return ret; - } + ret = ebt_register_template(&frame_nat, frame_nat_table_init); + if (ret) + unregister_pernet_subsys(&frame_nat_net_ops); return ret; } From 16bc4b6686b2c112c10e67d6b493adc3607256d3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:20 +0200 Subject: [PATCH 079/377] netfilter: x_tables: close dangling table module init race Similar to the previous ebtables patch: template add exposes the table to userspace, we must do this last to rnsure the pernet ops are set up (contain the destructors). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arptable_filter.c | 23 ++++++++++++----------- net/ipv4/netfilter/iptable_filter.c | 23 ++++++++++++----------- net/ipv4/netfilter/iptable_mangle.c | 25 +++++++++++++------------ net/ipv4/netfilter/iptable_raw.c | 22 +++++++++++----------- net/ipv4/netfilter/iptable_security.c | 23 ++++++++++++----------- net/ipv6/netfilter/ip6table_filter.c | 22 +++++++++++----------- net/ipv6/netfilter/ip6table_mangle.c | 23 ++++++++++++----------- net/ipv6/netfilter/ip6table_raw.c | 20 ++++++++++---------- net/ipv6/netfilter/ip6table_security.c | 23 ++++++++++++----------- 9 files changed, 105 insertions(+), 99 deletions(-) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 382345567a60..370b635e3523 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -58,25 +58,26 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - arptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); - if (IS_ERR(arpfilter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(arpfilter_ops)) return PTR_ERR(arpfilter_ops); - } ret = register_pernet_subsys(&arptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + arptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(arpfilter_ops); - return ret; + unregister_pernet_subsys(&arptable_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(arpfilter_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 0dea754a9120..672d7da1071d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -77,26 +77,27 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - iptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&iptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + iptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&iptable_filter_net_ops); + goto err_free; } return 0; +err_free: + kfree(filter_ops); + return ret; } static void __exit iptable_filter_fini(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 4d3b12492308..13d25d9a4610 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -111,25 +111,26 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - iptable_mangle_table_init); - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); - ret = PTR_ERR(mangle_ops); - return ret; - } + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); ret = register_pernet_subsys(&iptable_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&iptable_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6f7afec7954b..2745c22f4034 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -77,24 +77,24 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, - iptable_raw_table_init); - if (ret < 0) - return ret; - rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&iptable_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, + iptable_raw_table_init); if (ret < 0) { - xt_unregister_template(table); - kfree(rawtable_ops); - return ret; + unregister_pernet_subsys(&iptable_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 81175c20ccbe..491894511c54 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -65,25 +65,26 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret = xt_register_template(&security_table, - iptable_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + iptable_security_table_init); if (ret < 0) { - xt_unregister_template(&security_table); - kfree(sectbl_ops); - return ret; + unregister_pernet_subsys(&iptable_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index cf561919bde8..b074fc477676 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -76,25 +76,25 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret = xt_register_template(&packet_filter, - ip6table_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&ip6table_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(filter_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 1a758f2bc537..e6ee036a9b2c 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -104,25 +104,26 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - ip6table_mangle_table_init); - - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); + if (IS_ERR(mangle_ops)) return PTR_ERR(mangle_ops); - } ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&ip6table_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 923455921c1d..3b161ee875bc 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -75,24 +75,24 @@ static int __init ip6table_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, ip6table_raw_table_init); - if (ret < 0) - return ret; - /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, ip6table_raw_table_init); if (ret < 0) { - kfree(rawtable_ops); - xt_unregister_template(table); - return ret; + unregister_pernet_subsys(&ip6table_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index c44834d93fc7..4bd5d97b8ab6 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -64,25 +64,26 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret = xt_register_template(&security_table, - ip6table_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + ip6table_security_table_init); if (ret < 0) { - kfree(sectbl_ops); - xt_unregister_template(&security_table); - return ret; + unregister_pernet_subsys(&ip6table_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } From 27414ff1b287ea9a2a11675149ec28e05539f3cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 May 2026 11:19:22 +0200 Subject: [PATCH 080/377] netfilter: bridge: eb_tables: close module init race sashiko reports for unrelated patch: Does the core ebtables initialization in ebtables.c suffer from a similar race? Once nf_register_sockopt() completes, the sockopts are exposed globally. sockopt has to be registered last, just like in ip/ip6/arptables. Fixes: 5b53951cfc85 ("netfilter: ebtables: use net_generic infra") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3578ffbc14ae..b9f4daac09af 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2583,19 +2583,20 @@ static int __init ebtables_init(void) { int ret; - ret = xt_register_target(&ebt_standard_target); + ret = register_pernet_subsys(&ebt_net_ops); if (ret < 0) return ret; - ret = nf_register_sockopt(&ebt_sockopts); + + ret = xt_register_target(&ebt_standard_target); if (ret < 0) { - xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } - ret = register_pernet_subsys(&ebt_net_ops); + ret = nf_register_sockopt(&ebt_sockopts); if (ret < 0) { - nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } From dcb0f9aefdd604d36710fda53c25bd7cf4a3e37a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 May 2026 13:00:28 +0200 Subject: [PATCH 081/377] netfilter: nf_conntrack_expect: restore helper propagation via expectation A recent series to fix expectations broke helper propagation via expectation, this mechanism is used by the sip and h323 helper. This also propagates the conntrack helper to expected connections. I changed semantics of exp->helper which now tells us the actual helper that created the expectation. Add an explicit assign_helper field to expectations for this purpose and update helpers to use it. Restore this feature for userspace conntrack helper via ctnetlink nfqueue integration so it is again possible to attach a helper to an expectation, where it makes sense. This is not restored via ctnetlink expectation creation as there is no client for such feature. Use the expectation layer 4 protocol number for the helper lookup for consistency. Make sure the expectation using this helper propagation mechanism also go away when the helper is unregistered. Fixes: 9c42bc9db90a ("netfilter: nf_conntrack_expect: honor expectation helper field") Fixes: 917b61fa2042 ("netfilter: ctnetlink: ignore explicit helper on new expectations") Reported-by: Ilya Maximets Tested-by: Ilya Maximets Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 5 ++++- net/netfilter/nf_conntrack_broadcast.c | 1 + net/netfilter/nf_conntrack_core.c | 7 +++++-- net/netfilter/nf_conntrack_expect.c | 1 + net/netfilter/nf_conntrack_h323_main.c | 12 ++++++------ net/netfilter/nf_conntrack_helper.c | 5 +++++ net/netfilter/nf_conntrack_netlink.c | 18 ++++++++++++++++-- net/netfilter/nf_conntrack_sip.c | 2 +- 8 files changed, 39 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index e9a8350e7ccf..80f50fd0f7ad 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -45,9 +45,12 @@ struct nf_conntrack_expect { void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); - /* Helper to assign to new connection */ + /* Helper that created this expectation */ struct nf_conntrack_helper __rcu *helper; + /* Helper to assign to new connection */ + struct nf_conntrack_helper __rcu *assign_helper; + /* The conntrack of the master connection */ struct nf_conn *master; diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4f39bf7c843f..75e53fde6b29 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b08189226320..8ba5b22a1eef 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1811,14 +1811,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { + struct nf_conntrack_helper *assign_helper; + /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; - if (exp->helper) { + assign_helper = rcu_dereference(exp->assign_helper); + if (assign_helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) - rcu_assign_pointer(help->helper, exp->helper); + rcu_assign_pointer(help->helper, assign_helper); } #ifdef CONFIG_NF_CONNTRACK_MARK diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 24d0576d84b7..8e943efbdf0a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3f5c50455b71..b2fe6554b9cf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); + rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a715304a53d8..b594cd244fe1 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -400,6 +400,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); + if (this == me) + return true; + + this = rcu_dereference_protected(exp->assign_helper, + lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eda5fe4a75c8..d7209d124111 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2860,6 +2861,7 @@ static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { + struct nf_conntrack_helper *assign_helper = NULL; struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_expect *exp; @@ -2875,8 +2877,18 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + assign_helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + tuple.dst.protonum); + if (!assign_helper) + return -EOPNOTSUPP; + } + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - &tuple, &mask); + assign_helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3515,6 +3527,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { @@ -3568,6 +3581,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->zone = ct->zone; #endif rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, assign_helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; @@ -3623,7 +3637,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1eb55907d470..d24bfa9e8234 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1383,7 +1383,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); From d8ef54c83ad70b81735b506431affadd2f720aa1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 May 2026 23:57:55 +0200 Subject: [PATCH 082/377] netfilter: ctnetlink: check tuple and mask in expectations created via nfqueue Ensure the expectation tuple and mask attributes are present in netlink message, otherwise null-ptr-deref is possible. Fixes: bd0779370588 ("netfilter: nfnetlink_queue: allow to attach expectations to conntracks") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d7209d124111..befa7e83ee49 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2872,6 +2872,9 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK]) + return -EINVAL; + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) From eb6317739b1ea3ab28791e1f91b24781905fa815 Mon Sep 17 00:00:00 2001 From: Li Xiasong Date: Thu, 7 May 2026 22:04:22 +0800 Subject: [PATCH 083/377] netfilter: nf_conntrack_sip: get helper before allocating expectation process_register_request() allocates an expectation and then checks whether a conntrack helper is available. If helper lookup fails, the function returns early and the allocated expectation is left behind. Reorder the code to fetch and validate helper before calling nf_ct_expect_alloc(). This keeps the logic simpler and removes the leak path while preserving existing behavior. Fixes: e14575fa7529 ("netfilter: nf_conntrack: use rcu accessors where needed") Cc: stable@vger.kernel.org Signed-off-by: Li Xiasong Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d24bfa9e8234..e69941f1a101 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1366,6 +1366,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, goto store_cseq; } + helper = rcu_dereference(nfct_help(ct)->helper); + if (!helper) + return NF_DROP; + exp = nf_ct_expect_alloc(ct); if (!exp) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); @@ -1376,10 +1380,6 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (sip_direct_signalling) saddr = &ct->tuplehash[!dir].tuple.src.u3; - helper = rcu_dereference(nfct_help(ct)->helper); - if (!helper) - return NF_DROP; - nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; From 19f94b6fee75b3ef7fbc06f3745b9a771a8a19a4 Mon Sep 17 00:00:00 2001 From: Li Xiasong Date: Thu, 7 May 2026 22:04:23 +0800 Subject: [PATCH 084/377] netfilter: nft_ct: fix missing expect put in obj eval nft_ct_expect_obj_eval() allocates an expectation and may call nf_ct_expect_related(), but never drops its local reference. Add nf_ct_expect_put(exp) before return to balance allocation. Fixes: 857b46027d6f ("netfilter: nft_ct: add ct expectations support") Cc: stable@vger.kernel.org Signed-off-by: Li Xiasong Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 60ee8d932fcb..fa2cc556331c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1334,6 +1334,8 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; + + nf_ct_expect_put(exp); } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { From 1f91d0d5827512816789f74f4d72d16269bde1ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 14:16:59 -1000 Subject: [PATCH 085/377] sched_ext: Fix !CONFIG_EXT_SUB_SCHED build warnings W=1 with CONFIG_EXT_SUB_SCHED=n flags 'err_msg' uninitialized and 'err_free_lb_resched' unused. Initialize err_msg and gate the label. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 48b4834c7027..f4e2db8e56be 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5584,7 +5584,7 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { - const char *err_msg; + const char *err_msg = ""; s32 ret = 0; scoped_guard(raw_spinlock_irq, &scx_sched_lock) { @@ -6652,8 +6652,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: free_cpumask_var(sch->bypass_lb_resched_cpumask); +#endif err_free_lb_cpumask: free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: From 307abfac04a254c09c5705d816b33354acee97a0 Mon Sep 17 00:00:00 2001 From: Jianpeng Chang Date: Fri, 8 May 2026 09:56:36 +0900 Subject: [PATCH 086/377] kprobes: skip non-symbol addresses in kprobe_add_ksym_blacklist() When kprobe_add_area_blacklist() iterates through a section like .kprobes.text, the start address may not correspond to a named symbol. On ARM64 with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y (introduced by commit baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS")), the compiler flag -fpatchable-function-entry=4,2 inserts 2 NOPs before each function entry point for ftrace call_ops. These pre-function NOPs sit at the section base address, before the first named function symbol. The compiler emits a $x mapping symbol at offset 0x00 to mark the start of code, but find_kallsyms_symbol() ignores mapping symbols. Without CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS (e.g. defconfig), no pre-function NOPs are inserted, the first function starts at offset 0x00, and the bug does not trigger. This only affects modules that have a .kprobes.text section (i.e. those using the __kprobes annotation). Modules using NOKPROBE_SYMBOL() instead (like kretprobe_example.ko) blacklist exact function addresses via the _kprobe_blacklist section and are not affected. For kprobe_example.ko on ARM64 with -fpatchable-function-entry=4,2, the .kprobes.text section layout is: offset 0x00: $x + 2 NOPs (mapping symbol + ftrace preamble) offset 0x08: handler_post (64 bytes) offset 0x50: handler_pre (68 bytes) kprobe_add_area_blacklist() starts iterating from the section base address (offset 0x00), which only has the $x mapping symbol. kprobe_add_ksym_blacklist() then calls kallsyms_lookup_size_offset() for this address, which goes through: kallsyms_lookup_size_offset() -> module_address_lookup() -> find_kallsyms_symbol() find_kallsyms_symbol() scans all module symbols to find the closest preceding symbol. Since no named text symbol exists at offset 0x00, find_kallsyms_symbol() picks __UNIQUE_ID_vermagic (a .modinfo symbol whose address is in the temporary image) as the "best" match. The computed "size" = next_text_symbol - modinfo_symbol spans across these two unrelated memory regions, creating a blacklist entry with a bogus range of tens of terabytes. Whether this causes a visible failure depends on address randomization, here is what happens on Raspberry Pi 4/5: - On RPi5, the bogus size was ~35 TB. start + size stayed within 64-bit range, so the blacklist entry covered the entire kernel text. register_kprobe() in the module's own init function failed with -EINVAL. - On RPi4, the bogus size was ~75 TB. start + size overflowed 64 bits and wrapped to a small address near zero. The range check (addr >= start && addr < end) then failed because end wrapped around, so the bogus entry was accidentally harmless and kprobes worked by luck. The same bug exists on both machines, but randomization determines whether the integer overflow masks it or not. Fix this by adding notrace to the __kprobes macro. Functions in .kprobes.text are kprobe infrastructure handlers that should never be traced by ftrace. With notrace, the compiler stops inserting them and the non-symbol gap at the section start disappears entirely. Link: https://lore.kernel.org/all/20260506012706.2785785-1-jianpeng.chang.cn@windriver.com/ Fixes: baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS") Signed-off-by: Jianpeng Chang Signed-off-by: Masami Hiramatsu (Google) --- include/asm-generic/kprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h index 060eab094e5a..5290a2b2e15a 100644 --- a/include/asm-generic/kprobes.h +++ b/include/asm-generic/kprobes.h @@ -14,7 +14,7 @@ static unsigned long __used \ _kbl_addr_##fname = (unsigned long)fname; # define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) /* Use this to forbid a kprobes attach on very low level functions */ -# define __kprobes __section(".kprobes.text") +# define __kprobes notrace __section(".kprobes.text") # define nokprobe_inline __always_inline #else # define NOKPROBE_SYMBOL(fname) From ef5581bb30efb939cc2bf093475c6cc85258e5cd Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 8 May 2026 09:56:36 +0900 Subject: [PATCH 087/377] test_kprobes: clear kprobes between test runs Running the kprobes sanity tests twice makes all tests fail and eventually crashes the kernel. [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # Totals: pass:5 fail:0 skip:0 total:5 ok 1 kprobes_test [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # test_kprobe: EXPECTATION FAILED at lib/tests/test_kprobes.c:64 Expected 0 == register_kprobe(&kp), but register_kprobe(&kp) == -22 (0xffffffffffffffea) ... Unable to handle kernel paging request ... The testsuite defines several kprobes and kretprobes as static variables that are preserved across test runs. After register_kprobe and unregister_kprobe, a kprobe contains some leftover data that must be cleared before the kprobe can be registered again. The tests are setting symbol_name to define the probe location. Address and flags must be cleared. The existing code clears some of the probes between subsequent tests, but not between two test runs. The leftover data from a previous test run makes the registrations fail in the next run. Move the cleanups for all kprobes into kprobes_test_init, this function is called before each single test (including the first test of a test run). Link: https://lore.kernel.org/all/20260507134615.1010905-1-martin@kaiser.cx/ Fixes: e44e81c5b90f ("kprobes: convert tests to kunit") Signed-off-by: Martin Kaiser Signed-off-by: Masami Hiramatsu (Google) --- lib/tests/test_kprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/tests/test_kprobes.c b/lib/tests/test_kprobes.c index b7582010125c..06e729e4de05 100644 --- a/lib/tests/test_kprobes.c +++ b/lib/tests/test_kprobes.c @@ -12,6 +12,12 @@ #define div_factor 3 +#define KP_CLEAR(_kp) \ +do { \ + (_kp).addr = NULL; \ + (_kp).flags = 0; \ +} while (0) + static u32 rand1, preh_val, posth_val; static u32 (*target)(u32 value); static u32 (*recursed_target)(u32 value); @@ -125,10 +131,6 @@ static void test_kprobes(struct kunit *test) current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - kp.addr = NULL; - kp.flags = 0; - KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); preh_val = 0; posth_val = 0; @@ -226,9 +228,6 @@ static void test_kretprobes(struct kunit *test) struct kretprobe *rps[2] = {&rp, &rp2}; current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - rp.kp.addr = NULL; - rp.kp.flags = 0; KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); krph_val = 0; @@ -290,8 +289,6 @@ static void test_stacktrace_on_kretprobe(struct kunit *test) unsigned long myretaddr = (unsigned long)__builtin_return_address(0); current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; /* * Run the stacktrace_driver() to record correct return address in @@ -352,8 +349,6 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) struct kretprobe *rps[2] = {&rp3, &rp4}; current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); @@ -367,6 +362,18 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) static int kprobes_test_init(struct kunit *test) { + KP_CLEAR(kp); + KP_CLEAR(kp2); + KP_CLEAR(kp_missed); +#ifdef CONFIG_KRETPROBES + KP_CLEAR(rp.kp); + KP_CLEAR(rp2.kp); +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + KP_CLEAR(rp3.kp); + KP_CLEAR(rp4.kp); +#endif +#endif + target = kprobe_target; target2 = kprobe_target2; recursed_target = kprobe_recursed_target; From 41337097f2823e99478d7cbe68d4893582ed0b18 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 6 May 2026 21:21:52 +0800 Subject: [PATCH 088/377] riscv: cpufeature: Use pre-defined ISA ext macros to index isa2hwcap We have pre-defined ISA extension macros, here use those macros to replace a magic number for isa2hwcap definition and some array indexing for isa2hwcap access. This doesn't change the original functionality, just improve the code maintainability and readability. Signed-off-by: Hui Wang Link: https://patch.msgid.link/20260506132152.53239-1-hui.wang@canonical.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/cpufeature.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3dc4c0d31550..f46aa5602d74 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -1102,16 +1102,16 @@ early_param("riscv_isa_fallback", riscv_isa_fallback_setup); void __init riscv_fill_hwcap(void) { char print_str[NUM_ALPHA_EXTS + 1]; - unsigned long isa2hwcap[26] = {0}; + unsigned long isa2hwcap[RISCV_ISA_EXT_BASE] = {0}; int i, j; - isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; - isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; + isa2hwcap[RISCV_ISA_EXT_i] = COMPAT_HWCAP_ISA_I; + isa2hwcap[RISCV_ISA_EXT_m] = COMPAT_HWCAP_ISA_M; + isa2hwcap[RISCV_ISA_EXT_a] = COMPAT_HWCAP_ISA_A; + isa2hwcap[RISCV_ISA_EXT_f] = COMPAT_HWCAP_ISA_F; + isa2hwcap[RISCV_ISA_EXT_d] = COMPAT_HWCAP_ISA_D; + isa2hwcap[RISCV_ISA_EXT_c] = COMPAT_HWCAP_ISA_C; + isa2hwcap[RISCV_ISA_EXT_v] = COMPAT_HWCAP_ISA_V; if (!acpi_disabled) { riscv_fill_hwcap_from_isa_string(isa2hwcap); From f03e8583532941b07761c5429de7d50766fa3110 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Sun, 3 May 2026 12:28:58 +0800 Subject: [PATCH 089/377] batman-adv: stop caching unowned originator pointers in BAT IV BAT IV keeps the last-hop neighbor address in each neigh_node, but some paths also cache an originator pointer derived from a temporary lookup. That pointer is not owned by the neigh_node and may no longer refer to a live originator entry after purge handling runs. Stop storing the auxiliary originator pointer in the BAT IV neighbor state. When BAT IV needs the neighbor originator data, resolve it from the stored neighbor address and drop the reference again after use. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei [sven: avoid bonding logic for outgoing OGM] Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 83 ++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 618d1889c04e..74ef7dc2b2f9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -173,19 +173,12 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh) + struct batadv_orig_node *orig_node) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); - if (!neigh_node) - goto out; - - neigh_node->orig_node = orig_neigh; - -out: return neigh_node; } @@ -906,6 +899,31 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, return sum; } +/** + * batadv_iv_ogm_neigh_ifinfo_sum() - Get bcast_own sum for a last-hop neighbor + * @bat_priv: the bat priv with all the mesh interface information + * @neigh_node: last-hop neighbor of an originator + * + * Return: Number of replied (rebroadcasted) OGMs for the originator currently + * announced by the neighbor. Returns 0 if the neighbor's originator entry is + * not available anymore. + */ +static u8 batadv_iv_ogm_neigh_ifinfo_sum(struct batadv_priv *bat_priv, + const struct batadv_neigh_node *neigh_node) +{ + struct batadv_orig_node *orig_neigh; + u8 sum; + + orig_neigh = batadv_orig_hash_find(bat_priv, neigh_node->addr); + if (!orig_neigh) + return 0; + + sum = batadv_iv_orig_ifinfo_sum(orig_neigh, neigh_node->if_incoming); + batadv_orig_node_put(orig_neigh); + + return sum; +} + /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator @@ -975,17 +993,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, } if (!neigh_node) { - struct batadv_orig_node *orig_tmp; - - orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); - if (!orig_tmp) - goto unlock; - neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, - orig_node, orig_tmp); - - batadv_orig_node_put(orig_tmp); + orig_node); if (!neigh_node) goto unlock; } else { @@ -1037,10 +1047,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, - router->if_incoming); - sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, - neigh_node->if_incoming); + sum_orig = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, router); + sum_neigh = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, + neigh_node); if (sum_orig >= sum_neigh) goto out; } @@ -1106,7 +1115,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, - orig_neigh_node, orig_neigh_node); if (!neigh_node) @@ -1302,6 +1310,32 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, return ret; } +/** + * batadv_orig_to_direct_router() - get direct next hop neighbor to an orig address + * @bat_priv: the bat priv with all the mesh interface information + * @orig_addr: the originator MAC address to search the best next hop router for + * @if_outgoing: the interface where the OGM should be sent to + * + * Return: A neighbor node which is the best router towards the given originator + * address. Bonding candidates are ignored. + */ +static struct batadv_neigh_node * +batadv_orig_to_direct_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; + + orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + if (!orig_node) + return NULL; + + neigh_node = batadv_orig_router_get(orig_node, if_outgoing); + batadv_orig_node_put(orig_node); + + return neigh_node; +} + /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface @@ -1372,8 +1406,9 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { - router_router = batadv_orig_router_get(router->orig_node, - if_outgoing); + router_router = batadv_orig_to_direct_router(bat_priv, + router->addr, + if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } From ce425dd05d0fe7594930a0fb103634f35ac47bb6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:49 +0200 Subject: [PATCH 090/377] batman-adv: tp_meter: fix tp_num leak on kmalloc failure When batadv_tp_start() or batadv_tp_init_recv() fail to allocate a new tp_vars object, the previously incremented bat_priv->tp_num counter is never decremented. This causes tp_num to drift upward on each allocation failure. Since only BATADV_TP_MAX_NUM sessions can be started and the count is never reduced for these failed allocations, it causes to an exhaustion of throughput meter sessions. In worst case, no new throughput meter session can be started until the mesh interface is removed. The error handling must decrement tp_num releasing the lock and aborting the creation of an throughput meter session Cc: stable@kernel.org Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Signed-off-by: Sven Eckelmann --- net/batman-adv/tp_meter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 58ca59a2799e..066c76113fc4 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -994,6 +994,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s cannot allocate list elements\n", @@ -1366,8 +1367,10 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); - if (!tp_vars) + if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); goto out_unlock; + } ether_addr_copy(tp_vars->other_end, icmp->orig); tp_vars->role = BATADV_TP_RECEIVER; From 4ae1709a314060a196981b344610d023ea841e57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:50 +0200 Subject: [PATCH 091/377] batman-adv: bla: prevent use-after-free when deleting claims When batadv_bla_del_backbone_claims() removes all claims for a backbone, it does this by dropping the link entry in the hash list. This list entry itself was one of the references which need to be dropped at the same time via batadv_claim_put(). But the batadv_claim_put() must not be done before the last access to the claim object in this function. Otherwise the claim might be freed already by the batadv_claim_release() function before the list entry was dropped. Cc: stable@kernel.org Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 51fe028b9088..8b77dd2ecfa4 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -318,8 +318,8 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); + batadv_claim_put(claim); } spin_unlock_bh(list_lock); } From cf6b604011591865ae39ac82de8978c1120d17af Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:51 +0200 Subject: [PATCH 092/377] batman-adv: bla: only purge non-released claims When batadv_bla_purge_claims() goes through the list of claims, it is only traversing the hash list with an rcu_read_lock(). Due to a potential parallel batadv_claim_put(), it can happen that it encounters a claim which was actually in the process of being released+freed by batadv_claim_release(). In this case, backbone_gw is set to NULL before the delayed RCU kfree is started. Calling batadv_bla_claim_get_backbone_gw() is then no longer allowed because it would cause a NULL-ptr derefence. To avoid this, only claims with a valid reference counter must be purged. All others are already taken care of. Cc: stable@kernel.org Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 8b77dd2ecfa4..879ab043d57a 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1288,6 +1288,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + /* only purge claims not currently in the process of being released. + * Such claims could otherwise have a NULL-ptr backbone_gw set because + * they already went through batadv_claim_release() + */ + if (!kref_get_unless_zero(&claim->refcount)) + continue; + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; @@ -1313,6 +1320,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); + batadv_claim_put(claim); } rcu_read_unlock(); } From ba9d20ee9076dac32c371116bacbe72480eb356c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:52 +0200 Subject: [PATCH 093/377] batman-adv: bla: put backbone reference on failed claim hash insert When batadv_bla_add_claim() fails to insert a new claim into the hash, it leaked a reference to the backbone_gw for which the claim was intended. Call batadv_backbone_gw_put() on the error path to release the reference and avoid leaking the backbone_gw object. Cc: stable@kernel.org Fixes: 3db0decf1185 ("batman-adv: Fix non-atomic bla_claim::backbone_gw access") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 879ab043d57a..cec11f1251d6 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -723,6 +723,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* only local changes happened. */ + batadv_backbone_gw_put(backbone_gw); kfree(claim); return; } From 786a45757dcdf8f2beb9d4a6db605db16c18b2b4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 28 Apr 2026 21:59:52 +0100 Subject: [PATCH 094/377] x86/kexec: Push kjump return address even for non-kjump kexec The version of purgatory code shipped by kexec-tools attempts to look above the top of its stack to find a return address for a kjump, even in a non-kjump kexec. After the commit in Fixes: the word above the stack might not be there, leading to a fault (which is at least now caught by my exception-handling code in kexec). That commit fixed things for the actual kjump path, but no longer "gratuitously" pushes the unused return address to the stack in the non-kjump path. Put that *back* in the non-kjump path, to prevent purgatory from crashing when trying to access it. Fixes: 2cacf7f23a02 ("x86/kexec: Fix stack and handling of re-entry point for ::preserve_context") Reported-by: Rohan Kakulawaram Signed-off-by: David Woodhouse Signed-off-by: Borislav Petkov (AMD) Acked-by: Dave Hansen Tested-by: Rohan Kakulawaram Cc: Link: https://patch.msgid.link/32d627134143ffd957891cb697138e839c623211.camel@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4ffba68dc57b..eaeb77464c06 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -136,6 +136,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * %r13 original CR4 when relocate_kernel() was invoked */ + /* + * Set return address to 0 if not preserving context. The purgatory + * shipped in kexec-tools will unconditionally look for the return + * address on the stack and set a kexec_jump_back_entry= command + * line option if it's non-zero. There's no other way that it can + * tell a preserve-context (kjump) kexec from a normal one. + */ + pushq $0 /* store the start address on the stack */ pushq %rdx From 411c1cf430392c905e39f12bc305dd994da0b426 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 May 2026 15:20:23 +0100 Subject: [PATCH 095/377] arm64/entry: Fix arm64-specific rseq brokenness Mathias Stearn reports that since v6.19, there are two big issues affecting rseq: (1) On arm64 specifically, rseq critical sections aren't aborted when they should be. (2) The 'cpu_id_start' field is no longer written by the kernel in all cases it used to be, including some cases where TCMalloc depends on the kernel clobbering the field. This patch fixes issue #1. This patch DOES NOT fix issue #2, which will need to be addressed by other patches. The arm64-specific brokenness is a result of commits: 2fc0e4b4126c ("rseq: Record interrupt from user space") 39a167560a61 ("rseq: Optimize event setting") The first commit failed to add a call to rseq_note_user_irq_entry() on arm64. Thus arm64 never sets rseq_event::user_irq to record that it may be necessary to abort an active rseq critical section upon return to userspace. On its own, this commit had no functional impact as the value of rseq_event::user_irq was not consumed. The second commit relied upon rseq_event::user_irq to determine whether or not to bother to perform rseq work when returning to userspace. As rseq_event::user_irq wasn't set on arm64, this work would be skipped, and consequently an active rseq critical section would not be aborted. Fix this by giving arm64 syscall-specific entry/exit paths, and performing the relevant logic in syscall and non-syscall paths, including calling rseq_note_user_irq_entry() for non-syscall entry. Currently arm64 cannot use syscall_enter_from_user_mode(), syscall_exit_to_user_mode(), and irqentry_exit_to_user_mode(), due to ordering constraints with exception masking, and risk of ABI breakage for syscall tracing/audit/etc. For the moment the entry/exit logic is left as arm64-specific, directly using enter_from_user_mode() and exit_to_user_mode(), but mirroring the generic code. I intend to follow up with refactoring/cleanup, as we did for kernel mode entry paths in commit: 041aa7a85390 ("entry: Split preemption from irqentry_exit_to_kernel_mode()") ... which will allow arm64 to use the GENERIC_IRQ_ENTRY functions directly. Fixes: 39a167560a61 ("rseq: Optimize event setting") Reported-by: Mathias Stearn Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lore.kernel.org/regressions/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com/ Link: https://patch.msgid.link/20260508142023.3268622-1-mark.rutland@arm.com --- arch/arm64/kernel/entry-common.c | 31 ++++++++++++++++++++++++------- include/linux/irq-entry-common.h | 8 -------- include/linux/rseq_entry.h | 19 ------------------- 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index cb54335465f6..c7a23f7c2212 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -62,6 +62,13 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_exit_to_kernel_mode_after_preempt(regs, state); } +static __always_inline void arm64_syscall_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + mte_disable_tco_entry(current); + sme_enter_from_user_mode(); +} + /* * Handle IRQ/context state management when entering from user mode. * Before this function is called it is not safe to call regular kernel code, @@ -70,20 +77,30 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); mte_disable_tco_entry(current); sme_enter_from_user_mode(); } +static __always_inline void arm64_syscall_exit_to_user_mode(struct pt_regs *regs) +{ + local_irq_disable(); + syscall_exit_to_user_mode_prepare(regs); + local_daif_mask(); + sme_exit_to_user_mode(); + mte_check_tfsr_exit(); + exit_to_user_mode(); +} + /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ - static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare_legacy(regs); + irqentry_exit_to_user_mode_prepare(regs); local_daif_mask(); sme_exit_to_user_mode(); mte_check_tfsr_exit(); @@ -92,7 +109,7 @@ static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } /* @@ -716,12 +733,12 @@ static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); fpsimd_syscall_exit(); } @@ -868,11 +885,11 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc_compat(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf04..1fabf0f5ea8e 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 2d0295df5107..63bc72086e75 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -749,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -782,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ From ab28a0673daabe7f0fcbd7a5e36334f2f003f02f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 8 May 2026 19:50:45 +0800 Subject: [PATCH 096/377] sched_ext: Use IRQ_WORK_INIT_HARD() to initialize sch->disable_irq_work For built with PREEMPT_RT kernels, the scx_disable_irq_workfn() is called from per-cpu irq_work kthreads context, this means that when call the scx_dump_state() in the scx_disable_irq_workfn() to output current->comm/pid, it always output current irq_work kthread's comm/pid. this commit therefore use the IRQ_WORK_INIT_HARD() to initialize sch->disable_irq_work to make scx_disable_irq_workfn() is called from hardirq context. Fixes: f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work") Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f4e2db8e56be..df305712a2d4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6589,7 +6589,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); + sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); From b2ed01e7ad3de80333e9b962a44024b094bc0b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 28 Apr 2026 11:44:42 +0200 Subject: [PATCH 097/377] drm/ttm: Fix ttm_bo_swapout() infinite LRU walk on swapout failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ttm_tt_swapout() fails, the current code calls ttm_resource_add_bulk_move() followed by ttm_resource_move_to_lru_tail() to restore the resource's bulk_move membership. However, ttm_resource_move_to_lru_tail() places the resource at the tail of the LRU list which, relative to the walk cursor's hitch node (placed immediately after the resource when it was yielded), puts the resource *in front of the* the hitch. The next list_for_each_entry_continue() from the hitch finds the same resource again, causing an infinite loop. Fix by deferring del_bulk_move to the success path only. On the success path, TTM_TT_FLAG_SWAPPED has just been set by ttm_tt_swapout() but the resource is still tracked in the bulk_move range, so ttm_resource_del_bulk_move()'s !ttm_resource_unevictable() guard would incorrectly skip the removal. Introduce ttm_resource_del_bulk_move_unevictable() which bypasses that guard. Reported-by: Jatin Kataria Fixes: fc5d96670eb2 ("drm/ttm: Move swapped objects off the manager's LRU list") Cc: Christian König Cc: Matthew Brost Cc: Cc: # v6.13+ Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström Reviewed-by: Christian König Tested-by: Boqun Feng Link: https://patch.msgid.link/20260428094442.16985-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 16 ++++++---------- drivers/gpu/drm/ttm/ttm_resource.c | 13 +++++++++++++ include/drm/ttm/ttm_resource.h | 2 ++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index d85f0a37ac35..293401705542 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1177,17 +1177,13 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo) bdev->funcs->swap_notify(bo); if (ttm_tt_is_populated(tt)) { - spin_lock(&bdev->lru_lock); - ttm_resource_del_bulk_move(bo->resource, bo); - spin_unlock(&bdev->lru_lock); - ret = ttm_tt_swapout(bdev, tt, swapout_walk->gfp_flags); - - spin_lock(&bdev->lru_lock); - if (ret) - ttm_resource_add_bulk_move(bo->resource, bo); - ttm_resource_move_to_lru_tail(bo->resource); - spin_unlock(&bdev->lru_lock); + if (!ret) { + spin_lock(&bdev->lru_lock); + ttm_resource_del_bulk_move_unevictable(bo->resource, bo); + ttm_resource_move_to_lru_tail(bo->resource); + spin_unlock(&bdev->lru_lock); + } } out: diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 9f36631d48b6..0e5f1582f13d 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -292,6 +292,19 @@ void ttm_resource_del_bulk_move(struct ttm_resource *res, ttm_lru_bulk_move_del(bo->bulk_move, res); } +/* + * Remove a resource from its bulk_move, bypassing the unevictable check. + * Use only when the resource is known to still be tracked in the range despite + * the BO having just become unevictable; asserts that this is the case. + */ +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo) +{ + WARN_ON_ONCE(!ttm_resource_unevictable(res, bo)); + if (bo->bulk_move) + ttm_lru_bulk_move_del(bo->bulk_move, res); +} + /* Move a resource to the LRU or bulk tail */ void ttm_resource_move_to_lru_tail(struct ttm_resource *res) { diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index 33e80f30b8b8..a5d386583fb6 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -448,6 +448,8 @@ void ttm_resource_add_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); void ttm_resource_del_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo); void ttm_resource_move_to_lru_tail(struct ttm_resource *res); void ttm_resource_init(struct ttm_buffer_object *bo, From a7488f089bdfa87c4fef1744d4dca9f4f8b46f8b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 May 2026 04:04:46 -0700 Subject: [PATCH 098/377] workqueue: Release PENDING in __queue_work() drain/destroy reject path The caller of __queue_work() owns WORK_STRUCT_PENDING, won via test_and_set_bit() in queue_work_on()/__queue_delayed_work(). The state machine documented above __queue_work() requires that owner to either hand the token to a pwq (insert_work() -> set_work_pwq()), hand it to a timer, or release it via set_work_pool_and_clear_pending(). try_to_grab_pending() relies on this: when it observes "PENDING && off-queue" it busy-loops, trusting the current owner to make progress. The (__WQ_DESTROYING | __WQ_DRAINING) early-return path violates that contract. It WARN_ONCE()s and bare-returns, leaving work->data with PENDING set, WORK_STRUCT_PWQ clear, and work->entry empty. The path is reachable without explicit API abuse: queue_delayed_work() arms a timer with PENDING set; if drain_workqueue() runs while the timer is still pending, delayed_work_timer_fn() -> __queue_work() in softirq context hits the WARN, current is not a wq worker so is_chained_work() is false, and the work is silently dropped with PENDING leaked. Mirror what clear_pending_if_disabled() already does on its analogous reject path: unpack the off-queue data and call set_work_pool_and_clear_pending() to release the token before returning. I was able to reproduce this by queueing several slow works on a max_active=1 wq, arm a delayed_work whose timer fires while drain_workqueue() is blocked, then call cancel_delayed_work_sync(). Without this patch the cancel livelocks at 100% CPU; with it the cancel returns immediately. Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3d2e3b2ec528..c0e58f877e08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2296,6 +2296,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { + struct work_offq_data offqd; + + /* + * State on entry: PENDING is set, work is off-queue (no + * insert_work() has run). + * + * Returning without clearing PENDING would leave the work + * in a weird state (PENDING=1, PWQ=0, entry empty) + */ + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); return; } rcu_read_lock(); From 0143033dc22cdff912cfc13419f5db92fea3b4cb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 8 May 2026 09:22:03 -0700 Subject: [PATCH 099/377] workqueue: Fix wq->cpu_pwq leak in alloc_and_link_pwqs() WQ_UNBOUND path For WQ_UNBOUND workqueues, alloc_and_link_pwqs() allocates wq->cpu_pwq via alloc_percpu() and then calls apply_workqueue_attrs_locked(). On failure it returns the error directly, bypassing the enomem: label which holds the only free_percpu(wq->cpu_pwq) in this function. The caller's error path kfree()s wq without touching wq->cpu_pwq, leaking one percpu pointer table (nr_cpu_ids * sizeof(void *) bytes) per failed call. If kmemleak is enabled, we can see: unreferenced object (percpu) 0xc0fffa5b121048 (size 8): comm "insmod", pid 776, jiffies 4294682844 backtrace (crc 0): pcpu_alloc_noprof+0x665/0xac0 __alloc_workqueue+0x33f/0xa20 alloc_workqueue_noprof+0x60/0x100 Route the error through the existing enomem: cleanup and any error before this one. Cc: stable@kernel.org Fixes: 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0e58f877e08..33b721a9af02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5654,7 +5654,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - return ret; + if (ret) + goto enomem; + return 0; enomem: if (wq->cpu_pwq) { From db5dadb562cabb6da49959b473ed0d9645b6f2da Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 4 May 2026 18:01:37 -0500 Subject: [PATCH 100/377] Revert "ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn" Some older systems don't support CPPC in the firmware and this just makes noise for them when booting. Drop back to debug. This reverts commit 21fb59ab4b9767085f4fe1edbdbe3177fbb9ec97. Fixes: 21fb59ab4b976 ("ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn") Suggested-by: Kim Phillips Signed-off-by: Mario Limonciello Tested-by: Kim Phillips Cc: All applicable Link: https://patch.msgid.link/20260504230141.484743-2-mario.limonciello@amd.com Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/cppc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index d7c8ef1e354d..be4c5e9e5ff6 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -88,19 +88,19 @@ static void amd_set_max_freq_ratio(void) rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_warn("Could not retrieve perf counters (%d)\n", rc); + pr_debug("Could not retrieve perf counters (%d)\n", rc); return; } rc = amd_get_boost_ratio_numerator(0, &numerator); if (rc) { - pr_warn("Could not retrieve highest performance (%d)\n", rc); + pr_debug("Could not retrieve highest performance (%d)\n", rc); return; } nominal_perf = perf_caps.nominal_perf; if (!nominal_perf) { - pr_warn("Could not retrieve nominal performance\n"); + pr_debug("Could not retrieve nominal performance\n"); return; } From 97c8a3c1f73d828de43a5a88e8a9a143efb2b661 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 03:59:18 +0000 Subject: [PATCH 101/377] tcp: Fix potential UAF in reqsk_timer_handler(). When TCP socket migration fails at inet_ehash_insert() in reqsk_timer_handler(), we jump to the no_ownership: label and free the new reqsk immediately with __reqsk_free(). Thus, we must stop the new reqsk's timer before jumping to the label, but the timer might be missed since the cited commit, resulting in UAF. As we are in the original reqsk's timer context, we can safely call timer_delete_sync() for the new reqsk. Let's pass false to __inet_csk_reqsk_queue_drop() to stop the new reqsk's timer. Fixes: 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506035954.1563147-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 928654c34156..971f9db2c586 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1108,7 +1108,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, false); goto no_ownership; } From 7eca3292cac7c26dad4c236f51ba225c39a0523f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 03:59:19 +0000 Subject: [PATCH 102/377] tcp: Fix imbalanced icsk_accept_queue count. When TCP socket migration happens in reqsk_timer_handler(), @sk_listener will be updated with the new listener. When we call __inet_csk_reqsk_queue_drop(), the listener must be the one stored in req->rsk_listener. The cited commit accidentally replaced oreq->rsk_listener with sk_listener, leading to imbalanced icsk_accept_queue count. Let's pass the correct listener to __inet_csk_reqsk_queue_drop(). Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506035954.1563147-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 971f9db2c586..dbcd37dfdc15 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1134,7 +1134,7 @@ static void reqsk_timer_handler(struct timer_list *t) } drop: - __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + __inet_csk_reqsk_queue_drop(oreq->rsk_listener, oreq, true); reqsk_put(oreq); } From e539acf9f9c2550452914fb85aeb8fda67dd762f Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Apr 2026 11:23:17 +0100 Subject: [PATCH 103/377] MAINTAINERS: Add self for the 3c509 network driver It appears there's a need for a maintainer for the 3Com EtherLink III family of Ethernet network adapters. There is documentation available and the driver is very mature so the task ought to be of little hassle, so I think I should be able to squeeze in any issues to be addressed. Signed-off-by: Maciej W. Rozycki Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/alpine.DEB.2.21.2604271056460.28583@angie.orcam.me.uk Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0650fa014f24..1a485549ee96 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -68,6 +68,12 @@ Maintainers List first. When adding to this list, please keep the entries in alphabetical order. +3C509 NETWORK DRIVER +M: "Maciej W. Rozycki" +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/3com/3c509.c + 3C59X NETWORK DRIVER M: Steffen Klassert L: netdev@vger.kernel.org From 7ce5556f255a680d80daa31b1cedecf7f89e2c22 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:15 +0800 Subject: [PATCH 104/377] ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern mem_check() in net/ipv6/ip6_flowlabel.c reads fl_size without holding ip6_fl_lock. fl_intern() takes the lock immediately afterwards. The two checks therefore race against concurrent fl_intern, ip6_fl_gc and ip6_fl_purge writers, which makes the mem_check budget check approximate. Move spin_lock_bh(&ip6_fl_lock) and the matching unlock from fl_intern() into its only caller ipv6_flowlabel_get(). The mem_check() call now runs under the same critical section as the fl_intern() insert, so the budget check is exact. With all writers and the read of fl_size under ip6_fl_lock, convert fl_size from atomic_t to plain int. The four sites that update or read fl_size are fl_intern (insert path), ip6_fl_gc (garbage collector, the !sched check and the per-entry decrement), ip6_fl_purge (per-netns purge), and mem_check (budget check), and all four now run under ip6_fl_lock. This is a prerequisite for adding a per-netns budget alongside fl_size. The follow-up patch adds netns_ipv6::flowlabel_count and folds it into mem_check(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-2-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_flowlabel.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c92f98c6f6ec..a8974643195a 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -40,7 +40,7 @@ #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) -static atomic_t fl_size = ATOMIC_INIT(0); +static int fl_size; static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); @@ -163,7 +163,7 @@ static void ip6_fl_gc(struct timer_list *unused) if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } if (!sched || time_before(ttd, sched)) @@ -172,7 +172,7 @@ static void ip6_fl_gc(struct timer_list *unused) flp = &fl->next; } } - if (!sched && atomic_read(&fl_size)) + if (!sched && fl_size) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); @@ -196,7 +196,7 @@ static void __net_exit ip6_fl_purge(struct net *net) atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } flp = &fl->next; @@ -210,10 +210,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, { struct ip6_flowlabel *lfl; + lockdep_assert_held(&ip6_fl_lock); + fl->label = label & IPV6_FLOWLABEL_MASK; - rcu_read_lock(); - spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; @@ -235,8 +235,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); return lfl; } } @@ -244,9 +242,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); - atomic_inc(&fl_size); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); + fl_size++; return NULL; } @@ -464,10 +460,14 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { - int room = FL_MAX_SIZE - atomic_read(&fl_size); + int room; struct ipv6_fl_socklist *sfl; int count = 0; + lockdep_assert_held(&ip6_fl_lock); + + room = FL_MAX_SIZE - fl_size; + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; @@ -692,11 +692,19 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (!sfl1) goto done; + rcu_read_lock(); + spin_lock_bh(&ip6_fl_lock); err = mem_check(sk); + if (err == 0) + fl1 = fl_intern(net, fl, freq->flr_label); + else + fl1 = NULL; + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); + if (err != 0) goto done; - fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; From e68eadffb724b36ffd3d5619e0efcaf29ec2a175 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:16 +0800 Subject: [PATCH 105/377] ipv6: flowlabel: enforce per-netns limit for unprivileged callers fl_size, fl_ht and ip6_fl_lock in net/ipv6/ip6_flowlabel.c are file scope and shared across netns. mem_check() reads fl_size to decide whether to deny non-CAP_NET_ADMIN callers. capable() runs against init_user_ns, so an unprivileged user in any non-init userns can push fl_size past FL_MAX_SIZE - FL_MAX_SIZE / 4 and starve every other unprivileged userns on the host. Add struct netns_ipv6::flowlabel_count, bumped and decremented next to fl_size in fl_intern, ip6_fl_gc and ip6_fl_purge. The new field fills the existing 4-byte hole after ipmr_seq, so struct netns_ipv6 stays the same size on 64-bit builds. Bump FL_MAX_SIZE from 4096 to 8192. It has been 4096 since the file was added. Machines and connection counts have grown. mem_check() folds an extra per-netns ceiling into the existing non-CAP_NET_ADMIN conditional. The ceiling is half of the total budget that unprivileged callers have ever been able to use, i.e. (FL_MAX_SIZE - FL_MAX_SIZE / 4) / 2 = 3072 entries. With FL_MAX_SIZE doubled, this preserves the original per-user reach of 3K (what an unprivileged caller could already obtain before this change), while forcing an attacker to spread allocations across at least two netns to exhaust the global non-CAP_NET_ADMIN budget. CAP_NET_ADMIN against init_user_ns still bypasses both caps. The previous patch took ip6_fl_lock across mem_check and fl_intern, so the new flowlabel_count read in mem_check and the new flowlabel_count++ in fl_intern run under the same critical section. flowlabel_count is therefore plain int, like fl_size. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-3-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 499e4288170f..875916d60bfe 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -119,6 +119,7 @@ struct netns_ipv6 { struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; atomic_t ipmr_seq; + int flowlabel_count; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a8974643195a..b1ccdf0dc646 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,7 +36,7 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; - fl_free(fl); fl_size--; + fl->fl_net->ipv6.flowlabel_count--; + fl_free(fl); continue; } if (!sched || time_before(ttd, sched)) @@ -197,6 +198,7 @@ static void __net_exit ip6_fl_purge(struct net *net) *flp = fl->next; fl_free(fl); fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -243,6 +245,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -460,6 +463,9 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); int room; struct ipv6_fl_socklist *sfl; int count = 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; From 58e2330bd45572a6e3d46ea94cf7a9641f43591a Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 6 May 2026 09:08:08 +0000 Subject: [PATCH 106/377] net: napi: Avoid gro timer misfiring at end of busypoll When in irq deferral mode (defer-hard-irqs > 0), a short enough gro-flush timeout can trigger before NAPI_STATE_SCHED is cleared if the last poll in busy_poll_stop() takes too long. This can have the effect of leaving the queue stuck with interrupts disabled and no timer armed which results in a tx timeout if there is no subsequent busypoll cycle. To prevent this, defer the gro-flush timer arm after the last poll. Fixes: 7fd3253a7de6 ("net: Introduce preferred busy-polling") Co-developed-by: Martin Karsten Signed-off-by: Martin Karsten Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260506090808.820559-2-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 8bfa8313ef62..0c6c270d9f7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void) #if defined(CONFIG_NET_RX_BUSY_POLL) -static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout) { - if (!skip_schedule) { + if (!timeout) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; @@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } enum { @@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - bool skip_schedule = false; - unsigned long timeout; + unsigned long timeout = 0; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = napi_get_gro_flush_timeout(napi); - if (napi->defer_hard_irqs_count && timeout) { - hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - skip_schedule = true; + if (napi->defer_hard_irqs_count) { + /* A short enough gro flush timeout and long enough + * poll can result in timer firing too early. + * Timer will be armed later if necessary. + */ + timeout = napi_get_gro_flush_timeout(napi); } } @@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) - __busy_poll_stop(napi, skip_schedule); + __busy_poll_stop(napi, timeout); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } From 0a549298f452a83ae57e6582e6ca389357f9355d Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 7 May 2026 14:04:44 +0200 Subject: [PATCH 107/377] MAINTAINERS: change maintainers for macb Ethernet driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I would like to hand over the macb maintenance to Théo, as I'm unable to keep up with the recent flow of patches for this driver. After speaking with Claudiu, he indicated that he is in the same position as me. To help with this work, Conor has agreed to act as a reviewer. I was given responsibility for this driver years ago, and I'm glad to see it continue with talented developers. Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Conor Dooley Link: https://patch.msgid.link/20260507120444.9733-1-nicolas.ferre@microchip.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1a485549ee96..8a134cf6e9bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4187,8 +4187,8 @@ F: include/uapi/linux/sonet.h F: net/atm/ ATMEL MACB ETHERNET DRIVER -M: Nicolas Ferre -M: Claudiu Beznea +M: Théo Lebrun +R: Conor Dooley S: Maintained F: drivers/net/ethernet/cadence/ From 4908f1395fb1b832ceec11584af649874a2732ea Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Thu, 7 May 2026 21:17:38 +0800 Subject: [PATCH 108/377] net: ethtool: fix NULL pointer dereference in phy_reply_size In phy_prepare_data(), several strings such as 'name', 'drvname', 'upstream_sfp_name', and 'downstream_sfp_name' are allocated using kstrdup(). However, these allocations were not checked for failure. If kstrdup() fails for 'name', it returns NULL while the function continues. This leads to a kernel NULL pointer dereference and panic later in phy_reply_size() when it unconditionally calls strlen() on the NULL pointer. While other strings like 'upstream_sfp_name' might be checked before access in certain code paths, failing to handle these allocations consistently can lead to incomplete data reporting or hidden bugs. Fix this by adding proper NULL checks for all kstrdup() calls in phy_prepare_data() and implement a centralized error handling path using goto labels to ensure all previously allocated resources are freed on failure. Fixes: 9dd2ad5e92b9 ("net: ethtool: phy: Convert the PHY_GET command to generic phy dump") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260507131738.1173835-1-2022090917019@std.uestc.edu.cn Signed-off-by: Jakub Kicinski --- net/ethtool/phy.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index d4e6887055ab..f76d94d848d6 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -76,6 +76,7 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, struct nlattr **tb = info->attrs; struct phy_device_node *pdn; struct phy_device *phydev; + int ret; /* RTNL is held by the caller */ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, @@ -88,8 +89,17 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, return -EOPNOTSUPP; rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); + if (!rep_data->name) + return -ENOMEM; + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } + rep_data->upstream_type = pdn->upstream_type; if (pdn->upstream_type == PHY_UPSTREAM_PHY) { @@ -97,15 +107,33 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, rep_data->upstream_index = upstream->phyindex; } - if (pdn->parent_sfp_bus) + if (pdn->parent_sfp_bus) { rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), GFP_KERNEL); + if (!rep_data->upstream_sfp_name) { + ret = -ENOMEM; + goto err_free_drvname; + } + } - if (phydev->sfp_bus) + if (phydev->sfp_bus) { rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), GFP_KERNEL); + if (!rep_data->downstream_sfp_name) { + ret = -ENOMEM; + goto err_free_upstream_sfp; + } + } return 0; + +err_free_upstream_sfp: + kfree(rep_data->upstream_sfp_name); +err_free_drvname: + kfree(rep_data->drvname); +err_free_name: + kfree(rep_data->name); + return ret; } static int phy_fill_reply(struct sk_buff *skb, From f2ab4fd02777c4081be38c35f939e4dc529b8952 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 7 May 2026 14:04:26 +0200 Subject: [PATCH 109/377] net: nsh: fix incorrect header length macros NSH header length is a 6-bit field that encodes the total length of the header in 4-byte words. So the maximum length is 0b111111 * 4, which is 252 and not 256. The maximum context length is the same number minus the length of the base header (8), so 244. These macros are used to validate push_nsh() action in openvswitch. Miscalculation here doesn't cause any real issues. In the worst case the oversized context is truncated while building the header, so we'll construct and send a broken packet, which is not a big problem, as any receiver should validate the fields. No invalid memory accesses will happen during the header push. But we should fix the macros to reject the incorrect actions in the first place. Using previously defined values and calculating the length instead of defining numbers directly, so it's easier to understand where they come from and harder to make a mistake. Fixes: 1f0b7744c505 ("net: add NSH header structures and helpers") Signed-off-by: Ilya Maximets Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20260507120434.2962505-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/nsh.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/nsh.h b/include/net/nsh.h index 16a751093896..15a26c590815 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -247,10 +247,10 @@ struct nshhdr { #define NSH_M_TYPE1_LEN 24 /* NSH header maximum Length. */ -#define NSH_HDR_MAX_LEN 256 +#define NSH_HDR_MAX_LEN ((NSH_LEN_MASK >> NSH_LEN_SHIFT) * 4) /* NSH context headers maximum Length. */ -#define NSH_CTX_HDRS_MAX_LEN 248 +#define NSH_CTX_HDRS_MAX_LEN (NSH_HDR_MAX_LEN - NSH_BASE_HDR_LEN) static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) { From efda25ee84325385f859d10872590e90ce837243 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 6 May 2026 20:07:13 +0000 Subject: [PATCH 110/377] genetlink: free the skb on 'group >= family->n_mcgrps' These methods generally consume ownership of the provided skb, so even if an error path is encountered, the skb is freed. This is because the very first thing they do after some initial setup is to unconditionally consume the skb via consume_skb(skb). Any subsequent errors lead to the core netlink layer freeing the skb. However, there is one check that occurs before ownership is passed, which is the check for the group index. So if this error condition is encountered, then the skb is leaked. This error condition is generally considered a violation of the netlink API, so it's not expected to occur under normal circumstances. For the same reason, no callers check for this error condition, and no callers need to be adjusted. However, we should still follow the same ownership semantics of the rest of the function. Thus, free the skb in this codepath. Suggested-by: Andrew Lunn Suggested-by: Matthew Maurer Fixes: 2a94fe48f32c ("genetlink: make multicast groups const, prevent abuse") Link: https://lore.kernel.org/r/845b36ba-7b3a-41f2-acb2-b284f253e2ca@lunn.ch Signed-off-by: Alice Ryhl Link: https://patch.msgid.link/20260506-genlmsg-return-v2-1-a63ee2a055d6@google.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 4 +++- net/netlink/genetlink.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 7b84f2cef8b1..d70510ac31ab 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -489,8 +489,10 @@ genlmsg_multicast_netns_filtered(const struct genl_family *family, netlink_filter_fn filter, void *filter_data) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + nlmsg_free(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d251d894afd4..0da39eaed255 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); @@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return; + } group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, From a77d5a069d959dc45f5f472d48cba37d8cba0f1c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 6 May 2026 16:37:45 -0700 Subject: [PATCH 111/377] net: shaper: Reject reparenting of existing nodes When an existing node-scope shaper is moved to a different parent via the group operation, the framework fails to update the leaves count on both the old and new parent shapers. Only newly created nodes (handle.id == NET_SHAPER_ID_UNSPEC) trigger the parent leaves increment at line 1039. This causes the parent's leaves counter to diverge from the actual number of children in the xarray. When the node is later deleted, pre_del_node() allocates an array sized by the stale leaves count, but the xarray iteration finds more children than expected, hitting the WARN_ON_ONCE guard and returning -EINVAL. Rather than adding reparenting support with complex leaves count bookkeeping, reject group calls that attempt to change an existing node's parent. Updates to an existing node's rate or leaves under the same parent remain permitted. We expect that for any modification of the topology user should always create new groups and let the kernel garbage collect the leaf-less nodes. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260506233745.111895-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 94bc9c7382ea..1069fa4eb9f6 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -964,15 +964,22 @@ static int __net_shaper_group(struct net_shaper_binding *binding, int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + struct net_shaper *cur = NULL; + new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; - if (!new_node && !net_shaper_lookup(binding, &node->handle)) { - /* The related attribute is not available when - * reaching here from the delete() op. - */ - NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", - node->handle.scope, node->handle.id); - return -ENOENT; + if (!new_node) { + cur = net_shaper_lookup(binding, &node->handle); + if (!cur) { + /* The related attribute is not available + * when reaching here from the delete() op. + */ + NL_SET_ERR_MSG_FMT(extack, + "Node shaper %d:%d does not exist", + node->handle.scope, + node->handle.id); + return -ENOENT; + } } /* When unspecified, the node parent scope is inherited from @@ -986,6 +993,15 @@ static int __net_shaper_group(struct net_shaper_binding *binding, return ret; } + if (cur && net_shaper_handle_cmp(&cur->parent, + &node->parent)) { + NL_SET_ERR_MSG_FMT(extack, + "Cannot reparent node shaper %d:%d", + node->handle.scope, + node->handle.id); + return -EOPNOTSUPP; + } + } else { net_shaper_default_parent(&node->handle, &node->parent); } From 1619553b0a6ba7a966b17b0226f3acb9dd4d5380 Mon Sep 17 00:00:00 2001 From: Matt Vollrath Date: Wed, 6 May 2026 14:48:10 -0700 Subject: [PATCH 112/377] i40e: Cleanup PTP registration on probe failure Fix two conditions which would leak PTP registration on probe failure: 1. i40e_setup_pf_switch can encounter an error in i40e_setup_pf_filter_control, call i40e_ptp_init, then return non-zero, sending i40e_probe to err_vsis. 2. i40e_setup_misc_vector can return non-zero, sending i40e_probe to err_vsis. Both of these conditions have been present since PTP was introduced in this driver. Found with coccinelle. Fixes: beb0dff1251db ("i40e: enable PTP") Signed-off-by: Matt Vollrath Tested-by: Sunitha Mekala Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-1-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 028bd500603a..f06fcef644e5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16108,6 +16108,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Unwind what we've done if something failed in the setup */ err_vsis: set_bit(__I40E_DOWN, pf->state); + i40e_ptp_stop(pf); i40e_clear_interrupt_scheme(pf); kfree(pf->vsi); err_switch_setup: From 678b713ece1e853f11e670a84cb887c35e1381b7 Mon Sep 17 00:00:00 2001 From: Matt Vollrath Date: Wed, 6 May 2026 14:48:11 -0700 Subject: [PATCH 113/377] i40e: Cleanup PTP pins on probe failure PTP pin structs are allocated early in probe, but never cleaned up. Fix this by calling i40e_ptp_free_pins in the error path. To support this, i40e_ptp_free_pins is added to the header and pin_config is correctly nullified after being freed. This has been an issue since i40e_ptp_alloc_pins was introduced. Fixes: 1050713026a08 ("i40e: add support for PTP external synchronization clock") Reported-by: Kohei Enju Cc: stable@vger.kernel.org Signed-off-by: Matt Vollrath Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Reviewed-by: Kohei Enju Tested-by: Sunitha Mekala Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-2-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_ptp.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index dcb50c2e1aa2..83e780919ac9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1318,6 +1318,7 @@ void i40e_ptp_restore_hw_time(struct i40e_pf *pf); void i40e_ptp_init(struct i40e_pf *pf); void i40e_ptp_stop(struct i40e_pf *pf); int i40e_ptp_alloc_pins(struct i40e_pf *pf); +void i40e_ptp_free_pins(struct i40e_pf *pf); int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); int i40e_get_partition_bw_setting(struct i40e_pf *pf); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f06fcef644e5..6d4f9218dc68 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16112,6 +16112,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i40e_clear_interrupt_scheme(pf); kfree(pf->vsi); err_switch_setup: + i40e_ptp_free_pins(pf); i40e_reset_interrupt_capability(pf); timer_shutdown_sync(&pf->service_timer); err_mac_addr: diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index 404a716db8da..7d07c389bb23 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -940,12 +940,13 @@ int i40e_ptp_hwtstamp_get(struct net_device *netdev, * * Release memory allocated for PTP pins. **/ -static void i40e_ptp_free_pins(struct i40e_pf *pf) +void i40e_ptp_free_pins(struct i40e_pf *pf) { if (i40e_is_ptp_pin_dev(&pf->hw)) { kfree(pf->ptp_pins); kfree(pf->ptp_caps.pin_config); pf->ptp_pins = NULL; + pf->ptp_caps.pin_config = NULL; } } From da4f76b6a84ede14a71282ef841768299ead0221 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Wed, 6 May 2026 14:48:12 -0700 Subject: [PATCH 114/377] idpf: fix read_dev_clk_lock spinlock init in idpf_ptp_init() In idpf_ptp_init(), read_dev_clk_lock is initialized after ptp_schedule_worker() had already been called (and after idpf_ptp_settime64() could reach the lock). The PTP aux worker fires immediately upon scheduling and can call into idpf_ptp_read_src_clk_reg_direct(), which takes spin_lock(&ptp->read_dev_clk_lock) on an uninitialized lock, triggering the lockdep "non-static key" warning: [12973.796587] idpf 0000:83:00.0: Device HW Reset initiated [12974.094507] INFO: trying to register non-static key. ... [12974.097208] Call Trace: [12974.097213] [12974.097218] dump_stack_lvl+0x93/0xe0 [12974.097234] register_lock_class+0x4c4/0x4e0 [12974.097249] ? __lock_acquire+0x427/0x2290 [12974.097259] __lock_acquire+0x98/0x2290 [12974.097272] lock_acquire+0xc6/0x310 [12974.097281] ? idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097311] ? lockdep_hardirqs_on_prepare+0xde/0x190 [12974.097318] ? finish_task_switch.isra.0+0xd2/0x350 [12974.097330] ? __pfx_ptp_aux_kworker+0x10/0x10 [ptp] [12974.097343] _raw_spin_lock+0x30/0x40 [12974.097353] ? idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097373] idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097391] ? kthread_worker_fn+0x88/0x3d0 [12974.097404] ? kthread_worker_fn+0x4e/0x3d0 [12974.097411] idpf_ptp_update_cached_phctime+0x26/0x120 [idpf] [12974.097428] ? _raw_spin_unlock_irq+0x28/0x50 [12974.097436] idpf_ptp_do_aux_work+0x15/0x20 [idpf] [12974.097454] ptp_aux_kworker+0x20/0x40 [ptp] [12974.097464] kthread_worker_fn+0xd5/0x3d0 [12974.097474] ? __pfx_kthread_worker_fn+0x10/0x10 [12974.097482] kthread+0xf4/0x130 [12974.097489] ? __pfx_kthread+0x10/0x10 [12974.097498] ret_from_fork+0x32c/0x410 [12974.097512] ? __pfx_kthread+0x10/0x10 [12974.097519] ret_from_fork_asm+0x1a/0x30 [12974.097540] Move the call to spin_lock_init() up a bit to make sure read_dev_clk_lock is not touched before it's been initialized. Fixes: 5cb8805d2366 ("idpf: negotiate PTP capabilities and get PTP clock") Signed-off-by: Emil Tantilov Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-3-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_ptp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_ptp.c b/drivers/net/ethernet/intel/idpf/idpf_ptp.c index eec91c4f0a75..4a51d2727547 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_ptp.c +++ b/drivers/net/ethernet/intel/idpf/idpf_ptp.c @@ -952,6 +952,8 @@ int idpf_ptp_init(struct idpf_adapter *adapter) goto free_ptp; } + spin_lock_init(&adapter->ptp->read_dev_clk_lock); + err = idpf_ptp_create_clock(adapter); if (err) goto free_ptp; @@ -977,8 +979,6 @@ int idpf_ptp_init(struct idpf_adapter *adapter) goto remove_clock; } - spin_lock_init(&adapter->ptp->read_dev_clk_lock); - pci_dbg(adapter->pdev, "PTP init successful\n"); return 0; From 6c77b9510829a424d1b74409b7db9456e3522871 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 May 2026 14:48:13 -0700 Subject: [PATCH 115/377] idpf: fix double free and use-after-free in aux device error paths When auxiliary_device_add() fails in idpf_plug_vport_aux_dev() or idpf_plug_core_aux_dev(), the err_aux_dev_add label calls auxiliary_device_uninit() and falls through to err_aux_dev_init. The uninit call will trigger put_device(), which invokes the release callback (idpf_vport_adev_release / idpf_core_adev_release) that frees iadev. The fall-through then reads adev->id from the freed iadev for ida_free() and double-frees iadev with kfree(). Free the IDA slot and clear the back-pointer before uninit, while adev is still valid, then return immediately. Commit 65637c3a1811 ("idpf: fix UAF in RDMA core aux dev deinitialization") fixed the same use-after-free in the matching unplug path in this file but missed both probe error paths. Cc: Tony Nguyen Cc: Przemek Kitszel Cc: Andrew Lunn Cc: stable@kernel.org Fixes: be91128c579c ("idpf: implement RDMA vport auxiliary dev create, init, and destroy") Fixes: f4312e6bfa2a ("idpf: implement core RDMA auxiliary dev create, init, and destroy") Signed-off-by: Greg Kroah-Hartman Reviewed-by: Aleksandr Loktionov Reviewed-by: Paul Menzel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-4-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_idc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c index 7e4f4ac92653..b7d6b08fc89e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_idc.c +++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c @@ -90,7 +90,10 @@ static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info, return 0; err_aux_dev_add: + ida_free(&idpf_idc_ida, adev->id); + vdev_info->adev = NULL; auxiliary_device_uninit(adev); + return ret; err_aux_dev_init: ida_free(&idpf_idc_ida, adev->id); err_ida_alloc: @@ -228,7 +231,10 @@ static int idpf_plug_core_aux_dev(struct iidc_rdma_core_dev_info *cdev_info) return 0; err_aux_dev_add: + ida_free(&idpf_idc_ida, adev->id); + cdev_info->adev = NULL; auxiliary_device_uninit(adev); + return ret; err_aux_dev_init: ida_free(&idpf_idc_ida, adev->id); err_ida_alloc: From b3cda96feb60d91fe88d52b974ff110dcfa91239 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Wed, 6 May 2026 14:48:14 -0700 Subject: [PATCH 116/377] ice: fix setting RSS VSI hash for E830 ice_set_rss_hfunc() performs a VSI update, in which it sets hashing function, leaving other VSI options unchanged. However, ::q_opt_flags is mistakenly set to the value of another field, instead of its original value, probably due to a typo. What happens next is hardware-dependent: On E810, only the first bit is meaningful (see ICE_AQ_VSI_Q_OPT_PE_FLTR_EN) and can potentially end up in a different state than before VSI update. On E830, some of the remaining bits are not reserved. Setting them to some unrelated values can cause the firmware to reject the update because of invalid settings, or worse - succeed. Reproducer: sudo ethtool -X $PF1 equal 8 Output in dmesg: Failed to configure RSS hash for VSI 6, error -5 Fixes: 352e9bf23813 ("ice: enable symmetric-xor RSS for Toeplitz hash function") Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Signed-off-by: Marcin Szycik Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-5-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1d1947a7fe11..c52c465280f7 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8046,7 +8046,7 @@ int ice_set_rss_hfunc(struct ice_vsi *vsi, u8 hfunc) ctx->info.q_opt_rss |= FIELD_PREP(ICE_AQ_VSI_Q_OPT_RSS_HASH_M, hfunc); ctx->info.q_opt_tc = vsi->info.q_opt_tc; - ctx->info.q_opt_flags = vsi->info.q_opt_rss; + ctx->info.q_opt_flags = vsi->info.q_opt_flags; err = ice_update_vsi(hw, vsi->idx, ctx, NULL); if (err) { From 0ded1f36ba4021cba50513e80be6b6e173710168 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 6 May 2026 14:48:15 -0700 Subject: [PATCH 117/377] ice: fix locking in ice_dcb_rebuild() Move the mutex_lock() call up to prevent that DCB settings change after the first ice_query_port_ets() call. The second ice_query_port_ets() call in ice_dcb_rebuild() is already protected by pf->tc_mutex. This also fixes a bug in an error path, as before taking the first "goto dcb_error" in the function jumped over mutex_lock() to mutex_unlock(). This bug has been detected by the clang thread-safety analyzer. Cc: intel-wired-lan@lists.osuosl.org Fixes: 242b5e068b25 ("ice: Fix DCB rebuild after reset") Signed-off-by: Bart Van Assche Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Tested-by: Arpana Arland Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-6-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 16aa25535152..0bc6dd375687 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -537,14 +537,14 @@ void ice_dcb_rebuild(struct ice_pf *pf) struct ice_dcbx_cfg *err_cfg; int ret; + mutex_lock(&pf->tc_mutex); + ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } - mutex_lock(&pf->tc_mutex); - if (!pf->hw.port_info->qos_cfg.is_sw_lldp) ice_cfg_etsrec_defaults(pf->hw.port_info); From cce709d8df6ba6d2a0a0dbf34acc2cdd9e23bd46 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 6 May 2026 14:48:16 -0700 Subject: [PATCH 118/377] ice: dpll: fix rclk pin state get for E810 The refactoring of ice_dpll_rclk_state_on_pin_get() to use ice_dpll_pin_get_parent_idx() omitted the base_rclk_idx adjustment that was correctly added in the ice_dpll_rclk_state_on_pin_set() path. This breaks E810 devices where base_rclk_idx is non-zero, causing the wrong hardware index to be used for pin state lookup and incorrect recovered clock state to be reported via the DPLL subsystem. E825C is unaffected as its base_rclk_idx is 0. While at it, add bounds check against ICE_DPLL_RCLK_NUM_MAX on hw_idx after the base_rclk_idx subtraction in both ice_dpll_rclk_state_on_pin_{get,set}() to prevent out-of-bounds access on the pin state array. Fixes: ad1df4f2d591 ("ice: dpll: Support E825-C SyncE and dynamic pin discovery") Signed-off-by: Ivan Vecera Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-7-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dpll.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 27b460926bac..892bc7c2e28b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2523,6 +2523,8 @@ ice_dpll_rclk_state_on_pin_set(const struct dpll_pin *pin, void *pin_priv, if (hw_idx < 0) goto unlock; hw_idx -= pf->dplls.base_rclk_idx; + if (hw_idx >= ICE_DPLL_RCLK_NUM_MAX) + goto unlock; if ((enable && p->state[hw_idx] == DPLL_PIN_STATE_CONNECTED) || (!enable && p->state[hw_idx] == DPLL_PIN_STATE_DISCONNECTED)) { @@ -2586,6 +2588,9 @@ ice_dpll_rclk_state_on_pin_get(const struct dpll_pin *pin, void *pin_priv, hw_idx = ice_dpll_pin_get_parent_idx(p, parent_pin); if (hw_idx < 0) goto unlock; + hw_idx -= pf->dplls.base_rclk_idx; + if (hw_idx >= ICE_DPLL_RCLK_NUM_MAX) + goto unlock; ret = ice_dpll_pin_state_update(pf, p, ICE_DPLL_PIN_TYPE_RCLK_INPUT, extack); From 30f1658fc5387384c7a60b9d15c79cb959512c1a Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 6 May 2026 14:48:17 -0700 Subject: [PATCH 119/377] ice: dpll: fix misplaced header macros The CGU register definitions (ICE_CGU_R10, ICE_CGU_R11 and related field masks) were placed after the #endif of the _ICE_DPLL_H_ include guard, leaving them unprotected. Move them inside the guard. Fixes: ad1df4f2d591 ("ice: dpll: Support E825-C SyncE and dynamic pin discovery") Signed-off-by: Ivan Vecera Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-8-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dpll.h | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h index ae42cdea0ee1..8678575359b9 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.h +++ b/drivers/net/ethernet/intel/ice/ice_dpll.h @@ -8,6 +8,22 @@ #define ICE_DPLL_RCLK_NUM_MAX 4 +#define ICE_CGU_R10 0x28 +#define ICE_CGU_R10_SYNCE_CLKO_SEL GENMASK(8, 5) +#define ICE_CGU_R10_SYNCE_CLKODIV_M1 GENMASK(13, 9) +#define ICE_CGU_R10_SYNCE_CLKODIV_LOAD BIT(14) +#define ICE_CGU_R10_SYNCE_DCK_RST BIT(15) +#define ICE_CGU_R10_SYNCE_ETHCLKO_SEL GENMASK(18, 16) +#define ICE_CGU_R10_SYNCE_ETHDIV_M1 GENMASK(23, 19) +#define ICE_CGU_R10_SYNCE_ETHDIV_LOAD BIT(24) +#define ICE_CGU_R10_SYNCE_DCK2_RST BIT(25) +#define ICE_CGU_R10_SYNCE_S_REF_CLK GENMASK(31, 27) + +#define ICE_CGU_R11 0x2C +#define ICE_CGU_R11_SYNCE_S_BYP_CLK GENMASK(6, 1) + +#define ICE_CGU_BYPASS_MUX_OFFSET_E825C 3 + /** * enum ice_dpll_pin_sw - enumerate ice software pin indices: * @ICE_DPLL_PIN_SW_1_IDX: index of first SW pin @@ -157,19 +173,3 @@ static inline void ice_dpll_deinit(struct ice_pf *pf) { } #endif #endif - -#define ICE_CGU_R10 0x28 -#define ICE_CGU_R10_SYNCE_CLKO_SEL GENMASK(8, 5) -#define ICE_CGU_R10_SYNCE_CLKODIV_M1 GENMASK(13, 9) -#define ICE_CGU_R10_SYNCE_CLKODIV_LOAD BIT(14) -#define ICE_CGU_R10_SYNCE_DCK_RST BIT(15) -#define ICE_CGU_R10_SYNCE_ETHCLKO_SEL GENMASK(18, 16) -#define ICE_CGU_R10_SYNCE_ETHDIV_M1 GENMASK(23, 19) -#define ICE_CGU_R10_SYNCE_ETHDIV_LOAD BIT(24) -#define ICE_CGU_R10_SYNCE_DCK2_RST BIT(25) -#define ICE_CGU_R10_SYNCE_S_REF_CLK GENMASK(31, 27) - -#define ICE_CGU_R11 0x2C -#define ICE_CGU_R11_SYNCE_S_BYP_CLK GENMASK(6, 1) - -#define ICE_CGU_BYPASS_MUX_OFFSET_E825C 3 From c4f3d6eb1fcf6cd9ce4644f604d5aad1ce594dfc Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Wed, 6 May 2026 21:43:11 +0900 Subject: [PATCH 120/377] net: lan966x: avoid unregistering netdev on register failure lan966x_probe_port() stores the newly allocated net_device in the port before calling register_netdev(). If register_netdev() fails, the probe error path calls lan966x_cleanup_ports(), which sees port->dev and calls unregister_netdev() for a device that was never registered. Destroy the phylink instance created for this port and clear port->dev before returning the registration error. The common cleanup path now skips ports without port->dev before reaching the registered netdev cleanup, so it only handles ports that reached the registered-netdev lifetime. This also avoids treating an uninitialized FDMA netdev and the failed port as a NULL == NULL match in the common cleanup path. Fixes: d28d6d2e37d1 ("net: lan966x: add port module support") Co-developed-by: Ijae Kim Signed-off-by: Ijae Kim Signed-off-by: Myeonghun Pak Link: https://patch.msgid.link/20260506124331.31945-1-mhun512@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 47752d3fde0b..1179a6e127c5 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -749,11 +749,10 @@ static void lan966x_cleanup_ports(struct lan966x *lan966x) for (p = 0; p < lan966x->num_phys_ports; p++) { port = lan966x->ports[p]; - if (!port) + if (!port || !port->dev) continue; - if (port->dev) - unregister_netdev(port->dev); + unregister_netdev(port->dev); lan966x_xdp_port_deinit(port); if (lan966x->fdma && lan966x->fdma_ndev == port->dev) @@ -873,6 +872,9 @@ static int lan966x_probe_port(struct lan966x *lan966x, u32 p, err = register_netdev(dev); if (err) { dev_err(lan966x->dev, "register_netdev failed\n"); + phylink_destroy(phylink); + port->phylink = NULL; + port->dev = NULL; return err; } From 6635fa84403c3a59455b66007c019a7cc632db30 Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Thu, 7 May 2026 01:28:13 +0530 Subject: [PATCH 121/377] net: ti: icssm-prueth: fix eth_ports_node leak in probe The error path on of_property_read_u32() failure inside icssm_prueth_probe() returns without putting eth_ports_node, which was acquired before the for_each_child_of_node() loop. Drop it before returning. Fixes: 511f6c1ae093 ("net: ti: icssm-prueth: Adds ICSSM Ethernet driver") Signed-off-by: Shitalkumar Gandhi Link: https://patch.msgid.link/20260506195813.641610-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssm/icssm_prueth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/icssm/icssm_prueth.c b/drivers/net/ethernet/ti/icssm/icssm_prueth.c index 53bbd9290904..b7e94244355a 100644 --- a/drivers/net/ethernet/ti/icssm/icssm_prueth.c +++ b/drivers/net/ethernet/ti/icssm/icssm_prueth.c @@ -1825,6 +1825,7 @@ static int icssm_prueth_probe(struct platform_device *pdev) dev_err(dev, "%pOF error reading port_id %d\n", eth_node, ret); of_node_put(eth_node); + of_node_put(eth_ports_node); return ret; } From abb5f36771cc4c05899b34000829a787572a8817 Mon Sep 17 00:00:00 2001 From: Ben Morris Date: Thu, 7 May 2026 17:14:55 -0700 Subject: [PATCH 122/377] sctp: revalidate list cursor after sctp_sendmsg_to_asoc() in SCTP_SENDALL The SCTP_SENDALL path in sctp_sendmsg() iterates ep->asocs with list_for_each_entry_safe(), which caches the next entry in @tmp before the loop body runs. The body calls sctp_sendmsg_to_asoc(), which may drop the socket lock inside sctp_wait_for_sndbuf(). While the lock is dropped, another thread can SCTP_SOCKOPT_PEELOFF the association cached in @tmp, migrating it to a new endpoint via sctp_sock_migrate() (list_del_init() + list_add_tail() to newep->asocs), and optionally close the new socket which frees the association via kfree_rcu(). The cached @tmp can also be freed by a network ABORT for that association, processed in softirq while the lock is dropped. sctp_wait_for_sndbuf() revalidates @asoc (the current entry) on re-lock via the "sk != asoc->base.sk" and "asoc->base.dead" checks, but nothing revalidates @tmp. After a successful return, the iterator advances to the stale @tmp, yielding either a use-after-free (if the peeled socket was closed) or a list-walk onto the new endpoint's list head (type confusion of &newep->asocs as a struct sctp_association *). Both are reachable from CapEff=0; the type-confusion path gives controlled indirect call via the outqueue.sched->init_sid pointer. Fix by re-deriving @tmp from @asoc after sctp_sendmsg_to_asoc() returns. @asoc is known to still be on ep->asocs at that point: the only callers that list_del an association from ep->asocs are sctp_association_free() (which sets asoc->base.dead) and sctp_assoc_migrate() (which changes asoc->base.sk), and sctp_wait_for_sndbuf() checks both under the lock before any successful return; a tripped check propagates as err < 0 and the loop bails before the re-derive. The SCTP_ABORT path in sctp_sendmsg_check_sflags() returns 0 and the loop hits 'continue' before sctp_sendmsg_to_asoc() is ever called, so the @tmp cached by list_for_each_entry_safe() still covers the lock-held free that ba59fb027307 ("sctp: walk the list of asoc safely") was added for. Fixes: 4910280503f3 ("sctp: add support for snd flag SCTP_SENDALL process in sendmsg") Cc: stable@vger.kernel.org Signed-off-by: Ben Morris Acked-by: Xin Long Link: https://patch.msgid.link/20260508001455.3137-1-joycathacker@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/socket.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 58d0d9747f0b..1d2568bb6bc2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1986,6 +1986,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); + + /* sctp_sendmsg_to_asoc() may have released the socket + * lock (sctp_wait_for_sndbuf), during which other + * associations on ep->asocs could have been peeled + * off or freed. @asoc itself is revalidated by the + * base.dead and base.sk checks in sctp_wait_for_sndbuf, + * so re-derive the cached cursor from it. + */ + tmp = list_next_entry(asoc, asocs); } goto out_unlock; From 496c0c4c53bbe1bad97e82cd12103df61a6e459d Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Thu, 7 May 2026 17:53:32 +0200 Subject: [PATCH 123/377] net: wan: fsl_ucc_hdlc: free tx_skbuff in uhdlc_memclean When the device is removed all allocated resources should be freed. In uhdlc_memclean the netdev transmit queue was already stopped. But at this point we may have pending skb in the transmit queue which must be freed. Therefore iterate over the tx_skbuff pointers and free all pending skb. The issue was discovered by sashiko. Tested on a ls1043a board running HDLC in bus mode on kernel 6.12. https: //sashiko.dev/#/patchset/20260429114208.941011-1-holger.brunck%40hitachienergy.com Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Link: https://patch.msgid.link/20260507155332.3452319-1-holger.brunck@hitachienergy.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 15bfb78381d4..809f21fb93f5 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -740,6 +740,8 @@ static int uhdlc_open(struct net_device *dev) static void uhdlc_memclean(struct ucc_hdlc_private *priv) { + int i; + qe_muram_free(ioread16be(&priv->ucc_pram->riptr)); qe_muram_free(ioread16be(&priv->ucc_pram->tiptr)); @@ -770,6 +772,11 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv) kfree(priv->rx_skbuff); priv->rx_skbuff = NULL; + for (i = 0; i < TX_BD_RING_LEN; i++) { + dev_kfree_skb(priv->tx_skbuff[i]); + priv->tx_skbuff[i] = NULL; + } + kfree(priv->tx_skbuff); priv->tx_skbuff = NULL; From 46e9b0224475abc739612ef72c35b7c90211a0c1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2026 13:41:13 -0700 Subject: [PATCH 124/377] tools/ynl: add missing uapi header deps in Makefile.deps ethtool.h includes linux/typelimits.h which is a relatively new header not yet shipped in most distro kernel-header packages. Without the explicit entry, the build silently falls through to -idirafter. dev_energymodel.h is a new YNL family whose uapi header is not in system paths at all and was missing a CFLAGS entry entirely. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260508204114.205896-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 08205f9fc525..cc53b2f21c44 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -15,9 +15,11 @@ UAPI_PATH:=../../../../include/uapi/ get_hdr_inc=-D$(1) -include $(UAPI_PATH)/linux/$(2) get_hdr_inc2=-D$(1) -D$(2) -include $(UAPI_PATH)/linux/$(3) +CFLAGS_dev-energymodel:=$(call get_hdr_inc,_LINUX_DEV_ENERGYMODEL_H,dev_energymodel.h) CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_dpll:=$(call get_hdr_inc,_LINUX_DPLL_H,dpll.h) -CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ +CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_TYPELIMITS_H,typelimits.h) \ + $(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) From 304d81a2fbf2b454def4debcb38ea173911b72cd Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 7 Apr 2026 18:08:57 -0400 Subject: [PATCH 125/377] nfsd: fix file change detection in CB_GETATTR RFC 8881, section 10.4.3 doesn't say anything about caching the file size in the delegation record, nor does it say anything about comparing a cached file size with the size reported by the client in the CB_GETATTR reply for the purpose of determining if the client holds modified data for the file. What section 10.4.3 of RFC 8881 does say is that the server should compare the *current* file size with the size reported by the client holding the delegation in the CB_GETATTR reply, and if they differ to treat it as a modification regardless of the change attribute retrieved via the CB_GETATTR. Doing otherwise would cause the server to believe the client holding the delegation has a modified version of the file, even if the client flushed the modifications to the server prior to the CB_GETATTR. This would have the added side effect of subsequent CB_GETATTRs causing updates to the mtime, ctime, and change attribute even if the client holding the delegation makes no further updates to the file. Modify nfsd4_deleg_getattr_conflict() to obtain the current file size via i_size_read(). Retain the ncf_cur_fsize field, since it's a convenient way to return the file size back to nfsd4_encode_fattr4(), but don't use it for the purpose of detecting file changes. Remove the unnecessary initialization of ncf_cur_fsize in nfs4_open_delegation(). Also, if we recall the delegation (because the client didn't respond to the CB_GETATTR), then skip the logic that checks the nfs4_cb_fattr fields. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Cc: stable@vger.kernel.org Signed-off-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b4d0e82b2690..a9f7bd491b8c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6378,7 +6378,6 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, } open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; - dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); dp->dl_atime = stat.atime; dp->dl_ctime = stat.ctime; @@ -9429,11 +9428,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) goto out_status; + status = nfs_ok; + goto out_status; + } + if (!ncf->ncf_file_modified) { + if (ncf->ncf_initial_cinfo != ncf->ncf_cb_change) + ncf->ncf_file_modified = true; + else if (i_size_read(inode) != ncf->ncf_cb_fsize) + ncf->ncf_file_modified = true; } - if (!ncf->ncf_file_modified && - (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || - ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) - ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { int err; From 2863bac7f49c4acd80a048ce52506a2b9c8db015 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 10 Apr 2026 12:09:19 -0400 Subject: [PATCH 126/377] nfsd: update mtime/ctime on CLONE in presense of delegated attributes When delegated attributes are given on open, the file is opened with NOCMTIME and modifying operations do not update mtime/ctime as to not get out-of-sync with the client's delegated view. However, for CLONE operation, the server should update its view of mtime/ctime and reflect that in any GETATTR queries. Fixes: e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4state.c | 44 +++++++++++++++++++++++++++++--------------- fs/nfsd/state.h | 1 + 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2797da8cc950..5de8f37df78a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1413,6 +1413,9 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); + if (!status && (READ_ONCE(dst->nf_file->f_mode) & FMODE_NOCMTIME) != 0) + nfsd_update_cmtime_attr(dst->nf_file, 0); + nfsd_file_put(dst); nfsd_file_put(src); out: diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a9f7bd491b8c..3cd8b3b59de4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1221,10 +1221,6 @@ static void put_deleg_file(struct nfs4_file *fp) static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) { - struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME | ATTR_DELEG }; - struct inode *inode = file_inode(f); - int ret; - /* don't do anything if FMODE_NOCMTIME isn't set */ if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0) return; @@ -1242,17 +1238,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f return; /* Stamp everything to "now" */ - inode_lock(inode); - ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); - inode_unlock(inode); - if (ret) { - struct inode *inode = file_inode(f); - - pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%lu: %d\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino, ret); - } + nfsd_update_cmtime_attr(f, ATTR_ATIME); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) @@ -9563,3 +9549,31 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, put_nfs4_file(fp); return ERR_PTR(status); } + +/** + * nfsd_update_cmtime_attr - update file's delegated ctime/mtime, + * and optionally other attributes (ie ATTR_ATIME). + * @f: pointer to an opened file + * @flags: any additional flags that should be updated + * + * Given upon opening a file delegated attributes were issues, update + * @f attributes to current times. + */ +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags) +{ + int ret; + struct inode *inode = file_inode(f); + struct iattr attr = { + .ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_DELEG | flags, + }; + + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &attr, NULL); + inode_unlock(inode); + if (ret) + pr_notice_ratelimited("nfsd: Unable to update timestamps on " + "inode %02x:%02x:%lu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); +} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 953675eba5c3..c5ccea64c281 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -843,6 +843,7 @@ extern void nfsd4_shutdown_copy(struct nfs4_client *clp); void nfsd4_put_client(struct nfs4_client *clp); void nfsd4_async_copy_reaper(struct nfsd_net *nn); bool nfsd4_has_active_async_copies(struct nfs4_client *clp); +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); From 4183cf383b6faec17a0882b84cd2d901dba62b16 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 10 Apr 2026 12:09:20 -0400 Subject: [PATCH 127/377] nfsd: update mtime/ctime on COPY in presence of delegated attributes When delegated attributes are given on open, the file is opened with NOCMTIME and modifying operations do not update mtime/ctime as to not get out-of-sync with the client's delegated view. However, for COPY operation, the server should update its view of mtime/ctime and reflect that in any GETATTR queries. Fixes: e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 11 ++++++++++- fs/nfsd/xdr4.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5de8f37df78a..ab39ec885440 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2121,8 +2121,10 @@ static int nfsd4_do_async_copy(void *data) set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); trace_nfsd_copy_async_done(copy); - nfsd4_send_cb_offload(copy); atomic_dec(©->cp_nn->pending_async_copies); + if (copy->cp_res.wr_bytes_written > 0 && copy->attr_update) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); + nfsd4_send_cb_offload(copy); return 0; } @@ -2182,6 +2184,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0) + async_copy->attr_update = true; memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, cstate->session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); @@ -2200,6 +2205,10 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { status = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, true); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0 && + copy->cp_res.wr_bytes_written > 0) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); } out: trace_nfsd_copy_done(copy, status); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 417e9ad9fbb3..9a4124c77e04 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -752,6 +752,7 @@ struct nfsd4_copy { struct nfsd_file *nf_src; struct nfsd_file *nf_dst; + bool attr_update; copy_stateid_t cp_stateid; From c00b472a1322d4f5424cd7b6c7d00270eae673bd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 11 Apr 2026 17:12:16 -0400 Subject: [PATCH 128/377] sunrpc: start cache request seqno at 1 to fix netlink GET_REQS sunrpc_cache_requests_snapshot() filters requests with crq->seqno <= min_seqno. The min_seqno for the first netlink dump call is cb->args[0] which is 0. Since next_seqno was initialized to 0, the very first cache request got seqno=0 and was silently skipped by the snapshot (0 <= 0 is true). This caused netlink-based GET_REQS to return 0 pending requests even when a request was queued, preventing mountd from resolving cache entries (particularly expkey/nfsd.fh). The unresolved CACHE_PENDING state blocked all further notifications for the entry, leading to permanent NFS4ERR_DELAY hangs. Start next_seqno at 1 so all requests have seqno >= 1 and pass the snapshot filter when min_seqno is 0. Fixes: facc4e3c8042 ("sunrpc: split cache_detail queue into request and reader lists") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7081c1214e6c..b5474ce534fb 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -403,7 +403,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); - cd->next_seqno = 0; + cd->next_seqno = 1; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; From 4f8ef58c10bfe5f86a643c7c8331b37e69e3dae1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 19 Apr 2026 14:52:59 -0400 Subject: [PATCH 129/377] NFSD: Fix infinite loop in layout state revocation find_one_sb_stid() skips stids whose sc_status is non-zero, but the SC_TYPE_LAYOUT case in nfsd4_revoke_states() never sets sc_status before calling nfsd4_close_layout(). The retry loop therefore finds the same layout stid on every iteration, hanging the revoker indefinitely. Fixes: 1e33e1414bec ("nfsd: allow layout state to be admin-revoked.") Reported-by: Dai Ngo Reviewed-by: Jeff Layton Tested-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3cd8b3b59de4..c92bc0c11065 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1851,6 +1851,13 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) break; case SC_TYPE_LAYOUT: ls = layoutstateid(stid); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + } + spin_unlock(&clp->cl_lock); nfsd4_close_layout(ls); break; } From e42c755582f0960e684298762f0ab927b3778376 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Fri, 8 May 2026 06:21:21 +0000 Subject: [PATCH 130/377] net: ena: PHC: Fix potential use-after-free in get_timestamp Move the phc->active check and resp pointer assignment to after acquiring the spinlock. Previously, phc->active was checked without holding the lock, and resp was cached from ena_dev->phc.virt_addr before the lock was acquired. If ena_com_phc_destroy() runs between the lockless active check and the lock acquisition, it sets active=false, releases the lock, frees the DMA memory, and sets virt_addr=NULL. The get_timestamp path would then read a NULL virt_addr and dereference it. With both the active check and the pointer read under the lock, destroy cannot free the memory while get_timestamp is using it. Fixes: e0ea34158ee8 ("net: ena: Add PHC support in the ENA driver") Cc: stable@vger.kernel.org Signed-off-by: Arthur Kiyanovski Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260508062126.7273-1-akiyano@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_com.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index e67b592e5697..8c86789d867a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1782,20 +1782,23 @@ void ena_com_phc_destroy(struct ena_com_dev *ena_dev) int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) { - volatile struct ena_admin_phc_resp *resp = ena_dev->phc.virt_addr; const ktime_t zero_system_time = ktime_set(0, 0); struct ena_com_phc_info *phc = &ena_dev->phc; + volatile struct ena_admin_phc_resp *resp; ktime_t expire_time; ktime_t block_time; unsigned long flags = 0; int ret = 0; + spin_lock_irqsave(&phc->lock, flags); + if (!phc->active) { + spin_unlock_irqrestore(&phc->lock, flags); netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } - spin_lock_irqsave(&phc->lock, flags); + resp = ena_dev->phc.virt_addr; /* Check if PHC is in blocked state */ if (unlikely(ktime_compare(phc->system_time, zero_system_time))) { From a450063ef86b9967234ca1f896c0d77400c74f11 Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Thu, 7 May 2026 19:50:24 +0530 Subject: [PATCH 131/377] net: xgene: fix mdio_np leak in xgene_mdiobus_register() The for_each_child_of_node() loop captures mdio_np via break, holding the refcount. of_mdiobus_register() does not consume the reference, so it leaks on success. Put it after registration. Fixes: e6ad767305eb ("drivers: net: Add APM X-Gene SoC ethernet driver support.") Signed-off-by: Shitalkumar Gandhi Link: https://patch.msgid.link/20260507142024.811543-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/apm/xgene/xgene_enet_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index b854b6b42d77..2926e1e59941 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -910,7 +910,9 @@ static int xgene_mdiobus_register(struct xgene_enet_pdata *pdata, return -ENXIO; } - return of_mdiobus_register(mdio, mdio_np); + ret = of_mdiobus_register(mdio, mdio_np); + of_node_put(mdio_np); + return ret; } /* Mask out all PHYs from auto probing. */ From 6947bea4b79115f50138882512f85fa9c93b2827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 132/377] sched_ext: Cleanups in preparation for the SCX_TASK_INIT_BEGIN/DEAD work Cleanups in preparation for the state-machine work that follows: - Convert three sub-sched call sites that open-code rcu_assign_pointer(p->scx.sched, ...) to scx_set_task_sched(). - Move scx_get_task_state()/scx_set_task_state() above the SCX task iter section so scx_task_iter_next_locked() can use them without a forward declaration. No functional change. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index df305712a2d4..10c6e0261f11 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -711,6 +711,41 @@ struct bpf_iter_scx_dsq { } __attribute__((aligned(8))); +static u32 scx_get_task_state(const struct task_struct *p) +{ + return p->scx.flags & SCX_TASK_STATE_MASK; +} + +static void scx_set_task_state(struct task_struct *p, u32 state) +{ + u32 prev_state = scx_get_task_state(p); + bool warn = false; + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state; +} + /* * SCX task iterator. */ @@ -3499,41 +3534,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static u32 scx_get_task_state(const struct task_struct *p) -{ - return p->scx.flags & SCX_TASK_STATE_MASK; -} - -static void scx_set_task_state(struct task_struct *p, u32 state) -{ - u32 prev_state = scx_get_task_state(p); - bool warn = false; - - switch (state) { - case SCX_TASK_NONE: - break; - case SCX_TASK_INIT: - warn = prev_state != SCX_TASK_NONE; - break; - case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; - break; - case SCX_TASK_ENABLED: - warn = prev_state != SCX_TASK_READY; - break; - default: - WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", - prev_state, state, p->comm, p->pid); - return; - } - - WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", - prev_state, state, p->comm, p->pid); - - p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state; -} - static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { int ret; @@ -5696,7 +5696,7 @@ static void scx_fail_parent(struct scx_sched *sch, scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { scx_disable_and_exit_task(sch, p); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); } } scx_task_iter_stop(&sti); @@ -5783,7 +5783,7 @@ static void scx_sub_disable(struct scx_sched *sch) */ scx_disable_and_exit_task(sch, p); scx_set_task_state(p, SCX_TASK_INIT); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } @@ -7212,7 +7212,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) * $p is now only initialized for @sch and READY, which * is what we want. Assign it to @sch and enable. */ - rcu_assign_pointer(p->scx.sched, sch); + scx_set_task_sched(p, sch); scx_enable_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; From 938dd9ab2bd7df0a7e58ce4249794156be9530b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 133/377] sched_ext: Inline scx_init_task() and move RESET_RUNNABLE_AT into scx_set_task_state() Prepare for the SCX_TASK_INIT_BEGIN/DEAD work that follows by collapsing the scx_init_task() helper. Move the SCX_TASK_RESET_RUNNABLE_AT setting into scx_set_task_state() on the INIT transition (it was set unconditionally at every INIT site through the scx_init_task() helper), inline scx_init_task() into scx_fork() and scx_root_enable_workfn(), and drop the helper. As a side effect, scx_sub_disable() migration sequence now also sets RESET_RUNNABLE_AT (it previously wrote INIT directly without going through scx_init_task()). The flag triggers a runnable_at reset on the next set_task_runnable(), which is harmless on a task that has just been moved between scheds. On root-enable, p->scx.flags is written without the task's rq lock. The task isn't visible to scx yet, and a follow-up patch restores the lock-held write. v2: Note p->scx.flags rq-lock relaxation on root-enable path. (Andrea) Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 10c6e0261f11..81841277a54f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -726,6 +726,7 @@ static void scx_set_task_state(struct task_struct *p, u32 state) break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: warn = prev_state == SCX_TASK_NONE; @@ -3585,22 +3586,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo return 0; } -static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) -{ - int ret; - - ret = __scx_init_task(sch, p, fork); - if (!ret) { - /* - * While @p's rq is not locked. @p is not visible to the rest of - * SCX yet and it's safe to update the flags and state. - */ - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; - scx_set_task_state(p, SCX_TASK_INIT); - } - return ret; -} - static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { struct rq *rq = task_rq(p); @@ -3763,10 +3748,11 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif - ret = scx_init_task(sch, p, true); - if (!ret) - scx_set_task_sched(p, sch); - return ret; + ret = __scx_init_task(sch, p, true); + if (unlikely(ret)) + return ret; + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); } return 0; @@ -6897,8 +6883,8 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_unlock(&sti); - ret = scx_init_task(sch, p, false); - if (ret) { + ret = __scx_init_task(sch, p, false); + if (unlikely(ret)) { put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6906,6 +6892,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 134/377] sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- kernel/sched/ext.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..9f1a326ad03e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 81841277a54f..2fc4a12711f9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -723,17 +723,22 @@ static void scx_set_task_state(struct task_struct *p, u32 state) switch (state) { case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); break; case SCX_TASK_ENABLED: warn = prev_state != SCX_TASK_READY; break; + case SCX_TASK_DEAD: + warn = prev_state != SCX_TASK_NONE; + break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", prev_state, state, p->comm, p->pid); @@ -972,11 +977,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) /* * cgroup_task_dead() removes the dead tasks from cset->tasks * after sched_ext_dead() and cgroup iteration may see tasks - * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS - * is set by sched_ext_dead() under @p's rq lock. Test it to + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to * avoid visiting tasks which are already dead from SCX POV. */ - if (p->scx.flags & SCX_TASK_OFF_TASKS) { + if (scx_get_task_state(p) == SCX_TASK_DEAD) { __scx_task_iter_rq_unlock(iter); continue; } @@ -3847,7 +3852,7 @@ void sched_ext_dead(struct task_struct *p) * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. * - * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. @@ -3858,7 +3863,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); - p->scx.flags |= SCX_TASK_OFF_TASKS; + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 135/377] sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 ++++--- kernel/sched/ext.c | 56 ++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03e..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2fc4a12711f9..29fa9ffe7c7b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state) case SCX_TASK_NONE: warn = prev_state == SCX_TASK_DEAD; break; - case SCX_TASK_INIT: + case SCX_TASK_INIT_BEGIN: warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: @@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state) warn = prev_state != SCX_TASK_READY; break; case SCX_TASK_DEAD: - warn = prev_state != SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", @@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); ret = __scx_init_task(sch, p, true); - if (unlikely(ret)) + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); return ret; + } scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); } @@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p) * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } @@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); @@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { + struct rq_flags rf; + struct rq *rq; + /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't @@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!tryget_task_struct(p)) continue; + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); ret = __scx_init_task(sch, p, false); + + rq = task_rq_lock(p, &rf); + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); + task_rq_unlock(rq, p, &rf); put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } - scx_set_task_state(p, SCX_TASK_INIT); - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. + */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); From cd6aab736702f981ac4d128e04a4e33105ea797d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 136/377] sched_ext: Close sub-sched init race with post-init DEAD recheck scx_sub_enable_workfn()'s init pass and scx_sub_disable() migration both drop the rq lock to call __scx_init_task() against the other sched. A TASK_DEAD @p can fall through sched_ext_dead() in that window. sched_ext_dead() runs ops.exit_task() on the sched @p was attached to, not on the sched whose init just completed, so the new allocation leaks. Reuse the DEAD signal set by sched_ext_dead(). After __scx_init_task() returns, take task_rq_lock(p) and check for DEAD; on hit, call scx_sub_init_cancel_task() against the sub sched the init ran for and drop @p; on miss, proceed as before. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 29fa9ffe7c7b..6fbe3160eccd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5777,6 +5777,21 @@ static void scx_sub_disable(struct scx_sched *sch) } rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on @sch (the + * sched @p was on at that point), not on $parent. + * $parent's just-completed init is owed an exit_task() + * and we issue it here. + */ + scx_sub_init_cancel_task(parent, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* * $p is initialized for $parent and still attached to @@ -5791,8 +5806,8 @@ static void scx_sub_disable(struct scx_sched *sch) scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -7212,6 +7227,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto abort; rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on $parent (the + * sched @p was on at that point), not on @sch. @sch's + * just-completed init is owed an exit_task() and we + * issue it here. + */ + scx_sub_init_cancel_task(sch, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + p->scx.flags |= SCX_TASK_SUB_INIT; task_rq_unlock(rq, p, &rf); From d3e73a0808ddfb91ac36cd548643cbbeb00ad4db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 137/377] sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths scx_fail_parent() leaves cgroup tasks at (state=NONE, sched=parent, sched_class=ext) until the parent itself is torn down by the scx_error() it raised. When the later root_disable iterates them, two paths trip on NONE. scx_disable_and_exit_task() re-enters the wrapper at NONE: the inner switch returns early but the trailing scx_set_task_sched(p, NULL) clobbers the parent sched left by scx_fail_parent(), and scx_set_task_state(p, NONE) wastes a write on an already-NONE task. switched_from_scx() then calls scx_disable_task(), which WARNs on non-ENABLED state and writes state=READY, producing a NONE -> READY transition the validation matrix rejects. Treat NONE as "nothing to do" in both paths. Add a NONE early-return at the top of scx_disable_and_exit_task() and a parallel NONE check in switched_from_scx() next to task_dead_and_done(). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6fbe3160eccd..4efe0099f79a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3703,6 +3703,15 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct * static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { + /* + * %NONE means @p is already detached at the SCX level (e.g. handed + * back to the parent by scx_fail_parent() with no init to undo). + * Skip to avoid clobbering scx_task_sched() and writing %NONE again + * on a state that's already %NONE. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + __scx_disable_and_exit_task(sch, p); /* @@ -3921,6 +3930,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; + /* + * %NONE means SCX is no longer tracking @p at the task level (e.g. + * scx_fail_parent() handed @p back to the parent at NONE pending the + * parent's own teardown). There is nothing to disable; calling + * scx_disable_task() would WARN on the non-%ENABLED state and trigger a + * NONE -> READY validation failure. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + scx_disable_task(scx_task_sched(p), p); } From 796ad622040f7f955ccc3973085e953415920496 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Mon, 11 May 2026 09:31:50 +0800 Subject: [PATCH 138/377] cgroup/dmem: Return -ENOMEM on failed pool preallocation get_cg_pool_unlocked() handles allocation failures under dmemcg_lock by dropping the lock, preallocating a pool with GFP_KERNEL, and retrying the locked lookup and creation path. If the fallback allocation fails too, pool remains NULL. Since the loop condition is while (!pool), the function can keep retrying instead of propagating the allocation failure to the caller. Set pool to ERR_PTR(-ENOMEM) when the fallback allocation fails so the loop exits through the existing common return path. The callers already handle ERR_PTR() from get_cg_pool_unlocked(), so this restores the expected error path. Fixes: b168ed458dde ("kernel/cgroup: Add "dmem" memory accounting cgroup") Cc: stable@vger.kernel.org # v6.14+ Signed-off-by: Guopeng Zhang Signed-off-by: Tejun Heo --- kernel/cgroup/dmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f271..4753a67d0f0f 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = NULL; continue; } + pool = ERR_PTR(-ENOMEM); } } From e32e6f02168f2ad7991eb5d160d312d2001520c8 Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Sat, 9 May 2026 16:03:28 +0800 Subject: [PATCH 139/377] selftests/cgroup: Fix cg_read_strcmp() empty string comparison cg_read_strcmp() allocated a buffer sized to strlen(expected) + 1, then passed it to read_text() which calls read(fd, buf, size-1). When comparing against an empty string (""), strlen("") = 0 gives a 1-byte buffer, and read() is asked to read 0 bytes. The file content is never actually read, so strcmp("", buf) always returns 0 regardless of the real content. This caused cg_test_proc_killed() to always report the cgroup as empty immediately, making OOM tests pass without verifying that processes were killed. Signed-off-by: Hongfu Li Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 6a7295347e90..42f54936f4bb 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -106,8 +106,9 @@ int cg_read_strcmp(const char *cgroup, const char *control, /* Handle the case of comparing against empty string */ if (!expected) return -1; - else - size = strlen(expected) + 1; + + /* needs size > 1, otherwise cg_read() reads 0 bytes */ + size = (expected[0] == '\0') ? 2 : strlen(expected) + 1; buf = malloc(size); if (!buf) From 2a3d7256faf06d1a15bb5b07e851ac4e1680c26d Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Mon, 11 May 2026 09:39:57 +0800 Subject: [PATCH 140/377] selftests/cgroup: Fix string comparison in write_test Use string comparison (!=) instead of numeric comparison (-ne) for cpuset values like "0-1". For example: $ [[ "0-1" != "2-3" ]] && echo "true" || echo "false" true $ [[ "0-1" -ne "2-3" ]] && echo "true" || echo "false" false Signed-off-by: Hongfu Li Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_v1_base.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh index 42a6628fb8bc..1c0444729e70 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -18,7 +18,7 @@ write_test() { echo "testing $interface $value" echo $value > $dir/$interface new=$(cat $dir/$interface) - [[ $value -ne $(cat $dir/$interface) ]] && { + [[ "$value" != "$new" ]] && { echo "$interface write $value failed: new:$new" exit 1 } From 3788e32516530dee66cf9186f846480a16799b05 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 10 May 2026 19:52:11 +0200 Subject: [PATCH 141/377] selftests/sched_ext: Fix build error in dequeue selftest Building the dequeue selftest with newer compilers (e.g., gcc 16) triggers the following error: dequeue.c:28:22: error: variable 'sum' set but not used The 'volatile' qualifier prevents the writes from being optimized away, but does not silence the unused variable 'sum' is indeed only written and never read. Consume 'sum' via an empty asm() with a register input constraint. This forces the compiler to keep the accumulated value (preserving the CPU stress loop) and avoiding the build error. Fixes: 658ad2259b3e ("selftests/sched_ext: Add test to validate ops.dequeue() semantics") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dequeue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c index 4e93262703ca..383d06e972a4 100644 --- a/tools/testing/selftests/sched_ext/dequeue.c +++ b/tools/testing/selftests/sched_ext/dequeue.c @@ -33,6 +33,7 @@ static void worker_fn(int id) /* Do some work to trigger scheduling events */ for (j = 0; j < 10000; j++) sum += j; + asm volatile("" : : "r"(sum)); /* Sleep to trigger dequeue */ usleep(1000 + (id * 100)); From bbf30b383cf6e87f2fe57c292fbd640b1d88b4c3 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 08:18:12 +0200 Subject: [PATCH 142/377] sched_ext: Fix ops->priv clobber on concurrent attach/detach Under heavy concurrent attach/detach operations, scx_claim_exit() can trigger a NULL pointer dereference. This can be reproduced running the reload_loop kselftests inside a virtme-ng session: $ vng -v -- ./tools/testing/selftests/sched_ext/runner -t reload_loop ... BUG: kernel NULL pointer dereference, address: 0000000000000400 RIP: 0010:scx_claim_exit+0x3b/0x120 Call Trace: bpf_scx_unreg+0x45/0xb0 bpf_struct_ops_map_link_dealloc+0x39/0x50 bpf_link_release+0x18/0x20 __fput+0x10b/0x2e0 __x64_sys_close+0x47/0xa0 The underlying race (diagnosed by Tejun Heo) is a stomp of @ops->priv, not a missing NULL check: T2 unreg(K) T1 reg(K) ----------- --------- sch = ops->priv = sch_b800 scx_disable; flush_disable_work [scx_root_disable: scx_root=NULL, mutex_unlock, state=DISABLED] mutex_lock; state ok scx_alloc_and_add_sched: ops->priv = sch_a800 scx_root = sch_a800; init=0 state=ENABLED; mutex_unlock [flush returns] RCU_INIT_POINTER(ops->priv, NULL) <-- clobbers sch_a800 kobject_put(sch_b800) T1 acquires scx_enable_mutex inside scx_root_disable()'s mutex_unlock window and starts a fresh attach on the same kdata, assigning sch_a800 to @ops->priv. T2 then continues out of scx_disable()/flush_disable_work and clobbers @ops->priv to NULL, leaking sch_a800; the bpf_link is gone but state stays SCX_ENABLED, so all future attaches fail with -EBUSY permanently. The next bpf_scx_unreg() on that kdata then reads NULL @ops->priv and dereferences it in scx_claim_exit(). Make @ops->priv the lifecycle binding: in scx_root_enable_workfn() and scx_sub_enable_workfn(), after the existing state check and still under scx_enable_mutex, refuse with -EBUSY if @ops->priv is non-NULL. This rejects an attempt to reuse a kdata that is still bound to a previous scheduler instance, closing the race without changing the unreg side. Fixes: 105dcd005be2 ("sched_ext: Introduce scx_prog_sched()") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4efe0099f79a..8e06694094d7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6803,6 +6803,19 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_unlock; } + /* + * @ops->priv binds @ops to its scx_sched instance. It is set here by + * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), + * which runs after scx_root_disable() has dropped scx_enable_mutex. If + * it's still non-NULL here, a previous attachment on @ops has not + * finished tearing down; proceeding would let the in-flight unreg's + * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. + */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto err_unlock; + } + ret = alloc_kick_syncs(); if (ret) goto err_unlock; @@ -7120,6 +7133,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto out_unlock; } + /* See scx_root_enable_workfn() for the @ops->priv check. */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto out_unlock; + } + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); From 8dfd3d8d74435344ee8dc9237596959c8b2a6cbe Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Fri, 10 Apr 2026 14:55:50 +0200 Subject: [PATCH 143/377] iommu/amd: Remove latent out-of-bounds access in IOMMU debugfs In iommu_mmio_write() and iommu_capability_write(), the variables dbg_mmio_offset and dbg_cap_offset are declared as int. However, they are populated using kstrtou32_from_user(). If a user provides a sufficiently large value, it can become a negative integer. Prior to this patch, the AMD IOMMU debugfs implementation was already protected by different mechanisms. 1. #define OFS_IN_SZ 8 ensures the user string <= 8 bytes, so e.g. 0xffffffff isn't a valid input. if (cnt > OFS_IN_SZ) return -EINVAL; 2. Implicit type promotion in iommu_mmio_write(), dbg_mmio_offset is int and iommu->mmio_phys_end is u64 if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; 3. The show handlers would currently catch the negative number and refuse to perform the read. Replace kstrtou32_from_user() with kstrtos32_from_user() to parse the input, and check for negative values to explicitly prevent out-of-bounds memory accesses directly in iommu_mmio_write() and iommu_capability_write(). Signed-off-by: Eder Zulian Fixes: 7a4ee419e8c1 ("iommu/amd: Add debugfs support to dump IOMMU MMIO registers") Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/debugfs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 4e66473d7cea..4c53b6361314 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -31,11 +31,12 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); if (ret) return ret; - if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) + if (dbg_mmio_offset < 0 || dbg_mmio_offset > + iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; iommu->dbg_mmio_offset = dbg_mmio_offset; @@ -71,12 +72,12 @@ static ssize_t iommu_capability_write(struct file *filp, const char __user *ubuf if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_cap_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_cap_offset); if (ret) return ret; /* Capability register at offset 0x14 is the last IOMMU capability register. */ - if (dbg_cap_offset > 0x14) + if (dbg_cap_offset < 0 || dbg_cap_offset > 0x14) return -EINVAL; iommu->dbg_cap_offset = dbg_cap_offset; From 07d0f496fe7ec5abe3bee7e38be709521567bb33 Mon Sep 17 00:00:00 2001 From: "Jose Fernandez (Anthropic)" Date: Tue, 21 Apr 2026 19:26:13 +0000 Subject: [PATCH 144/377] iommu/amd: Bounds-check devid in __rlookup_amd_iommu() iommu_device_register() walks every device on the PCI bus via bus_for_each_dev() and calls amd_iommu_probe_device() for each. The inlined check_device() path computes the device's sbdf, calls rlookup_amd_iommu() to find the owning IOMMU, and only afterwards verifies devid <= pci_seg->last_bdf. __rlookup_amd_iommu() indexes rlookup_table[devid] with no bounds check of its own, so for a PCI device whose BDF is not described by the IVRS, the lookup reads past the end of the allocation before the caller's bounds check can run. This was harmless before commit e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()"): the table was a zeroed page-order allocation, so the over-read returned NULL and the caller's NULL check skipped the device. After that commit the table is a tight kvcalloc() and the over-read returns adjacent slab contents, which check_device() then dereferences as a struct amd_iommu *, causing a boot-time GPF. Seen on Google Compute Engine ct6e VMs, where the virtualized IVRS describes only the four TPU endpoints 00:04.0-07.0; the gVNIC at 00:08.0 (devid 0x40) indexes 56 bytes past the 456-byte allocation, into the adjacent kmalloc-512 slab object: pci 0000:00:04.0: Adding to iommu group 0 pci 0000:00:05.0: Adding to iommu group 1 pci 0000:00:06.0: Adding to iommu group 2 pci 0000:00:07.0: Adding to iommu group 3 Oops: general protection fault, probably for non-canonical address 0x3a64695f78746382: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.22 #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/06/2025 RIP: 0010:amd_iommu_probe_device+0x54/0x3a0 Call Trace: __iommu_probe_device+0x107/0x520 probe_iommu_group+0x29/0x50 bus_for_each_dev+0x7e/0xe0 iommu_device_register+0xc9/0x240 iommu_go_to_state+0x9c0/0x1c60 amd_iommu_init+0x14/0x40 pci_iommu_init+0x16/0x60 do_one_initcall+0x47/0x2f0 Guard the array access in __rlookup_amd_iommu(). With the fix applied on 6.18.22, the gVNIC at 00:08.0 is skipped cleanly and the VM boots. Fixes: e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()") Cc: stable@vger.kernel.org Reported-by: Ziyuan Chen Tested-by: Ziyuan Chen Reviewed-by: Josef Bacik Assisted-by: Claude:unspecified Signed-off-by: Jose Fernandez (Anthropic) Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f78e23f03938..57dc8fabc7d9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -351,8 +351,12 @@ static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) struct amd_iommu_pci_seg *pci_seg; for_each_pci_segment(pci_seg) { - if (pci_seg->id == seg) - return pci_seg->rlookup_table[devid]; + if (pci_seg->id != seg) + continue; + /* IVRS may not describe every device on the bus */ + if (devid > pci_seg->last_bdf) + return NULL; + return pci_seg->rlookup_table[devid]; } return NULL; } From d769711fcddd005f1e654b3bde547140917fe696 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:20 -0700 Subject: [PATCH 145/377] iommu: Fix NULL group->domain dereference in pci_dev_reset_iommu_done() Local sashiko review pointed it out that group->domain could be NULL when a default domain fails to allocate during the first probe, which can crash at domain->ops->attach_dev dereference in __iommu_attach_device() invoked by pci_dev_reset_iommu_done(). pci_dev_reset_iommu_prepare() is fine as an old_domain pointer can be NULL. Skip the re-attach in pci_dev_reset_iommu_done() to fix the bug. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 61c12ba78206..b8847cc43e76 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4073,8 +4073,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; - /* Re-attach RID domain back to group->domain */ - if (group->domain != group->blocking_domain) { + /* + * Re-attach RID domain back to group->domain + * + * Leave the device parked in the blocking_domain if group->domain isn't + * initialized yet + */ + if (group->domain && group->domain != group->blocking_domain) { WARN_ON(__iommu_attach_device(group->domain, &pdev->dev, group->blocking_domain)); } From 834ab85aa96656f87bb215a8285d34af870f4b66 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:21 -0700 Subject: [PATCH 146/377] iommu: Fix kdocs of pci_dev_reset_iommu_done() Remove the duplicated word. No functional change. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b8847cc43e76..66659e5d1ec2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4045,9 +4045,9 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); * @pdev: PCI device that has finished a reset routine * * After a PCIe device finishes a reset routine, it wants to restore its IOMMU - * IOMMU activity, including new translation as well as cache invalidation, by - * re-attaching all RID/PASID of the device's back to the domains retained in - * the core-level structure. + * activity, including new translation and cache invalidation, by re-attaching + * all RID/PASID of the device back to the domains retained in the core-level + * structure. * * Caller must pair it with a successful pci_dev_reset_iommu_prepare(). * From b296ca1fb43aa435edd86131f5230f70c03b2829 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:22 -0700 Subject: [PATCH 147/377] iommu: Replace per-group resetting_domain with per-gdev blocked flag The core tracks device resetting states with a per-group resetting_domain, while a reset is actually per group-device. Such a mismatch might lead to confusion and even difficulty to untangle per-gdev handling requirement. Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). And the solution requires the core to track at the group_device level as well. Introduce a 'blocked' flag to struct group_device, to allow a multi-device group to isolate concurrent device resets independently. As the reset routine is per gdev, it cannot clear group->resetting_domain without iterating over the device list to ensure no other device is being reset. Simplify it by replacing the resetting_domain with a 'recovery_cnt' in the struct iommu_group. No functional change. But this is essential to apply following bug fixes. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 102 ++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 66659e5d1ec2..2515786d4156 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -62,14 +62,14 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; - /* - * During a group device reset, @resetting_domain points to the physical - * domain, while @domain points to the attached domain before the reset. - */ - struct iommu_domain *resetting_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; + /* + * Number of devices in the group undergoing or awaiting recovery. + * If non-zero, concurrent domain attachments are rejected. + */ + unsigned int recovery_cnt; void *owner; }; @@ -77,12 +77,32 @@ struct group_device { struct list_head list; struct device *dev; char *name; + /* + * Device is blocked for a pending recovery while its group->domain is + * retained. This can happen when: + * - Device is undergoing a reset + */ + bool blocked; }; /* Iterate over each struct group_device in a struct iommu_group */ #define for_each_group_device(group, pos) \ list_for_each_entry(pos, &(group)->devices, list) +static struct group_device *__dev_to_gdev(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; + + lockdep_assert_held(&group->mutex); + + for_each_group_device(group, gdev) { + if (gdev->dev == dev) + return gdev; + } + return NULL; +} + struct iommu_group_attribute { struct attribute attr; ssize_t (*show)(struct iommu_group *group, char *buf); @@ -2196,6 +2216,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { + struct group_device *gdev; + /* * This is called on the dma mapping fast path so avoid locking. This is * racy, but we have an expectation that the driver will setup its DMAs @@ -2206,14 +2228,18 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) guard(mutex)(&dev->iommu_group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. * * Note that this might fail the iommu_dma_map(). But there's nothing * more we can do here. */ - if (dev->iommu_group->resetting_domain) + if (gdev->blocked) return -EBUSY; return __iommu_attach_device(domain, dev, NULL); } @@ -2270,19 +2296,24 @@ EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) { struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; lockdep_assert_held(&group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return NULL; + /* * Driver handles the low-level __iommu_attach_device(), including the * one invoked by pci_dev_reset_iommu_done() re-attaching the device to * the cached group->domain. In this case, the driver must get the old - * domain from group->resetting_domain rather than group->domain. This + * domain from group->blocking_domain rather than group->domain. This * prevents it from re-attaching the device from group->domain (old) to * group->domain (new). */ - if (group->resetting_domain) - return group->resetting_domain; + if (gdev->blocked) + return group->blocking_domain; return group->domain; } @@ -2441,10 +2472,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, return -EINVAL; /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) + if (group->recovery_cnt) return -EBUSY; /* @@ -3615,10 +3646,10 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3708,10 +3739,10 @@ int iommu_replace_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3982,12 +4013,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); * routine wants to block any IOMMU activity: translation and ATS invalidation. * * This function attaches the device's RID/PASID(s) the group->blocking_domain, - * setting the group->resetting_domain. This allows the IOMMU driver pausing any + * incrementing the group->recovery_cnt, to allow the IOMMU driver pausing any * IOMMU activity while leaving the group->domain pointer intact. Later when the * reset is finished, pci_dev_reset_iommu_done() can restore everything. * * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done() - * before/after the core-level reset routine, to unset the resetting_domain. + * before/after the core-level reset routine, to decrement the recovery_cnt. * * Return: 0 on success or negative error code if the preparation failed. * @@ -4000,6 +4031,7 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; int ret; @@ -4009,8 +4041,12 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) guard(mutex)(&group->mutex); + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* Re-entry is not allowed */ - if (WARN_ON(group->resetting_domain)) + if (WARN_ON(gdev->blocked)) return -EBUSY; ret = __iommu_group_alloc_blocking_domain(group); @@ -4025,6 +4061,13 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) return ret; } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = true; + /* * Stage PASID domains at blocking_domain while retaining pasid_array. * @@ -4035,7 +4078,7 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_array_entry_to_domain(entry)); - group->resetting_domain = group->blocking_domain; + group->recovery_cnt++; return ret; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); @@ -4057,6 +4100,7 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); void pci_dev_reset_iommu_done(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; @@ -4065,11 +4109,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) guard(mutex)(&group->mutex); - /* pci_dev_reset_iommu_prepare() was bypassed for the device */ - if (!group->resetting_domain) + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return; + + if (!gdev->blocked) return; - /* pci_dev_reset_iommu_prepare() was not successfully called */ if (WARN_ON(!group->blocking_domain)) return; @@ -4084,6 +4130,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) group->blocking_domain)); } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = false; + /* * Re-attach PASID domains back to the domains retained in pasid_array. * @@ -4095,7 +4148,8 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) pasid_array_entry_to_domain(entry), group, pasid, group->blocking_domain)); - group->resetting_domain = NULL; + if (!WARN_ON(group->recovery_cnt == 0)) + group->recovery_cnt--; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done); From 1615e8896a8f6d7b2adf6495e538a81bf6cea3e0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:23 -0700 Subject: [PATCH 148/377] iommu: Fix pasid attach in pci_dev_reset_iommu_prepare/done() Now the helpers handle per-gdev resets. Replace __iommu_set_group_pasid() with set_dev_pasid() accordingly, in the pci_dev_reset_iommu_done(). Also add max_pasids check as other callers. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/ad858513-09fc-455e-bbc5-fe38a225cc78@linux.alibaba.com/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2515786d4156..221be84db5ad 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4074,9 +4074,14 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - iommu_remove_dev_pasid(&pdev->dev, pasid, - pasid_array_entry_to_domain(entry)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_dom); + } + } group->recovery_cnt++; return ret; @@ -4143,10 +4148,16 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - WARN_ON(__iommu_set_group_pasid( - pasid_array_entry_to_domain(entry), group, pasid, - group->blocking_domain)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + WARN_ON(pasid_dom->ops->set_dev_pasid( + pasid_dom, &pdev->dev, pasid, + group->blocking_domain)); + } + } if (!WARN_ON(group->recovery_cnt == 0)) group->recovery_cnt--; From 0d5fd7a9323ce6bedd170e21e1e90b8904917c75 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:24 -0700 Subject: [PATCH 149/377] iommu: Fix nested pci_dev_reset_iommu_prepare/done() Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). As pci_dev_reset_iommu_prepare() doesn't support re-entry, the inner call will trigger a WARN_ON and return -EBUSY, resulting in failing the entire device reset. On the other hand, removing the outer calls in the PCI callers is unsafe. As pointed out by Kevin, device-specific quirks like reset_hinic_vf_dev() execute custom firmware waits after their inner pcie_flr() completes. If the IOMMU protection relies solely on the inner reset, the IOMMU will be unblocked prematurely while the device is still resetting. Instead, fix this by making pci_dev_reset_iommu_prepare/done() reentrant. Introduce gdev->reset_depth to handle the re-entries on the same device. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Suggested-by: Kevin Tian Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 221be84db5ad..301c76c40e3d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -83,6 +83,7 @@ struct group_device { * - Device is undergoing a reset */ bool blocked; + unsigned int reset_depth; }; /* Iterate over each struct group_device in a struct iommu_group */ @@ -4045,20 +4046,23 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) if (WARN_ON(!gdev)) return -ENODEV; - /* Re-entry is not allowed */ - if (WARN_ON(gdev->blocked)) - return -EBUSY; + if (gdev->reset_depth++) + return 0; ret = __iommu_group_alloc_blocking_domain(group); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } /* Stage RID domain at blocking_domain while retaining group->domain */ if (group->domain != group->blocking_domain) { ret = __iommu_attach_device(group->blocking_domain, &pdev->dev, group->domain); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } } /* @@ -4118,7 +4122,10 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!gdev)) return; - if (!gdev->blocked) + /* Unbalanced done() calls would underflow the counter */ + if (WARN_ON(gdev->reset_depth == 0)) + return; + if (--gdev->reset_depth) return; if (WARN_ON(!group->blocking_domain)) From fc3523b16d2b4b88e61e69504b0ae0b18b869c8f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:25 -0700 Subject: [PATCH 150/377] iommu: Fix ATS invalidation timeouts during __iommu_remove_group_pasid() If a device is blocked, its PASID domains are already detached. Repeating iommu_remove_dev_pasid() is unnecessary and might trigger ATS invalidation timeouts. Skip the iommu_remove_dev_pasid() call upon gdev->blocked. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 301c76c40e3d..a5e3e90883f8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3602,7 +3602,12 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, struct group_device *device; for_each_group_device(group, device) { - if (device->dev->iommu->max_pasids > 0) + /* + * A group-level detach cannot fail, even if there is a blocked + * device. In fact, blocked devices must be already detached for + * a pending device recovery. + */ + if (!device->blocked && device->dev->iommu->max_pasids > 0) iommu_remove_dev_pasid(device->dev, pasid, domain); } } From 5474e6e17a262db45c60575c73f70210f5c7001f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:26 -0700 Subject: [PATCH 151/377] iommu: Fix WARN_ON in __iommu_group_set_domain_nofail() due to reset In __iommu_group_set_domain_internal(), concurrent domain attachments are rejected when any device in the group is recovering. This is necessary to fence concurrent attachments to a multi-device group where devices might share the same RID due to PCI DMA alias quirks, but triggers the WARN_ON in __iommu_group_set_domain_nofail(). Other IOMMU_SET_DOMAIN_MUST_SUCCEED callers in detach/teardown paths, such as __iommu_group_set_core_domain and __iommu_release_dma_ownership, should not be rejected, as the domain would be freed anyway in these nofail paths while group->domain is still pointing to it. So pci_dev_reset_iommu_done() could trigger a UAF when re-attaching group->domain. Honor the IOMMU_SET_DOMAIN_MUST_SUCCEED flag, allowing the callers through the group->recovery_cnt fence, so as to update the group->domain pointer. Instead add a gdev->blocked check in the device iteration loop, to prevent any concurrent per-device detachment. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a5e3e90883f8..845284a9eeaf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2474,9 +2474,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, /* * This is a concurrent attach during device recovery. Reject it until - * pci_dev_reset_iommu_done() attaches the device to group->domain. + * pci_dev_reset_iommu_done() attaches the device to group->domain, if + * IOMMU_SET_DOMAIN_MUST_SUCCEED is not set. */ - if (group->recovery_cnt) + if (group->recovery_cnt && !(flags & IOMMU_SET_DOMAIN_MUST_SUCCEED)) return -EBUSY; /* @@ -2487,6 +2488,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, */ result = 0; for_each_group_device(group, gdev) { + /* + * Device under recovery is attached to group->blocking_domain. + * Don't change that. pci_dev_reset_iommu_done() will re-attach + * its domain to the updated group->domain, after the recovery. + */ + if (gdev->blocked) + continue; ret = __iommu_device_set_domain(group, gdev->dev, new_domain, group->domain, flags); if (ret) { From 15dd29ca620648a18a0334a3f6b26af181154a23 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:27 -0700 Subject: [PATCH 152/377] iommu: Warn on premature unblock during DMA aliased sibling reset When two aliased siblings are in the same iommu_group, they might share the same RID. The reset functions don't support this case, though it is unclear whether there is a real case of having an ATS capable device on a PCI/PCI-X bus. Theoretically, however, if two aliased devices are resetting concurrently, one might be unblocked prematurely in the middle of the reset by the other sibling who completes the reset first. This isn't a regression from this series but it's better to spit a warning, so we can know if such use case is common enough for us to make subsequent patches for its coverage. Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 845284a9eeaf..e7bd28cc77ee 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4105,6 +4105,41 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); +static int __group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return alias == *(u16 *)data; +} + +static int group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return pci_for_each_dma_alias(data, __group_device_cmp_dma_alias, + &alias); +} + +static bool group_device_dma_alias_is_blocked(struct iommu_group *group, + struct group_device *gdev) +{ + struct group_device *sibling; + + lockdep_assert_held(&group->mutex); + + if (!dev_is_pci(gdev->dev)) + return false; + + for_each_group_device(group, sibling) { + if (sibling == gdev || !sibling->blocked || + !dev_is_pci(sibling->dev)) + continue; + if (pci_for_each_dma_alias(to_pci_dev(gdev->dev), + group_device_cmp_dma_alias, + to_pci_dev(sibling->dev))) + return true; + } + return false; +} + /** * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done * @pdev: PCI device that has finished a reset routine @@ -4144,6 +4179,20 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; + if (group_device_dma_alias_is_blocked(group, gdev)) { + /* + * FIXME: DMA aliased devices share the same RID, which would be + * convoluted to handle, as "gdev->blocked" is not sufficient: + * - "blocked" state is effectively shared across these devices + * - if the core skipped the blocking on the second device, the + * IOMMU driver's attachment state would diverge from the HW + * state + * For now, just warn and see whether real ATS use cases hit it. + */ + pci_warn(pdev, + "DMA-aliased sibling may be prematurely unblocked\n"); + } + /* * Re-attach RID domain back to group->domain * From 4a39eda5fdd867fc39f3c039714dd432cee00268 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:30 +0800 Subject: [PATCH 153/377] cgroup/cpuset: Reset DL migration state on can_attach() failure cpuset_can_attach() accumulates temporary SCHED_DEADLINE migration state in the destination cpuset while walking the taskset. If a later task_can_attach() or security_task_setscheduler() check fails, cgroup_migrate_execute() treats cpuset as the failing subsystem and does not call cpuset_cancel_attach() for it. The partially accumulated state is then left behind and can be consumed by a later attach, corrupting cpuset DL task accounting and pending DL bandwidth accounting. Reset the pending DL migration state from the common error exit when ret is non-zero. Successful can_attach() keeps the state for cpuset_attach() or cpuset_cancel_attach(). Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Signed-off-by: Tejun Heo Reviewed-by: Chen Ridong Reviewed-by: Waiman Long --- kernel/cgroup/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a48901a0416a..3fbf6e7f68c3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3050,16 +3050,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); if (unlikely(cpu >= nr_cpu_ids)) { - reset_migrate_dl_data(cs); ret = -EINVAL; goto out_unlock; } ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) { - reset_migrate_dl_data(cs); + if (ret) goto out_unlock; - } cs->dl_bw_cpu = cpu; } @@ -3070,7 +3067,10 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; + out_unlock: + if (ret) + reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); return ret; } From 2cda2e10dc8343ae01eae9e999a876b7e7d37861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naval=20Alcal=C3=A1?= Date: Sat, 9 May 2026 10:43:44 +0800 Subject: [PATCH 154/377] iommu/vt-d: Disable DMAR for Intel Q35 IGFX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel Q35 integrated graphics (8086:29b2) exhibits broken DMAR behaviour similar to other G4x/GM45 devices for which DMAR is already disabled via quirks. When DMAR is enabled, the system may hard lock up during boot or early device initialization, requiring a reset. Add the missing PCI ID to the existing quirk list to disable DMAR for this device. Fixes: 1f76249cc3be ("iommu/vt-d: Declare Broadwell igfx dmar support snafu") Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=201185 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=216064 Signed-off-by: Naval Alcalá Link: https://lore.kernel.org/r/20260410161622.13549-1-ari@naval.cat Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c3d18cd77d2f..2a6b6813a78d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3937,6 +3937,9 @@ static void quirk_iommu_igfx(struct pci_dev *dev) disable_igfx_iommu = 1; } +/* Q35 integrated gfx dmar support is totally busted. */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x29b2, quirk_iommu_igfx); + /* G4x/GM45 integrated gfx dmar support is totally busted. */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); From a6dea58d8625c06b9654c0555f101742481335c3 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:45 +0800 Subject: [PATCH 155/377] iommu/vt-d: Fix oops due to out of scope access Below oops triggers when kill QEMU process: Oops: general protection fault, probably for non-canonical address 0x7fffffff844eaaa7: 0000 [#1] SMP NOPTI Call Trace: do_raw_spin_lock+0xaa/0xc0 _raw_spin_lock_irqsave+0x21/0x40 domain_remove_dev_pasid+0x52/0x160 intel_nested_set_dev_pasid+0x1b9/0x1e0 __iommu_set_group_pasid+0x56/0x120 pci_dev_reset_iommu_done+0xe3/0x180 pcie_flr+0x65/0x160 __pci_reset_function_locked+0x5b/0x120 vfio_pci_core_close_device+0x63/0xe0 [vfio_pci_core] vfio_df_close+0x4f/0xa0 vfio_df_unbind_iommufd+0x2d/0x60 vfio_device_fops_release+0x3e/0x40 __fput+0xe5/0x2c0 task_work_run+0x58/0xa0 do_exit+0x2c8/0x600 do_group_exit+0x2f/0xa0 get_signal+0x863/0x8c0 arch_do_signal_or_restart+0x24/0x100 exit_to_user_mode_loop+0x87/0x380 do_syscall_64+0x2ff/0x11e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The global static blocked domain is a dummy domain without corresponding dmar_domain structure, accessing beyond iommu_domain structure triggers oops easily. Fix it by return early in domain_remove_dev_pasid() like identity domain. Fixes: 7d0c9da6c150 ("iommu/vt-d: Add set_dev_pasid callback for dma domain") Cc: stable@vger.kernel.org Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260421031347.1408890-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2a6b6813a78d..a4b123c33022 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3530,8 +3530,8 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, if (!domain) return; - /* Identity domain has no meta data for pasid. */ - if (domain->type == IOMMU_DOMAIN_IDENTITY) + /* Identity domain and blocked domain have no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY || domain->type == IOMMU_DOMAIN_BLOCKED) return; dmar_domain = to_dmar_domain(domain); From 79ea2feb917b05366b49d85573c9c5331f043b2c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:46 +0800 Subject: [PATCH 156/377] iommu/vt-d: Avoid NULL pointer dereference or refcount corruption Commit 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") fixed a NULL pointer dereference in an unlikely situation partly. If dev_pasid is not found in the dev_pasids list, it remains NULL. However, the teardown operations are executed unconditionally, this lead to a NULL pointer dereference or refcount corruption. If the domain was never attached to this IOMMU, info will be NULL, which would cause an immediate dereference when checking --info->refcnt. Even if info is not NULL, decrementing the refcount without having removed a valid PASID might unbalance the count. This could lead to premature dropping of the refcount to 0, potentially causing a use-after-free for the remaining active devices sharing the domain. Fix it by returning early if dev_pasid is NULL, before executing the teardown operations. Issue found by AI review and suggested by Kevin Tian. https://sashiko.dev/#/patchset/20260421031347.1408890-1-zhenzhong.duan%40intel.com Fixes: 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260422033538.95000-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a4b123c33022..4d0e65bc131d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3545,12 +3545,13 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, } spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (WARN_ON_ONCE(!dev_pasid)) + return; + cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - if (!WARN_ON_ONCE(!dev_pasid)) { - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); - kfree(dev_pasid); - } + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, From 4c79fc2d598694bda845b46229c9d48b65042970 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Wed, 22 Apr 2026 10:47:13 +0200 Subject: [PATCH 157/377] libceph: Fix potential out-of-bounds access in crush_decode() A message of type CEPH_MSG_OSD_MAP containing a crush map with at least one bucket has two fields holding the bucket algorithm. If the values in these two fields differ, an out-of-bounds access can occur. This is the case because the first algorithm field (alg) is used to allocate the correct amount of memory for a bucket of this type, while the second algorithm field inside the bucket (b->alg) is used in the subsequent processing. This patch fixes the issue by adding a check that compares alg and b->alg and aborts the processing in case they differ. Furthermore, b->alg is set to 0 in this case, because the destruction of the crush map also uses this field to determine the bucket type, which can again result in an out-of-bounds access when trying to free the memory pointed to by the fields of the bucket. To correctly free the memory allocated for the bucket in such a case, the corresponding call to kfree is moved from the algorithm-specific crush_destroy_bucket functions to the generic crush_destroy_bucket(). Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crush/crush.c | 6 +----- net/ceph/osdmap.c | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 254ded0b05f6..521aec1d5fc0 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -47,7 +47,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_list(struct crush_bucket_list *b) @@ -55,14 +54,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) kfree(b->item_weights); kfree(b->sum_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { kfree(b->h.items); kfree(b->node_weights); - kfree(b); } void crush_destroy_bucket_straw(struct crush_bucket_straw *b) @@ -70,14 +67,12 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b->straws); kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket(struct crush_bucket *b) @@ -99,6 +94,7 @@ void crush_destroy_bucket(struct crush_bucket *b) crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); break; } + kfree(b); } /** diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c89e66d4fcb7..a87268058e61 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -516,6 +516,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); + if (b->alg != alg) { + b->alg = 0; + goto bad; + } b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); From 596f91294b351866956808b1ecb8dfae15382a6d Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Fri, 24 Apr 2026 15:37:37 +0200 Subject: [PATCH 158/377] libceph: Fix unnecessarily high ceph_decode_need() for uniform bucket In crush_decode_uniform_bucket(), the item_weight field of the bucket is set. This is a single field of type u32 since the uniform bucket uses the same weight for all items. The value in ceph_decode_need() is set to (1+b->h.size) * sizeof(u32), which is higher than actually needed. This patch removes the call to ceph_decode_need() with the unnecessarily high value and switches the subsequent operation from ceph_decode_32() to ceph_decode_32_safe(), which already includes the correct bounds check. Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a87268058e61..669348d883f0 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -72,8 +72,7 @@ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); - ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - b->item_weight = ceph_decode_32(p); + ceph_decode_32_safe(p, end, b->item_weight, bad); return 0; bad: return -EINVAL; From 5d3cc36b4e77a27ce7b686b7c59c7072bcb3fa8e Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 12:26:02 -0700 Subject: [PATCH 159/377] ceph: fix a buffer leak in __ceph_setxattr() The old_blob in __ceph_setxattr() can store ci->i_xattrs.prealloc_blob value during the retry. However, it is never called the ceph_buffer_put() for the old_blob object. This patch fixes the issue of the buffer leak. Cc: stable@vger.kernel.org Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5f87f62091a1..c6fcbf428317 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1294,6 +1294,7 @@ int __ceph_setxattr(struct inode *inode, const char *name, do_sync: spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); do_sync_unlocked: if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); From 0c22d9511cbde746622f8e4c11aaa63fe76d45f9 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 12:43:40 -0700 Subject: [PATCH 160/377] ceph: fix BUG_ON in __ceph_build_xattrs_blob() due to stale blob size The generic/642 test-case can reproduce the kernel crash: [40243.605254] ------------[ cut here ]------------ [40243.605956] kernel BUG at fs/ceph/xattr.c:918! [40243.607142] Oops: invalid opcode: 0000 [#1] SMP PTI [40243.608067] CPU: 7 UID: 0 PID: 498762 Comm: kworker/7:1 Not tainted 7.0.0-rc7+ #3 PREEMPT(full) [40243.609700] Hardware name: QEMU Ubuntu 25.10 PC v2 (i440FX + PIIX, + 10.1 machine, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [40243.611820] Workqueue: ceph-msgr ceph_con_workfn [40243.612715] RIP: 0010:__ceph_build_xattrs_blob+0x1b8/0x1e0 [40243.613731] Code: 0f 84 82 fe ff ff e9 cf 8e 56 ff 48 8d 65 e8 31 c0 5b 41 5c 41 5d 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 c3 cc cc cc cc <0f> 0b 4c 8b 62 08 41 8b 85 24 07 00 00 49 83 c4 04 41 89 44 24 fc [40243.616888] RSP: 0018:ffffcc80c4d4b688 EFLAGS: 00010287 [40243.617773] RAX: 0000000000010026 RBX: 0000000000000001 RCX: 0000000000000000 [40243.618928] RDX: ffff8a773798dee0 RSI: 0000000000000000 RDI: 0000000000000000 [40243.620158] RBP: ffffcc80c4d4b6a0 R08: 0000000000000000 R09: 0000000000000000 [40243.621573] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a75f3b58000 [40243.622907] R13: ffff8a75f3b58000 R14: 0000000000000080 R15: 000000000000bffd [40243.624054] FS: 0000000000000000(0000) GS:ffff8a787d1b4000(0000) knlGS:0000000000000000 [40243.625331] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [40243.626269] CR2: 000072f390b623c0 CR3: 000000011c02a003 CR4: 0000000000372ef0 [40243.627408] Call Trace: [40243.627839] [40243.628188] __prep_cap+0x3fd/0x4a0 [40243.628789] ? do_raw_spin_unlock+0x4e/0xe0 [40243.629474] ceph_check_caps+0x46a/0xc80 [40243.630094] ? __lock_acquire+0x4a2/0x2650 [40243.630773] ? find_held_lock+0x31/0x90 [40243.631347] ? handle_cap_grant+0x79f/0x1060 [40243.632068] ? lock_release+0xd9/0x300 [40243.632696] ? __mutex_unlock_slowpath+0x3e/0x340 [40243.633429] ? lock_release+0xd9/0x300 [40243.634052] handle_cap_grant+0xcf6/0x1060 [40243.634745] ceph_handle_caps+0x122b/0x2110 [40243.635415] mds_dispatch+0x5bd/0x2160 [40243.636034] ? ceph_con_process_message+0x65/0x190 [40243.636828] ? lock_release+0xd9/0x300 [40243.637431] ceph_con_process_message+0x7a/0x190 [40243.638184] ? kfree+0x311/0x4f0 [40243.638749] ? kfree+0x311/0x4f0 [40243.639268] process_message+0x16/0x1a0 [40243.639915] ? sg_free_table+0x39/0x90 [40243.640572] ceph_con_v2_try_read+0xf58/0x2120 [40243.641255] ? lock_acquire+0xc8/0x300 [40243.641863] ceph_con_workfn+0x151/0x820 [40243.642493] process_one_work+0x22f/0x630 [40243.643093] ? process_one_work+0x254/0x630 [40243.643770] worker_thread+0x1e2/0x400 [40243.644332] ? __pfx_worker_thread+0x10/0x10 [40243.645020] kthread+0x109/0x140 [40243.645560] ? __pfx_kthread+0x10/0x10 [40243.646125] ret_from_fork+0x3f8/0x480 [40243.646752] ? __pfx_kthread+0x10/0x10 [40243.647316] ? __pfx_kthread+0x10/0x10 [40243.647919] ret_from_fork_asm+0x1a/0x30 [40243.648556] [40243.648902] Modules linked in: overlay hctr2 libpolyval chacha libchacha adiantum libnh libpoly1305 essiv intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit kvm_intel kvm irqbypass joydev ghash_clmulni_intel aesni_intel rapl input_leds mac_hid psmouse vga16fb serio_raw vgastate floppy i2c_piix4 pata_acpi bochs qemu_fw_cfg i2c_smbus sch_fq_codel rbd dm_crypt msr parport_pc ppdev lp parport efi_pstore [40243.654766] ---[ end trace 0000000000000000 ]--- Commit d93231a6bc8a ("ceph: prevent a client from exceeding the MDS maximum xattr size") moved the required_blob_size computation to before the __build_xattrs() call, introducing a race. __build_xattrs() releases and reacquires i_ceph_lock during execution. In that window, handle_cap_grant() may update i_xattrs.blob with a newer MDS-provided blob and bump i_xattrs.version. When __build_xattrs() detects that index_version < version, it destroys and rebuilds the entire xattr rb-tree from the new blob, potentially increasing count, names_size, and vals_size. The prealloc_blob size check that follows still uses the stale required_blob_size computed before the rebuild, so it passes even when prealloc_blob is too small for the now-larger tree. After __set_xattr() adds one more xattr on top, __ceph_build_xattrs_blob() is called from the cap flush path and hits: BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); Fix this by recomputing required_blob_size after __build_xattrs() returns, using the current tree state. Also re-validate against m_max_xattr_size to fall back to the sync path if the rebuilt tree now exceeds the MDS limit. Cc: stable@vger.kernel.org Fixes: d93231a6bc8a ("ceph: prevent a client from exceeding the MDS maximum xattr size") Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c6fcbf428317..e773be07f767 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1254,6 +1254,22 @@ int __ceph_setxattr(struct inode *inode, const char *name, ceph_vinop(inode), name, ceph_cap_string(issued)); __build_xattrs(inode); + /* + * __build_xattrs() may have released and reacquired i_ceph_lock, + * during which handle_cap_grant() could have replaced i_xattrs.blob + * with a newer MDS-provided blob and bumped i_xattrs.version. If that + * caused __build_xattrs() to rebuild the rb-tree from the new blob, + * count/names_size/vals_size may now be larger than when + * required_blob_size was computed above. Recompute it here so the + * prealloc_blob size check below reflects the current tree state. + */ + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if (required_blob_size > mdsc->mdsmap->m_max_xattr_size) { + doutc(cl, "sync (size too large): %d > %llu\n", + required_blob_size, mdsc->mdsmap->m_max_xattr_size); + goto do_sync; + } + if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; From 821365487aa58d06bda65c676ba215d506ba9768 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 28 Apr 2026 14:15:46 +0200 Subject: [PATCH 161/377] libceph: Fix potential out-of-bounds access in __ceph_x_decrypt() In __ceph_x_decrypt(), a part of the buffer p is interpreted as a ceph_x_encrypt_header, and the magic field of this struct is accessed. This happens without any guarantee that the buffer is large enough to hold this struct. The function parameter ciphertext_len represents the length of the ciphertext to decrypt and is guaranteed to be at most the remaining size of the allocated buffer p. However, this value is not necessarily greater than sizeof(ceph_x_encrypt_header). E.g., a message frame of type FRAME_TAG_AUTH_REPLY_MORE, that is just as long to hold the ciphertext at its end with a ciphertext_len of 8 or less, can trigger an out-of-bounds memory access when accessing hdr->magic. This patch fixes the issue by adding a check to ensure that the decrypted plaintext in the buffer is large enough to represent at least the ceph_x_encrypt_header. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/auth_x.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 692e0b868822..9e64e82d0b63 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -115,6 +115,11 @@ static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, if (ret) return ret; + if (plaintext_len < sizeof(*hdr)) { + pr_err("%s plaintext too small %d\n", __func__, plaintext_len); + return -EINVAL; + } + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); From 10d9be401108a0fc8b3bc99ba07bdee8fff875ac Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 11:33:23 -0700 Subject: [PATCH 162/377] ceph: add ceph_has_realms_with_quotas() check to ceph_quota_update_statfs() When MDS rejects a session, remove_session_caps() -> __ceph_remove_cap() -> ceph_change_snap_realm() clears i_snap_realm for every inode that loses its last cap. The realm is restored once caps are re-granted after reconnect. It is not a real error and this patch changes pr_err_ratelimited_client() on doutc(). Every quota methods ceph_quota_is_max_files_exceeded(), ceph_quota_is_max_bytes_exceeded(), ceph_quota_is_max_bytes_approaching() calls ceph_has_realms_with_quotas() check. This patch adds the missing ceph_has_realms_with_quotas() call into ceph_quota_update_statfs(). [ idryomov: add braces around both arms of multiline ifs ] Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 4dc9426643e8..053d5bf0c9f0 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -228,12 +228,19 @@ static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "no quota realm found". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -340,12 +347,19 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, down_read(&mdsc->snap_rwsem); restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "quota not exceeded". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -496,6 +510,9 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) u64 total = 0, used, free; bool is_updated = false; + if (!ceph_has_realms_with_quotas(d_inode(fsc->sb->s_root))) + return false; + down_read(&mdsc->snap_rwsem); get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, &realm, true); From 544576f0f05c4a759806acddfaaeb686f14fb4b0 Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Mon, 4 May 2026 18:54:45 +0300 Subject: [PATCH 163/377] ceph: put folios not suitable for writeback The batch holds references to the folios (see `filemap_get_folios`, `folio_batch_release`), so we need to `folio_put` the folios we remove. Tested on v6.18. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/74156 Signed-off-by: Hristo Venev Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1454760332ff..0a86f672cc09 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1336,6 +1336,7 @@ void ceph_process_folio_batch(struct address_space *mapping, ceph_wbc, folio); if (rc == -ENODATA) { folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } else if (rc == -E2BIG) { @@ -1346,6 +1347,7 @@ void ceph_process_folio_batch(struct address_space *mapping, if (!folio_clear_dirty_for_io(folio)) { doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } From 86ecb1c1a1f5c1bf4a45b91f54f8220c3121bd3b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 10:31:30 +0200 Subject: [PATCH 164/377] sched_ext: Clear ops->priv on scx_alloc_and_add_sched() error paths scx_alloc_and_add_sched() can fail after @sch has been assigned to ops->priv. In those cases @sch is torn down (either via kfree() through the err_free_* chain or via kobject_put() -> scx_kobj_release() -> RCU work), but @ops->priv is left pointing at the about-to-be-freed pointer. With the recent -EBUSY gate in scx_root_enable_workfn() and scx_sub_enable_workfn() that rejects an attach when @ops->priv is still non-NULL, see commit bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach"), a dangling @ops->priv permanently locks the kdata out: every future attach attempt sees a stale binding and returns -EBUSY even though no scheduler is actually attached. Clear @ops->priv on the post-assign failure paths so that the kdata returns to its pre-attach state when the function returns ERR_PTR(). Fixes: bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8e06694094d7..1efd5d82b08b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6670,6 +6670,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6677,6 +6678,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, if (ops->sub_attach) { sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); if (!sch->sub_kset) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(-ENOMEM); } @@ -6684,6 +6686,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6692,6 +6695,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: + RCU_INIT_POINTER(ops->priv, NULL); free_cpumask_var(sch->bypass_lb_resched_cpumask); #endif err_free_lb_cpumask: From 657b594b2084b39a4bc6d8493aa2140cb00cea49 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 7 May 2026 16:46:29 +0900 Subject: [PATCH 165/377] fprobe: Fix unregister_fprobe() to wait for RCU grace period Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") changed fprobe to register struct fprobe to an rcu-hlist, but it forgot to wait for RCU GP. Thus there can be use-after-free if the fprobe is released right after unregistering. This can be happened on fprobe event and sample module code. To fix this issue, add synchronize_rcu() in unregister_fprobe(). Note that BPF is OK because fprobe is used as a part of bpf_kprobe_multi_link. This unregisters its fprobe in bpf_kprobe_multi_link_release() and it is deallocated via bpf_kprobe_multi_link_dealloc(), which is invoked from bpf_link_defer_dealloc_rcu_gp() RCU callback. For BPF, this also introduced unregister_fprobe_async() which does NOT wait for RCU grace priod. Link: https://lore.kernel.org/all/177813998919.256460.2809243930741138224.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 +++++ kernel/trace/bpf_trace.c | 3 ++- kernel/trace/fprobe.c | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f3..be1b38c981d4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..a02bd258677e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a773..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) From 5082d8835070fe63f3c61fe574cbb5319bd94575 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:24:57 +0200 Subject: [PATCH 166/377] xfs: fix the "limiting open zones" message The xfs logging macros include a newline, remove the \n, which adds an extra one. Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Andrey Albershteyn Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index c64f9ab743a6..5e297b75a85f 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1170,7 +1170,7 @@ xfs_calc_open_zones( if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { mp->m_max_open_zones = bdev_open_zones; - xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + xfs_info(mp, "limiting open zones to %u due to hardware limit.", bdev_open_zones); } From 509fdeb3326be0db055e88d0f689a3888f147f90 Mon Sep 17 00:00:00 2001 From: Md Shofiqul Islam Date: Wed, 6 May 2026 19:36:58 +0300 Subject: [PATCH 167/377] xfs: Fix typo in comment Fix spelling mistake in comment: - occured -> occurred Reviewed-by: Darrick J. Wong Signed-off-by: Md Shofiqul Islam Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_notify_failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 64c8afb935c2..b994ff15d5e4 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -350,7 +350,7 @@ xfs_dax_notify_dev_failure( /* * Shutdown fs from a force umount in pre-remove case which won't fail, * so errors can be ignored. Otherwise, shutdown the filesystem with - * CORRUPT flag if error occured or notify.want_shutdown was set during + * CORRUPT flag if error occurred or notify.want_shutdown was set during * RMAP querying. */ if (mf_flags & MF_MEM_PRE_REMOVE) From 0c3887a134f191723b53e2a47e501b534c8723ee Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:31 +0000 Subject: [PATCH 168/377] platform/x86: lenovo-wmi-helpers: Fix memory leak in lwmi_dev_evaluate_int() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lwmi_dev_evaluate_int() leaks output.pointer when retval == NULL (found by sashiko.dev [1]). Fix it by moving `ret_obj = output.pointer' outside of the `if (retval)' block so that it is always freed by the __free cleanup callback. No functional change intended. Reviewed-by: Mark Pearson Fixes: e521d16e76cd ("platform/x86: Add lenovo-wmi-helpers") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-2-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-helpers.c b/drivers/platform/x86/lenovo/wmi-helpers.c index 7379defac500..018d7642e2bd 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.c +++ b/drivers/platform/x86/lenovo/wmi-helpers.c @@ -46,7 +46,6 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval) { struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *ret_obj __free(kfree) = NULL; struct acpi_buffer input = { size, buf }; acpi_status status; @@ -55,8 +54,9 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, if (ACPI_FAILURE(status)) return -EIO; + union acpi_object *ret_obj __free(kfree) = output.pointer; + if (retval) { - ret_obj = output.pointer; if (!ret_obj) return -ENODATA; From 55a279ae819adaea99a94c609f31970b70e0ec0c Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:32 +0000 Subject: [PATCH 169/377] platform/x86: lenovo-wmi-other: Balance IDA id allocation and free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the IDA id is only freed on wmi-other device removal or failure to create firmware-attributes device, kset, or attributes. It leaks IDA ids if the wmi-other device is bound multiple times, as the unbind callback never frees the previously allocated IDA id. Additionally, if the wmi-other device has failed to create a firmware-attributes device before it gets removed, the wmi-device removal callback double frees the same IDA id. These bugs were found by sashiko.dev [1]. Fix them by moving ida_free() into lwmi_om_fw_attr_remove() so it is balanced with ida_alloc() in lwmi_om_fw_attr_add(). With them fixed, properly set and utilize the validity of priv->ida_id to balance firmware-attributes registration and removal, without relying on propagating the registration error to the component framework, which is more reliable and aligns with the hwmon device registration and removal sequences. No functional change intended. Reviewed-by: Mark Pearson Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-3-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 36 ++++++++++++++----------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 6c2febe1a595..ebef3649d0a7 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -959,17 +959,17 @@ static struct capdata01_attr_group cd01_attr_groups[] = { /** * lwmi_om_fw_attr_add() - Register all firmware_attributes_class members * @priv: The Other Mode driver data. - * - * Return: Either 0, or an error code. */ -static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) +static void lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) { unsigned int i; int err; - priv->ida_id = ida_alloc(&lwmi_om_ida, GFP_KERNEL); - if (priv->ida_id < 0) - return priv->ida_id; + err = ida_alloc(&lwmi_om_ida, GFP_KERNEL); + if (err < 0) + goto err_no_ida; + + priv->ida_id = err; priv->fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), NULL, "%s-%u", @@ -995,7 +995,7 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; } - return 0; + return; err_remove_groups: while (i--) @@ -1009,7 +1009,12 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) err_free_ida: ida_free(&lwmi_om_ida, priv->ida_id); - return err; + +err_no_ida: + priv->ida_id = -EIDRM; + + dev_warn(&priv->wdev->dev, + "failed to register firmware-attributes device: %d\n", err); } /** @@ -1018,12 +1023,17 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) */ static void lwmi_om_fw_attr_remove(struct lwmi_om_priv *priv) { + if (priv->ida_id < 0) + return; + for (unsigned int i = 0; i < ARRAY_SIZE(cd01_attr_groups) - 1; i++) sysfs_remove_group(&priv->fw_attr_kset->kobj, cd01_attr_groups[i].attr_group); kset_unregister(priv->fw_attr_kset); device_unregister(priv->fw_attr_dev); + ida_free(&lwmi_om_ida, priv->ida_id); + priv->ida_id = -EIDRM; } /* ======== Self (master: lenovo-wmi-other) ======== */ @@ -1065,7 +1075,9 @@ static int lwmi_om_master_bind(struct device *dev) lwmi_om_fan_info_collect_cd00(priv); - return lwmi_om_fw_attr_add(priv); + lwmi_om_fw_attr_add(priv); + + return 0; } /** @@ -1117,13 +1129,7 @@ static int lwmi_other_probe(struct wmi_device *wdev, const void *context) static void lwmi_other_remove(struct wmi_device *wdev) { - struct lwmi_om_priv *priv = dev_get_drvdata(&wdev->dev); - component_master_del(&wdev->dev, &lwmi_om_master_ops); - - /* No IDA to free if the driver is never bound to its components. */ - if (priv->ida_id >= 0) - ida_free(&lwmi_om_ida, priv->ida_id); } static const struct wmi_device_id lwmi_other_id_table[] = { From 2fe2504abcfa4f82a4208e8d0c21ec0f22baca43 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:33 +0000 Subject: [PATCH 170/377] platform/x86: lenovo-wmi-other: Balance component bind and unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When lwmi_om_master_bind() fails, the master device's components are left bound, with the aggregate device destroyed due to the failure (found by sashiko.dev [1]). Balance calls to component_bind_all() and component_unbind_all() when an error is propagated to the component framework. No functional change intended. Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-4-derekjohn.clark@gmail.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index ebef3649d0a7..70fcb8406c27 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -1070,8 +1070,11 @@ static int lwmi_om_master_bind(struct device *dev) priv->cd00_list = binder.cd00_list; priv->cd01_list = binder.cd01_list; - if (!priv->cd00_list || !priv->cd01_list) + if (!priv->cd00_list || !priv->cd01_list) { + component_unbind_all(dev, NULL); + return -ENODEV; + } lwmi_om_fan_info_collect_cd00(priv); From 816fbd5dacee977ca56bab79bf97f71f2f7ac24e Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:34 +0000 Subject: [PATCH 171/377] platform/x86: lenovo-wmi-other: Zero initialize WMI arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds explicit initialization of wmi_method_args_32 declarations with zero values to prevent uninitialized data from being sent to the device BIOS when passed. No functional change intended. Reviewed-by: Mark Pearson Fixes: 22024ac5366f ("platform/x86: Add Lenovo Gamezone WMI Driver") Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Reported-by: Rong Zhang Closes: https://lore.kernel.org/platform-driver-x86/95c7e7b539dd0af41189c754fcd35cec5b6fe182.camel@rong.moe/ Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-5-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-gamezone.c | 2 +- drivers/platform/x86/lenovo/wmi-other.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index c7fe7e3c9f17..098239c9311a 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -201,7 +201,7 @@ static int lwmi_gz_profile_set(struct device *dev, enum platform_profile_option profile) { struct lwmi_gz_priv *priv = dev_get_drvdata(dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; enum thermal_mode mode; int ret; diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 70fcb8406c27..c1b429269f89 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -166,7 +166,7 @@ MODULE_PARM_DESC(relax_fan_constraint, */ static int lwmi_om_fan_get_set(struct lwmi_om_priv *priv, int channel, u32 *val, bool set) { - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; u32 method_id, retval; int err; @@ -775,7 +775,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, struct tunable_attr_01 *tunable_attr) { struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; struct capdata01 capdata; enum thermal_mode mode; u32 attribute_id; @@ -838,7 +838,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, struct tunable_attr_01 *tunable_attr) { struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; enum thermal_mode mode; u32 attribute_id; int retval; From 71f3843e0f81e3c097a088c1121154bb9a44da0a Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:35 +0000 Subject: [PATCH 172/377] platform/x86: lenovo-wmi-other: Fix tunable_attr_01 struct members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In struct tunable_attr_01 the capdata pointer is unused and the size of the id members is u32 when it should be u8. Fix these prior to adding additional members. No functional change intended. Reviewed-by: Mark Pearson Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-6-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index c1b429269f89..526075ff52d0 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -548,11 +548,10 @@ static void lwmi_om_fan_info_collect_cd_fan(struct device *dev, struct cd_list * /* ======== fw_attributes (component: lenovo-wmi-capdata 01) ======== */ struct tunable_attr_01 { - struct capdata01 *capdata; struct device *dev; - u32 feature_id; - u32 device_id; - u32 type_id; + u8 feature_id; + u8 device_id; + u8 type_id; }; static struct tunable_attr_01 ppt_pl1_spl = { From e8d5460ad3fd22409f2566ecbe2c82d94aabc246 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:36 +0000 Subject: [PATCH 173/377] platform/x86: lenovo: Decouple lenovo-wmi-gamezone and lenovo-wmi-other MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, lenovo-wmi-gamezone depends on lenovo-wmi-other as the former imports symbols from the latter. The imported symbols are just used to register a notifier block. However, there is no runtime dependency between both drivers, and either of them can run without the other, which is the major purpose of using the notifier framework. Such a link-time dependency is non-optimal. A previous attempt to "fix" it made LENOVO_WMI_GAMEZONE select LENOVO_WMI_TUNING, which was fundamentally broken and resulted in undefined Kconfig behavior, as `select' cannot be used on a symbol with potentially unmet dependencies. Decouple both drivers by moving the thermal mode notifier chain to lenovo-wmi-helpers. Methods for notifier block (un)registration are exported for lenovo-wmi-gamezone, while a method for querying the current thermal mode are exported for lenovo-wmi-other. This turns the dependency graph from +------------ lenovo-wmi-gamezone | | v | lenovo-wmi-helpers | ^ | | V +------------ lenovo-wmi-other into +------------ lenovo-wmi-gamezone | v lenovo-wmi-helpers ^ | +------------ lenovo-wmi-other To make it clear, the name of the notifier chain is also renamed from `om_chain_head' to `tm_chain_head', indicating that it's used to query the current thermal mode. No functional change intended. Reviewed-by: Mark Pearson Fixes: 6e38b9fcbfa3 ("platform/x86: lenovo: gamezone needs "other mode"") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202603252259.gHvJDyh3-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202603260302.X0NjQOda-lkp@intel.com/ Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-7-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/Kconfig | 1 - drivers/platform/x86/lenovo/wmi-gamezone.c | 4 +- drivers/platform/x86/lenovo/wmi-helpers.c | 101 ++++++++++++++++++++ drivers/platform/x86/lenovo/wmi-helpers.h | 8 ++ drivers/platform/x86/lenovo/wmi-other.c | 104 +-------------------- drivers/platform/x86/lenovo/wmi-other.h | 16 ---- 6 files changed, 112 insertions(+), 122 deletions(-) delete mode 100644 drivers/platform/x86/lenovo/wmi-other.h diff --git a/drivers/platform/x86/lenovo/Kconfig b/drivers/platform/x86/lenovo/Kconfig index f885127b007f..09b1b055d2e0 100644 --- a/drivers/platform/x86/lenovo/Kconfig +++ b/drivers/platform/x86/lenovo/Kconfig @@ -252,7 +252,6 @@ config LENOVO_WMI_GAMEZONE select ACPI_PLATFORM_PROFILE select LENOVO_WMI_EVENTS select LENOVO_WMI_HELPERS - select LENOVO_WMI_TUNING help Say Y here if you have a WMI aware Lenovo Legion device and would like to use the platform-profile firmware interface to manage power usage. diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index 098239c9311a..c28785d25673 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -23,7 +23,6 @@ #include "wmi-events.h" #include "wmi-gamezone.h" #include "wmi-helpers.h" -#include "wmi-other.h" #define LENOVO_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" @@ -383,7 +382,7 @@ static int lwmi_gz_probe(struct wmi_device *wdev, const void *context) return ret; priv->mode_nb.notifier_call = lwmi_gz_mode_call; - return devm_lwmi_om_register_notifier(&wdev->dev, &priv->mode_nb); + return devm_lwmi_tm_register_notifier(&wdev->dev, &priv->mode_nb); } static const struct wmi_device_id lwmi_gz_id_table[] = { @@ -405,7 +404,6 @@ module_wmi_driver(lwmi_gz_driver); MODULE_IMPORT_NS("LENOVO_WMI_EVENTS"); MODULE_IMPORT_NS("LENOVO_WMI_HELPERS"); -MODULE_IMPORT_NS("LENOVO_WMI_OTHER"); MODULE_DEVICE_TABLE(wmi, lwmi_gz_id_table); MODULE_AUTHOR("Derek J. Clark "); MODULE_DESCRIPTION("Lenovo GameZone WMI Driver"); diff --git a/drivers/platform/x86/lenovo/wmi-helpers.c b/drivers/platform/x86/lenovo/wmi-helpers.c index 018d7642e2bd..7a198259e393 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.c +++ b/drivers/platform/x86/lenovo/wmi-helpers.c @@ -21,11 +21,15 @@ #include #include #include +#include #include #include #include "wmi-helpers.h" +/* Thermal mode notifier chain. */ +static BLOCKING_NOTIFIER_HEAD(tm_chain_head); + /** * lwmi_dev_evaluate_int() - Helper function for calling WMI methods that * return an integer. @@ -84,6 +88,103 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, }; EXPORT_SYMBOL_NS_GPL(lwmi_dev_evaluate_int, "LENOVO_WMI_HELPERS"); +/** + * lwmi_tm_register_notifier() - Add a notifier to the blocking notifier chain + * @nb: The notifier_block struct to register + * + * Call blocking_notifier_chain_register to register the notifier block to the + * thermal mode notifier chain. + * + * Return: 0 on success, %-EEXIST on error. + */ +int lwmi_tm_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&tm_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_register_notifier, "LENOVO_WMI_HELPERS"); + +/** + * lwmi_tm_unregister_notifier() - Remove a notifier from the blocking notifier + * chain. + * @nb: The notifier_block struct to register + * + * Call blocking_notifier_chain_unregister to unregister the notifier block from the + * thermal mode notifier chain. + * + * Return: 0 on success, %-ENOENT on error. + */ +int lwmi_tm_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&tm_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_unregister_notifier, "LENOVO_WMI_HELPERS"); + +/** + * devm_lwmi_tm_unregister_notifier() - Remove a notifier from the blocking + * notifier chain. + * @data: Void pointer to the notifier_block struct to register. + * + * Call lwmi_tm_unregister_notifier to unregister the notifier block from the + * thermal mode notifier chain. + * + * Return: 0 on success, %-ENOENT on error. + */ +static void devm_lwmi_tm_unregister_notifier(void *data) +{ + struct notifier_block *nb = data; + + lwmi_tm_unregister_notifier(nb); +} + +/** + * devm_lwmi_tm_register_notifier() - Add a notifier to the blocking notifier + * chain. + * @dev: The parent device of the notifier_block struct. + * @nb: The notifier_block struct to register + * + * Call lwmi_tm_register_notifier to register the notifier block to the + * thermal mode notifier chain. Then add devm_lwmi_tm_unregister_notifier + * as a device managed action to automatically unregister the notifier block + * upon parent device removal. + * + * Return: 0 on success, or an error code. + */ +int devm_lwmi_tm_register_notifier(struct device *dev, + struct notifier_block *nb) +{ + int ret; + + ret = lwmi_tm_register_notifier(nb); + if (ret < 0) + return ret; + + return devm_add_action_or_reset(dev, devm_lwmi_tm_unregister_notifier, + nb); +} +EXPORT_SYMBOL_NS_GPL(devm_lwmi_tm_register_notifier, "LENOVO_WMI_HELPERS"); + +/** + * lwmi_tm_notifier_call() - Call functions for the notifier call chain. + * @mode: Pointer to a thermal mode enum to retrieve the data from. + * + * Call blocking_notifier_call_chain to retrieve the thermal mode from the + * lenovo-wmi-gamezone driver. + * + * Return: 0 on success, or an error code. + */ +int lwmi_tm_notifier_call(enum thermal_mode *mode) +{ + int ret; + + ret = blocking_notifier_call_chain(&tm_chain_head, + LWMI_GZ_GET_THERMAL_MODE, &mode); + if ((ret & ~NOTIFY_STOP_MASK) != NOTIFY_OK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_notifier_call, "LENOVO_WMI_HELPERS"); + MODULE_AUTHOR("Derek J. Clark "); MODULE_DESCRIPTION("Lenovo WMI Helpers Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/lenovo/wmi-helpers.h b/drivers/platform/x86/lenovo/wmi-helpers.h index 20fd21749803..651a039228ed 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.h +++ b/drivers/platform/x86/lenovo/wmi-helpers.h @@ -7,6 +7,8 @@ #include +struct device; +struct notifier_block; struct wmi_device; struct wmi_method_args_32 { @@ -17,4 +19,10 @@ struct wmi_method_args_32 { int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval); +int lwmi_tm_register_notifier(struct notifier_block *nb); +int lwmi_tm_unregister_notifier(struct notifier_block *nb); +int devm_lwmi_tm_register_notifier(struct device *dev, + struct notifier_block *nb); +int lwmi_tm_notifier_call(enum thermal_mode *mode); + #endif /* !_LENOVO_WMI_HELPERS_H_ */ diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 526075ff52d0..292fed8bcc80 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -49,7 +48,6 @@ #include "wmi-events.h" #include "wmi-gamezone.h" #include "wmi-helpers.h" -#include "wmi-other.h" #include "../firmware_attributes_class.h" #define LENOVO_OTHER_MODE_GUID "DC2A8805-3A8C-41BA-A6F7-092E0089CD3B" @@ -81,7 +79,6 @@ #define LWMI_OM_FW_ATTR_BASE_PATH "lenovo-wmi-other" #define LWMI_OM_HWMON_NAME "lenovo_wmi_other" -static BLOCKING_NOTIFIER_HEAD(om_chain_head); static DEFINE_IDA(lwmi_om_ida); enum attribute_property { @@ -109,7 +106,6 @@ struct lwmi_om_priv { struct device *hwmon_dev; struct device *fw_attr_dev; struct kset *fw_attr_kset; - struct notifier_block nb; struct wmi_device *wdev; int ida_id; @@ -577,102 +573,6 @@ struct capdata01_attr_group { struct tunable_attr_01 *tunable_attr; }; -/** - * lwmi_om_register_notifier() - Add a notifier to the blocking notifier chain - * @nb: The notifier_block struct to register - * - * Call blocking_notifier_chain_register to register the notifier block to the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-EEXIST on error. - */ -int lwmi_om_register_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&om_chain_head, nb); -} -EXPORT_SYMBOL_NS_GPL(lwmi_om_register_notifier, "LENOVO_WMI_OTHER"); - -/** - * lwmi_om_unregister_notifier() - Remove a notifier from the blocking notifier - * chain. - * @nb: The notifier_block struct to register - * - * Call blocking_notifier_chain_unregister to unregister the notifier block from the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-ENOENT on error. - */ -int lwmi_om_unregister_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&om_chain_head, nb); -} -EXPORT_SYMBOL_NS_GPL(lwmi_om_unregister_notifier, "LENOVO_WMI_OTHER"); - -/** - * devm_lwmi_om_unregister_notifier() - Remove a notifier from the blocking - * notifier chain. - * @data: Void pointer to the notifier_block struct to register. - * - * Call lwmi_om_unregister_notifier to unregister the notifier block from the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-ENOENT on error. - */ -static void devm_lwmi_om_unregister_notifier(void *data) -{ - struct notifier_block *nb = data; - - lwmi_om_unregister_notifier(nb); -} - -/** - * devm_lwmi_om_register_notifier() - Add a notifier to the blocking notifier - * chain. - * @dev: The parent device of the notifier_block struct. - * @nb: The notifier_block struct to register - * - * Call lwmi_om_register_notifier to register the notifier block to the - * lenovo-wmi-other driver notifier chain. Then add devm_lwmi_om_unregister_notifier - * as a device managed action to automatically unregister the notifier block - * upon parent device removal. - * - * Return: 0 on success, or an error code. - */ -int devm_lwmi_om_register_notifier(struct device *dev, - struct notifier_block *nb) -{ - int ret; - - ret = lwmi_om_register_notifier(nb); - if (ret < 0) - return ret; - - return devm_add_action_or_reset(dev, devm_lwmi_om_unregister_notifier, - nb); -} -EXPORT_SYMBOL_NS_GPL(devm_lwmi_om_register_notifier, "LENOVO_WMI_OTHER"); - -/** - * lwmi_om_notifier_call() - Call functions for the notifier call chain. - * @mode: Pointer to a thermal mode enum to retrieve the data from. - * - * Call blocking_notifier_call_chain to retrieve the thermal mode from the - * lenovo-wmi-gamezone driver. - * - * Return: 0 on success, or an error code. - */ -static int lwmi_om_notifier_call(enum thermal_mode *mode) -{ - int ret; - - ret = blocking_notifier_call_chain(&om_chain_head, - LWMI_GZ_GET_THERMAL_MODE, &mode); - if ((ret & ~NOTIFY_STOP_MASK) != NOTIFY_OK) - return -EINVAL; - - return 0; -} - /* Attribute Methods */ /** @@ -781,7 +681,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, u32 value; int ret; - ret = lwmi_om_notifier_call(&mode); + ret = lwmi_tm_notifier_call(&mode); if (ret) return ret; @@ -843,7 +743,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, int retval; int ret; - ret = lwmi_om_notifier_call(&mode); + ret = lwmi_tm_notifier_call(&mode); if (ret) return ret; diff --git a/drivers/platform/x86/lenovo/wmi-other.h b/drivers/platform/x86/lenovo/wmi-other.h deleted file mode 100644 index 8ebf5602bb99..000000000000 --- a/drivers/platform/x86/lenovo/wmi-other.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -/* Copyright (C) 2025 Derek J. Clark */ - -#ifndef _LENOVO_WMI_OTHER_H_ -#define _LENOVO_WMI_OTHER_H_ - -struct device; -struct notifier_block; - -int lwmi_om_register_notifier(struct notifier_block *nb); -int lwmi_om_unregister_notifier(struct notifier_block *nb); -int devm_lwmi_om_register_notifier(struct device *dev, - struct notifier_block *nb); - -#endif /* !_LENOVO_WMI_OTHER_H_ */ From 7e27896e16a1c450085c3fe020eeb1b223880f37 Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:37 +0000 Subject: [PATCH 174/377] platform/x86: lenovo-wmi-helpers: Move gamezone enums to wmi-helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a later patch in the series the thermal mode enum will be accessed across three separate drivers (wmi-capdata, wmi-gamezonem and wmi-other). An additional patch in the series will also add a function prototype that needs to reference this enum in wmi-helpers.h. To avoid having all these drivers begin to import each others headers, and to avoid declaring an opaque enum to hande the second case, move the thermal mode enum to helpers where it can be safely accessed by everything that needs it from a single import. While at it, since the gamezone_events_type enum is the only remaining item in the header, move that as well and remove the gamezone header entirely. Cc: stable@vger.kernel.org Reviewed-by: Mark Pearson Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-8-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-events.c | 2 +- drivers/platform/x86/lenovo/wmi-gamezone.c | 1 - drivers/platform/x86/lenovo/wmi-gamezone.h | 20 -------------------- drivers/platform/x86/lenovo/wmi-helpers.h | 13 +++++++++++++ drivers/platform/x86/lenovo/wmi-other.c | 1 - 5 files changed, 14 insertions(+), 23 deletions(-) delete mode 100644 drivers/platform/x86/lenovo/wmi-gamezone.h diff --git a/drivers/platform/x86/lenovo/wmi-events.c b/drivers/platform/x86/lenovo/wmi-events.c index 4a6a2c82413a..fc25bba68a7c 100644 --- a/drivers/platform/x86/lenovo/wmi-events.c +++ b/drivers/platform/x86/lenovo/wmi-events.c @@ -17,7 +17,7 @@ #include #include "wmi-events.h" -#include "wmi-gamezone.h" +#include "wmi-helpers.h" #define THERMAL_MODE_EVENT_GUID "D320289E-8FEA-41E0-86F9-911D83151B5F" diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index c28785d25673..109c0b564a9f 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -21,7 +21,6 @@ #include #include "wmi-events.h" -#include "wmi-gamezone.h" #include "wmi-helpers.h" #define LENOVO_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.h b/drivers/platform/x86/lenovo/wmi-gamezone.h deleted file mode 100644 index 6b163a5eeb95..000000000000 --- a/drivers/platform/x86/lenovo/wmi-gamezone.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -/* Copyright (C) 2025 Derek J. Clark */ - -#ifndef _LENOVO_WMI_GAMEZONE_H_ -#define _LENOVO_WMI_GAMEZONE_H_ - -enum gamezone_events_type { - LWMI_GZ_GET_THERMAL_MODE = 1, -}; - -enum thermal_mode { - LWMI_GZ_THERMAL_MODE_QUIET = 0x01, - LWMI_GZ_THERMAL_MODE_BALANCED = 0x02, - LWMI_GZ_THERMAL_MODE_PERFORMANCE = 0x03, - LWMI_GZ_THERMAL_MODE_EXTREME = 0xE0, /* Ver 6+ */ - LWMI_GZ_THERMAL_MODE_CUSTOM = 0xFF, -}; - -#endif /* !_LENOVO_WMI_GAMEZONE_H_ */ diff --git a/drivers/platform/x86/lenovo/wmi-helpers.h b/drivers/platform/x86/lenovo/wmi-helpers.h index 651a039228ed..ed7db3ebba6c 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.h +++ b/drivers/platform/x86/lenovo/wmi-helpers.h @@ -16,6 +16,19 @@ struct wmi_method_args_32 { u32 arg1; }; +enum lwmi_event_type { + LWMI_GZ_GET_THERMAL_MODE = 0x01, +}; + +enum thermal_mode { + LWMI_GZ_THERMAL_MODE_NONE = 0x00, + LWMI_GZ_THERMAL_MODE_QUIET = 0x01, + LWMI_GZ_THERMAL_MODE_BALANCED = 0x02, + LWMI_GZ_THERMAL_MODE_PERFORMANCE = 0x03, + LWMI_GZ_THERMAL_MODE_EXTREME = 0xE0, /* Ver 6+ */ + LWMI_GZ_THERMAL_MODE_CUSTOM = 0xFF, +}; + int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval); diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 292fed8bcc80..19f48aeda228 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -46,7 +46,6 @@ #include "wmi-capdata.h" #include "wmi-events.h" -#include "wmi-gamezone.h" #include "wmi-helpers.h" #include "../firmware_attributes_class.h" From 30a4ad208a7f7bdb790cd31d368595890045334f Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:38 +0000 Subject: [PATCH 175/377] platform/x86: lenovo-wmi-other: Add Attribute ID helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds lwmi_attr_id() function. In the same vein as LWMI_ATTR_ID_FAN_RPM(), but as a generic, to de-duplicate attribute_id assignment boilerplate. Adds tunable_attr_01_id() function that breaks out the members of a tunable_attr_01 struct and passes them to lwmi_attr_id(). No functional change intended. Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-9-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-capdata.c | 8 ++-- drivers/platform/x86/lenovo/wmi-capdata.h | 20 +++++++++ drivers/platform/x86/lenovo/wmi-other.c | 49 +++++++++-------------- 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-capdata.c b/drivers/platform/x86/lenovo/wmi-capdata.c index b73d378f0e8b..714aa6fd6f1f 100644 --- a/drivers/platform/x86/lenovo/wmi-capdata.c +++ b/drivers/platform/x86/lenovo/wmi-capdata.c @@ -27,7 +27,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include "wmi-capdata.h" +#include "wmi-helpers.h" #define LENOVO_CAPABILITY_DATA_00_GUID "362A3AFE-3D96-4665-8530-96DAD5BB300E" #define LENOVO_CAPABILITY_DATA_01_GUID "7A8F5407-CB67-4D6E-B547-39B3BE018154" @@ -57,9 +57,9 @@ #define LWMI_FEATURE_ID_FAN_TEST 0x05 -#define LWMI_ATTR_ID_FAN_TEST \ - (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, LWMI_DEVICE_ID_FAN) | \ - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, LWMI_FEATURE_ID_FAN_TEST)) +#define LWMI_ATTR_ID_FAN_TEST \ + lwmi_attr_id(LWMI_DEVICE_ID_FAN, LWMI_FEATURE_ID_FAN_TEST, \ + LWMI_GZ_THERMAL_MODE_NONE, LWMI_TYPE_ID_NONE) enum lwmi_cd_type { LENOVO_CAPABILITY_DATA_00, diff --git a/drivers/platform/x86/lenovo/wmi-capdata.h b/drivers/platform/x86/lenovo/wmi-capdata.h index 8c1df3efcc55..c3e760b8c3c3 100644 --- a/drivers/platform/x86/lenovo/wmi-capdata.h +++ b/drivers/platform/x86/lenovo/wmi-capdata.h @@ -6,6 +6,7 @@ #define _LENOVO_WMI_CAPDATA_H_ #include +#include #include #define LWMI_SUPP_VALID BIT(0) @@ -19,6 +20,8 @@ #define LWMI_DEVICE_ID_FAN 0x04 +#define LWMI_TYPE_ID_NONE 0x00 + struct component_match; struct device; struct cd_list; @@ -57,6 +60,23 @@ struct lwmi_cd_binder { cd_list_cb_t cd_fan_list_cb; }; +/** + * lwmi_attr_id() - Formats a capability data attribute ID + * @dev_id: The u8 corresponding to the device ID. + * @feat_id: The u8 corresponding to the feature ID on the device. + * @mode_id: The u8 corresponding to the wmi-gamezone mode for set/get. + * @type_id: The u8 corresponding to the sub-device. + * + * Return: encoded capability data attribute ID. + */ +static inline u32 lwmi_attr_id(u8 dev_id, u8 feat_id, u8 mode_id, u8 type_id) +{ + return (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, dev_id) | + FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, feat_id) | + FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode_id) | + FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, type_id)); +} + void lwmi_cd_match_add_all(struct device *master, struct component_match **matchptr); int lwmi_cd00_get_data(struct cd_list *list, u32 attribute_id, struct capdata00 *output); int lwmi_cd01_get_data(struct cd_list *list, u32 attribute_id, struct capdata01 *output); diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 19f48aeda228..a06b188eadac 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -59,8 +59,6 @@ #define LWMI_FEATURE_ID_FAN_RPM 0x03 -#define LWMI_TYPE_ID_NONE 0x00 - #define LWMI_FEATURE_VALUE_GET 17 #define LWMI_FEATURE_VALUE_SET 18 @@ -68,13 +66,12 @@ #define LWMI_FAN_NR 4 #define LWMI_FAN_ID(x) ((x) + LWMI_FAN_ID_BASE) -#define LWMI_ATTR_ID_FAN_RPM(x) \ - (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, LWMI_DEVICE_ID_FAN) | \ - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, LWMI_FEATURE_ID_FAN_RPM) | \ - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, LWMI_FAN_ID(x))) - #define LWMI_FAN_DIV 100 +#define LWMI_ATTR_ID_FAN_RPM(x) \ + lwmi_attr_id(LWMI_DEVICE_ID_FAN, LWMI_FEATURE_ID_FAN_RPM, \ + LWMI_GZ_THERMAL_MODE_NONE, LWMI_FAN_ID(x)) + #define LWMI_OM_FW_ATTR_BASE_PATH "lenovo-wmi-other" #define LWMI_OM_HWMON_NAME "lenovo_wmi_other" @@ -549,6 +546,18 @@ struct tunable_attr_01 { u8 type_id; }; +/** + * tunable_attr_01_id() - Formats a tunable_attr_01 to a capdata attribute ID + * @attr: The tunable_attr_01 to format. + * @mode: The u8 corresponding to the wmi-gamezone mode for set/get. + * + * Return: encoded capability data attribute ID. + */ +static u32 tunable_attr_01_id(struct tunable_attr_01 *attr, u8 mode) +{ + return lwmi_attr_id(attr->device_id, attr->feature_id, mode, attr->type_id); +} + static struct tunable_attr_01 ppt_pl1_spl = { .device_id = LWMI_DEVICE_ID_CPU, .feature_id = LWMI_FEATURE_ID_CPU_SPL, @@ -616,12 +625,7 @@ static ssize_t attr_capdata01_show(struct kobject *kobj, u32 attribute_id; int value, ret; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, - LWMI_GZ_THERMAL_MODE_CUSTOM) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); + attribute_id = tunable_attr_01_id(tunable_attr, LWMI_GZ_THERMAL_MODE_CUSTOM); ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); if (ret) @@ -676,7 +680,6 @@ static ssize_t attr_current_value_store(struct kobject *kobj, struct wmi_method_args_32 args = {}; struct capdata01 capdata; enum thermal_mode mode; - u32 attribute_id; u32 value; int ret; @@ -687,13 +690,9 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (mode != LWMI_GZ_THERMAL_MODE_CUSTOM) return -EBUSY; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); + args.arg0 = tunable_attr_01_id(tunable_attr, mode); - ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); + ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); if (ret) return ret; @@ -704,7 +703,6 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (value < capdata.min_value || value > capdata.max_value) return -EINVAL; - args.arg0 = attribute_id; args.arg1 = value; ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_SET, @@ -738,7 +736,6 @@ static ssize_t attr_current_value_show(struct kobject *kobj, struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); struct wmi_method_args_32 args = {}; enum thermal_mode mode; - u32 attribute_id; int retval; int ret; @@ -746,13 +743,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, if (ret) return ret; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); - - args.arg0 = attribute_id; + args.arg0 = tunable_attr_01_id(tunable_attr, mode); ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, (unsigned char *)&args, sizeof(args), From 03bb5147da083cb91e5c8c2599fcb2f8fd05cb8f Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:39 +0000 Subject: [PATCH 176/377] platform/x86: lenovo-wmi-other: Limit adding attributes to supported devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds lwmi_is_attr_01_supported, and only creates the attribute subfolder if the attribute is supported by the hardware. Due to some poorly implemented BIOS this is a multi-step sequence of events. This is because: - Some BIOS support getting the capability data from custom mode (0xff), while others only support it in no-mode (0x00). - Some BIOS support get/set for the current value from custom mode (0xff), while others only support it in no-mode (0x00). - Some BIOS report capability data for a method that is not fully implemented. - Some BIOS have methods fully implemented, but no complimentary capability data. To ensure we only expose fully implemented methods with corresponding capability data, we check each outcome before reporting that an attribute can be supported. Checking for lwmi_is_attr_01_supported during remove is not done to ensure that we don't attempt to call cd01 or send WMI events if one of the interfaces being removed was the cause of the driver unloading. Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Reported-by: Kurt Borja Closes: https://lore.kernel.org/platform-driver-x86/DG60P3SHXR8H.3NSEHMZ6J7XRC@gmail.com/ Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-10-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 92 +++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index a06b188eadac..d318ba432fdc 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -544,6 +544,8 @@ struct tunable_attr_01 { u8 feature_id; u8 device_id; u8 type_id; + u8 cd_mode_id; /* mode arg for searching capdata */ + u8 cv_mode_id; /* mode arg for set/get current_value */ }; /** @@ -625,7 +627,7 @@ static ssize_t attr_capdata01_show(struct kobject *kobj, u32 attribute_id; int value, ret; - attribute_id = tunable_attr_01_id(tunable_attr, LWMI_GZ_THERMAL_MODE_CUSTOM); + attribute_id = tunable_attr_01_id(tunable_attr, tunable_attr->cd_mode_id); ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); if (ret) @@ -690,7 +692,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (mode != LWMI_GZ_THERMAL_MODE_CUSTOM) return -EBUSY; - args.arg0 = tunable_attr_01_id(tunable_attr, mode); + args.arg0 = tunable_attr_01_id(tunable_attr, tunable_attr->cd_mode_id); ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); if (ret) @@ -703,6 +705,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (value < capdata.min_value || value > capdata.max_value) return -EINVAL; + args.arg0 = tunable_attr_01_id(tunable_attr, tunable_attr->cv_mode_id); args.arg1 = value; ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_SET, @@ -743,6 +746,10 @@ static ssize_t attr_current_value_show(struct kobject *kobj, if (ret) return ret; + /* If "no-mode" is the supported mode, ensure we never send current mode */ + if (tunable_attr->cv_mode_id == LWMI_GZ_THERMAL_MODE_NONE) + mode = tunable_attr->cv_mode_id; + args.arg0 = tunable_attr_01_id(tunable_attr, mode); ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, @@ -754,6 +761,81 @@ static ssize_t attr_current_value_show(struct kobject *kobj, return sysfs_emit(buf, "%d\n", retval); } +/** + * lwmi_attr_01_is_supported() - Determine if the given attribute is supported. + * @tunable_attr: The attribute to verify. + * + * For an attribute to be supported it must have a functional get/set method, + * as well as associated capability data stored in the capdata01 table. + * + * First check if the attribute has a corresponding data table under custom mode + * (0xff), then under no mode (0x00). If either of those passes, check if the + * supported field of the capdata struct is > 0. If it is supported, store the + * successful mode in the cd_mode_id field of tunable_attr. + * + * If the attribute capdata shows it is supported, attempt to determine the mode + * for the current value property get/set methods using a similar pattern to the + * capdata table check. If the value returned by either mode is 0 or an error, + * assume that mode is not supported. Otherwise, store the successful mode in the + * cv_mode_id field of tunable_attr. + * + * If any of the above checks fail then the attribute is not fully supported. + * + * Return: true if capdata and set/get modes are found, otherwise false. + */ +static bool lwmi_attr_01_is_supported(struct tunable_attr_01 *tunable_attr) +{ + u8 modes[2] = { LWMI_GZ_THERMAL_MODE_CUSTOM, LWMI_GZ_THERMAL_MODE_NONE }; + struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); + struct wmi_method_args_32 args = {}; + bool cd_mode_found = false; + bool cv_mode_found = false; + struct capdata01 capdata; + int retval, ret, i; + + /* Determine tunable_attr->cd_mode_id */ + for (i = 0; i < ARRAY_SIZE(modes); i++) { + args.arg0 = tunable_attr_01_id(tunable_attr, modes[i]); + + ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); + if (ret || !capdata.supported) + continue; + + tunable_attr->cd_mode_id = modes[i]; + cd_mode_found = true; + break; + } + + if (!cd_mode_found) + return cd_mode_found; + + dev_dbg(tunable_attr->dev, + "cd_mode_id: %#010x\n", args.arg0); + + /* Determine tunable_attr->cv_mode_id, returns 1 if supported */ + for (i = 0; i < ARRAY_SIZE(modes); i++) { + args.arg0 = tunable_attr_01_id(tunable_attr, modes[i]); + + ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, + (u8 *)&args, sizeof(args), + &retval); + if (ret || !retval) + continue; + + tunable_attr->cv_mode_id = modes[i]; + cv_mode_found = true; + break; + } + + if (!cv_mode_found) + return cv_mode_found; + + dev_dbg(tunable_attr->dev, "cv_mode_id: %#010x, attribute support level: %#010x\n", + args.arg0, capdata.supported); + + return capdata.supported > 0; +} + /* Lenovo WMI Other Mode Attribute macros */ #define __LWMI_ATTR_RO(_func, _name) \ { \ @@ -877,12 +959,14 @@ static void lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) } for (i = 0; i < ARRAY_SIZE(cd01_attr_groups) - 1; i++) { + cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; + if (!lwmi_attr_01_is_supported(cd01_attr_groups[i].tunable_attr)) + continue; + err = sysfs_create_group(&priv->fw_attr_kset->kobj, cd01_attr_groups[i].attr_group); if (err) goto err_remove_groups; - - cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; } return; From dec85d2fbd20de3711a71e65397dfdb40c3fa953 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:02 +0000 Subject: [PATCH 177/377] irqchip/gic-v5: Move LPI allocation into the LPI domain The IPI and ITS MSI domains currently allocate and release LPIs directly, then pass the selected LPI ID to the parent LPI domain. This leaks the LPI domain's allocation policy into its child domains and forces each child to duplicate part of the parent domain's teardown. Make the LPI domain allocate LPIs in its .alloc() callback and release them in a matching .free() callback. Child domains can then request a parent interrupt without passing an implementation-specific LPI ID, and the LPI lifetime is tied to the domain that owns the LPI namespace. Remove the gicv5_alloc_lpi() and gicv5_free_lpi() wrappers now that no external caller needs to manage LPIs directly. This is a preparatory change for an actual leakage problem in the allocation code and therefore tagged with the same Fixes tag. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-2-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5-its.c | 14 ++------ drivers/irqchip/irq-gic-v5.c | 53 +++++++++++++++--------------- include/linux/irqchip/arm-gic-v5.h | 3 -- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 36a8d1368f0e..36d03f82ef68 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -929,8 +929,8 @@ static void gicv5_its_free_eventid(struct gicv5_its_dev *its_dev, u32 event_id_b static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - u32 device_id, event_id_base, lpi; struct gicv5_its_dev *its_dev; + u32 device_id, event_id_base; msi_alloc_info_t *info = arg; irq_hw_number_t hwirq; struct irq_data *irqd; @@ -949,16 +949,8 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) { - pr_debug("Failed to find free LPI!\n"); - goto out_free_irqs; - } - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); if (ret) { - gicv5_free_lpi(lpi); goto out_free_irqs; } @@ -983,7 +975,6 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi out_free_irqs: while (--i >= 0) { irqd = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(irqd->parent_data->hwirq); irq_domain_reset_irq_data(irqd); irq_domain_free_irqs_parent(domain, virq + i, 1); } @@ -1013,7 +1004,6 @@ static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int vi for (i = 0; i < nr_irqs; i++) { d = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(d->parent_data->hwirq); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); } diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 6b0903be8ebf..15a2a04398d2 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -59,16 +59,6 @@ static void release_lpi(u32 lpi) ida_free(&lpi_ida, lpi); } -int gicv5_alloc_lpi(void) -{ - return alloc_lpi(); -} - -void gicv5_free_lpi(u32 lpi) -{ - release_lpi(lpi); -} - static void gicv5_ppi_priority_init(void) { write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR0_EL1); @@ -806,18 +796,36 @@ static void gicv5_lpi_config_reset(struct irq_data *d) gicv5_lpi_irq_write_pending_state(d, false); } +static void gicv5_irq_lpi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + + if (WARN_ON_ONCE(nr_irqs != 1)) + return; + + d = irq_domain_get_irq_data(domain, virq); + + release_lpi(d->hwirq); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(d); +} + static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { irq_hw_number_t hwirq; struct irq_data *irqd; - u32 *lpi = arg; int ret; if (WARN_ON_ONCE(nr_irqs != 1)) return -EINVAL; - hwirq = *lpi; + ret = alloc_lpi(); + if (ret < 0) + return ret; + hwirq = ret; irqd = irq_domain_get_irq_data(domain, virq); @@ -826,8 +834,10 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi irqd_set_single_target(irqd); ret = gicv5_irs_iste_alloc(hwirq); - if (ret < 0) + if (ret < 0) { + release_lpi(hwirq); return ret; + } gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); gicv5_lpi_config_reset(irqd); @@ -837,7 +847,7 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = { .alloc = gicv5_irq_lpi_domain_alloc, - .free = gicv5_irq_domain_free, + .free = gicv5_irq_lpi_domain_free, }; void __init gicv5_init_lpi_domain(void) @@ -859,21 +869,12 @@ static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int vi { struct irq_data *irqd; int ret, i; - u32 lpi; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); + if (ret) return ret; - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) { - gicv5_free_lpi(lpi); - return ret; - } - irqd = irq_domain_get_irq_data(domain, virq + i); irq_domain_set_hwirq_and_chip(domain, virq + i, i, @@ -899,8 +900,6 @@ static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int vi if (!d) return; - gicv5_free_lpi(d->parent_data->hwirq); - irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce68294..f78787e654f4 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif From eb6f6d523813ead9dc2799194a2839d42c049734 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:23 +0000 Subject: [PATCH 178/377] irqchip/gic-v5: Support range allocation for LPIs The per-IPI parent allocation loop returns immediately on failure and leaks any parent interrupts allocated by earlier iterations. The GICv5 LPI domain now owns LPI allocation and teardown internally, but its irq_domain callbacks still reject requests where nr_irqs is greater than one. This forces child domains to allocate and free LPIs one at a time even when the interrupt core requests a contiguous range. Handle multi-interrupt allocation and teardown in the LPI domain by iterating over the requested range and unwinding any partially allocated state on failure. Allocate the parent LPIs for the IPI domain with a single range request as well, which cures the leakage problem. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-3-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5.c | 75 ++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 15a2a04398d2..c1af07083cef 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -801,15 +801,14 @@ static void gicv5_irq_lpi_domain_free(struct irq_domain *domain, unsigned int vi { struct irq_data *d; - if (WARN_ON_ONCE(nr_irqs != 1)) - return; + for (unsigned int i = 0; i < nr_irqs; i++, virq++) { + d = irq_domain_get_irq_data(domain, virq); - d = irq_domain_get_irq_data(domain, virq); + release_lpi(d->hwirq); - release_lpi(d->hwirq); - - irq_set_handler(virq, NULL); - irq_domain_reset_irq_data(d); + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(d); + } } static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -817,32 +816,39 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi { irq_hw_number_t hwirq; struct irq_data *irqd; + unsigned int i; int ret; - if (WARN_ON_ONCE(nr_irqs != 1)) - return -EINVAL; + for (i = 0; i < nr_irqs; i++) { + ret = alloc_lpi(); + if (ret < 0) + goto out_free_lpis; + hwirq = ret; - ret = alloc_lpi(); - if (ret < 0) - return ret; - hwirq = ret; + ret = gicv5_irs_iste_alloc(hwirq); + if (ret < 0) { + /* Undo partial state first, then clean up the rest */ + release_lpi(hwirq); + goto out_free_lpis; + } - irqd = irq_domain_get_irq_data(domain, virq); + irqd = irq_domain_get_irq_data(domain, virq + i); - irq_domain_set_info(domain, virq, hwirq, &gicv5_lpi_irq_chip, NULL, - handle_fasteoi_irq, NULL, NULL); - irqd_set_single_target(irqd); + irq_domain_set_info(domain, virq + i, hwirq, &gicv5_lpi_irq_chip, + NULL, handle_fasteoi_irq, NULL, NULL); + irqd_set_single_target(irqd); - ret = gicv5_irs_iste_alloc(hwirq); - if (ret < 0) { - release_lpi(hwirq); - return ret; + gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); + gicv5_lpi_config_reset(irqd); } - gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); - gicv5_lpi_config_reset(irqd); - return 0; + +out_free_lpis: + if (i) + gicv5_irq_lpi_domain_free(domain, virq, i); + + return ret; } static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = { @@ -868,21 +874,21 @@ static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int vi unsigned int nr_irqs, void *arg) { struct irq_data *irqd; - int ret, i; + int ret; - for (i = 0; i < nr_irqs; i++) { - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); - if (ret) - return ret; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret) + return ret; - irqd = irq_domain_get_irq_data(domain, virq + i); + for (unsigned int i = 0; i < nr_irqs; i++, virq++) { + irqd = irq_domain_get_irq_data(domain, virq); - irq_domain_set_hwirq_and_chip(domain, virq + i, i, - &gicv5_ipi_irq_chip, NULL); + irq_domain_set_hwirq_and_chip(domain, virq, i, + &gicv5_ipi_irq_chip, NULL); irqd_set_single_target(irqd); - irq_set_handler(virq + i, handle_percpu_irq); + irq_set_handler(virq, handle_percpu_irq); } return 0; @@ -902,8 +908,9 @@ static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int vi irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); - irq_domain_free_irqs_parent(domain, virq + i, 1); } + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops gicv5_irq_ipi_domain_ops = { From a7c7e42654b6a8676610ee09d22901432c4851af Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:43 +0000 Subject: [PATCH 179/377] irqchip/gic-v5: Allocate ITS parent LPIs as a range The ITS MSI domain no longer manages LPI allocation directly. LPIs are allocated and freed by the parent LPI domain, which can now handle a full range of interrupts and unwind partial allocations internally. Make the ITS domain request and release the parent IRQs as a single range instead of iterating over each interrupt. The ITS allocation path then only needs to reserve EventIDs, allocate the parent range, and fill in the ITS irq_data for each MSI. Since no operation in the per-MSI loop can fail, the partial parent-free unwind becomes unnecessary. On teardown, reset the ITS irq_data for the range and then release the parent range in one call, leaving LPI teardown to the LPI domain. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-4-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5-its.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 36d03f82ef68..28e39b065de0 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -937,6 +937,7 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi int ret, i; its_dev = info->scratchpad[0].ptr; + device_id = its_dev->device_id; ret = gicv5_its_alloc_eventid(its_dev, info, nr_irqs, &event_id_base); if (ret) @@ -946,14 +947,11 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi if (ret) goto out_eventid; - device_id = its_dev->device_id; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, NULL); + if (ret) + goto out_eventid; for (i = 0; i < nr_irqs; i++) { - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); - if (ret) { - goto out_free_irqs; - } - /* * Store eventid and deviceid into the hwirq for later use. * @@ -972,12 +970,6 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi return 0; -out_free_irqs: - while (--i >= 0) { - irqd = irq_domain_get_irq_data(domain, virq + i); - irq_domain_reset_irq_data(irqd); - irq_domain_free_irqs_parent(domain, virq + i, 1); - } out_eventid: gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs); return ret; @@ -1000,14 +992,14 @@ static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int vi bitmap_release_region(its_dev->event_map, event_id_base, get_count_order(nr_irqs)); - /* Hierarchically free irq data */ for (i = 0; i < nr_irqs; i++) { d = irq_domain_get_irq_data(domain, virq + i); - irq_domain_reset_irq_data(d); - irq_domain_free_irqs_parent(domain, virq + i, 1); } + /* Hierarchically free irq data */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + gicv5_its_syncr(its, its_dev); gicv5_irs_syncr(); } From 512718bbc51b851140380b7068ec7365bd039cba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 May 2026 12:05:18 +0100 Subject: [PATCH 180/377] genirq/chip: Don't call add_interrupt_randomness() for NMIs Recently handle_percpu_devid_irq() was changed to call add_interrupt_randomness(). This introduced a potential deadlock when handle_percpu_devid_irq() is used to handle an NMI, which can be detected with lockdep, e.g. ================================ WARNING: inconsistent lock state 7.1.0-rc2-pnmi #465 Not tainted -------------------------------- inconsistent {INITIAL USE} -> {IN-NMI} usage. perf/695 [HC1[1]:SC0[0]:HE0:SE1] takes: ffff00837dfd3a18 (&base->lock){-.-.}-{2:2}, at: lock_timer_base+0x6c/0xac {INITIAL USE} state was registered at: _raw_spin_lock_irqsave+0x68/0xb0 lock_timer_base+0x6c/0xac __mod_timer+0x100/0x32c add_timer_global+0x2c/0x40 __queue_delayed_work+0xf0/0x140 queue_delayed_work_on+0x134/0x138 mem_cgroup_css_online+0x30c/0x310 online_css+0x34/0x10c cgroup_init_subsys+0x158/0x1c8 cgroup_init+0x440/0x524 start_kernel+0x888/0x998 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&base->lock); lock(&base->lock); *** DEADLOCK *** Call trace: _raw_spin_lock_irqsave+0x68/0xb0 lock_timer_base+0x6c/0xac add_timer_on+0x78/0x16c add_interrupt_randomness+0x124/0x134 handle_percpu_devid_irq+0xd4/0x16c handle_irq_desc+0x40/0x58 generic_handle_domain_nmi+0x28/0x50 __gic_handle_nmi.isra.0+0x4c/0xa0 gic_handle_irq+0x38/0x2bc call_on_irq_stack+0x30/0x48 do_interrupt_handler+0x80/0x98 el1_interrupt+0x90/0xac el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x80/0x84 [...] During review, Thomas pointed out it wouldn't be safe for handle_percpu_devid_irq() to call add_interrupt_randomness() if it was used to handle NMIs: https://lore.kernel.org/lkml/87bjgik042.ffs@tglx/ ... but evidently people missed that handle_percpu_devid_irq() *is* used for NMIs. While it might seem that NMIs should be handled with a separate handle_percpu_devid_nmi() function, for various structural reasons this was impractical, and handle_percpu_devid_irq() has been expected to be used for NMIs since commits: 21bbbc50f398f ("irqchip/gic-v3: Switch high priority PPIs over to handle_percpu_devid_irq()") 5ff78c8de9d83 ("genirq: Kill handle_percpu_devid_fasteoi_nmi()") Taking the above into account, avoid the deadlock by not calling add_interrupt_randomness() when handle_percpu_devid_irq() is called in an NMI context. This is consistent with other NNI handling flows, which do not call add_interrupt_randomness(). At the same time, update the kernel-doc comment to make it clear that handle_percpu_devid_irq() can be called in NMI context. The rest of handle_percpu_devid_irq() is currently NMI safe and doesn't need to change. Fixes: fd7400cfcbaa ("genirq/chip: Invoke add_interrupt_randomness() in handle_percpu_devid_irq()") Reported-by: Ada Couprie Diaz Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Reviewed-by: Marc Zyngier Link: https://patch.msgid.link/20260507110518.3128248-1-mark.rutland@arm.com --- kernel/irq/chip.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6c9b1dc4e7d4..b635e3c5d5b6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -893,7 +894,10 @@ void handle_percpu_irq(struct irq_desc *desc) * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is - * called + * called. + * + * May be used for NMI interrupt lines, and so may be called in IRQ or NMI + * context. */ void handle_percpu_devid_irq(struct irq_desc *desc) { @@ -930,7 +934,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) enabled ? " and unmasked" : "", irq, cpu); } - add_interrupt_randomness(irq); + if (!in_nmi()) + add_interrupt_randomness(irq); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); From 0fa10fb77069fb67aa51384868ef3702b7791465 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 6 May 2026 01:55:22 -0700 Subject: [PATCH 181/377] irqchip/ath79-cpu: Remove unused function ath79_cpu_irq_init() was part of the legacy pre-OF code that got removed a while back. Remove it to get rid of a missing prototype warning, reported by the kernel test robot. [ tglx: Fix the subject prefix. Sigh ... ] Fixes: 51fa4f8912c0 ("MIPS: ath79: drop legacy IRQ code") Reported-by: kernel test robot Signed-off-by: Rosen Penev Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260506085522.1210143-1-rosenp@gmail.com Closes: https://lore.kernel.org/oe-kbuild-all/202412011509.kGQkDr1y-lkp@intel.com/ --- drivers/irqchip/irq-ath79-cpu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c index 923e4bba3776..9b7273a7f8ce 100644 --- a/drivers/irqchip/irq-ath79-cpu.c +++ b/drivers/irqchip/irq-ath79-cpu.c @@ -85,10 +85,3 @@ static int __init ar79_cpu_intc_of_init( } IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", ar79_cpu_intc_of_init); - -void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3) -{ - irq_wb_chan[2] = irq_wb_chan2; - irq_wb_chan[3] = irq_wb_chan3; - mips_cpu_irq_init(); -} From 5363b67ac8ebcc3e227dbf59fc8061949109841d Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Fri, 8 May 2026 07:36:54 +0000 Subject: [PATCH 182/377] irqchip/meson-gpio: Use the correct register in meson_s4_gpio_irq_set_type() meson_s4_gpio_irq_set_type() uses the both-edge trigger register for configuring level type and single edge mode interrupts, which is not correct. Use REG_EDGE_POL instead. Fixes: bbd6fcc76b39 ("irqchip: Add support for Amlogic A4 and A5 SoCs") Signed-off-by: Xianwei Zhao Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260508-a9-gpio-irqchip-v1-1-9dc5f3e022e0@amlogic.com --- drivers/irqchip/irq-meson-gpio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index f722e9c57e2e..74a376ef452e 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -415,8 +415,7 @@ static int meson_s4_gpio_irq_set_type(struct meson_gpio_irq_controller *ctl, if (type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) val |= BIT(ctl->params->edge_single_offset + idx); - meson_gpio_irq_update_bits(ctl, params->edge_pol_reg, - BIT(idx) | BIT(12 + idx), val); + meson_gpio_irq_update_bits(ctl, REG_EDGE_POL, BIT(idx) | BIT(12 + idx), val); return 0; }; From cefafbd561402b0fe6447449364a30315b9b1570 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 8 May 2026 02:31:21 -0700 Subject: [PATCH 183/377] irqchip/riscv-imsic: Clear interrupt move state during CPU offlining Affinity changes of IMSIC interrupts have to be careful to not lose an interrupt in the process. Each vector keeps track of an affinity change in progress with two pointers in struct imsic_vector. imsic_vector::move_prev points to the previous CPU target data and imsic_vector::move_next to the designated new CPU target data. imsic_vector::move_prev on the new CPU can only be cleared after the previous CPU has cleared imsic_vector::move_next, which ususally happens in __imsic_remote_sync(). In case of CPU hot-unplug __imsic_remote_sync() is not invoked because the CPU is already marked offline. That means imsic_vector::move_prev becomes stale until the CPU is onlined again. The stale pointer prevents further affinity changes for the affected interrupts. Solve this by clearing the imsic_vector::move_prev pointers in the CPU hotplug offline path. [ tglx: Replace word salad in change log ] Fixes: 0f67911e821c ("irqchip/riscv-imsic: Separate next and previous pointers in IMSIC vector") Signed-off-by: Yong-Xuan Wang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260508-imsic-v2-1-e9f08dd46cf5@sifive.com --- drivers/irqchip/irq-riscv-imsic-early.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index ba903fa689bd..a7a1852b548c 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -158,6 +158,8 @@ static int imsic_dying_cpu(unsigned int cpu) /* Cleanup IPIs */ imsic_ipi_dying_cpu(); + imsic_local_sync_all(false); + /* Mark per-CPU IMSIC state as offline */ imsic_state_offline(); From 3799c2570982577551023ae035f5a786cf39a76e Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Sun, 10 May 2026 16:41:19 +0800 Subject: [PATCH 184/377] io_uring/fdinfo: translate SqThread PID through caller's pid_ns SQPOLL stores current->pid (init_pid_ns view) in sqd->task_pid at thread creation. fdinfo prints it raw via seq_printf("SqThread:\t%d\n", sq_pid). A reader inside a non-initial pid_ns sees the host PID, not the kthread's PID in the reader's own pid_ns. The SQPOLL kthread is created with CLONE_THREAD and no CLONE_NEW*, so it lives in the submitter's pid_ns. An unprivileged user_ns + pid_ns submitter can read fdinfo and learn the host PID of a kthread whose in-namespace PID is different. Reproducer (mainline 7.0, KASAN): unshare CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS, mount a private /proc, then have a grandchild that is pid 1 in the new pid_ns open an io_uring ring with IORING_SETUP_SQPOLL. /proc/self/task lists {1, 2}; the SQPOLL kthread is pid 2. Before: fdinfo prints SqThread = . After: SqThread = 2. Use task_pid_nr_ns() against the proc inode's pid_ns to compute sq_pid, instead of reading the stored sq->task_pid (which holds the init_pid_ns view). pidfd_show_fdinfo() in kernel/pid.c follows the same pattern. Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260510084119.457578-1-maoyi.xie@ntu.edu.sg Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index c2d3e45544bb..001fb542dc11 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) get_task_struct(tsk); rcu_read_unlock(); usec = io_sq_cpu_usec(tsk); + sq_pid = task_pid_nr_ns(tsk, + proc_pid_ns(file_inode(m->file)->i_sb)); put_task_struct(tsk); - sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; sq_total_time = usec; sq_work_time = sq->work_time; From 1860c2f85922917d8a46f16a6f4bd2298ffa0fb5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 10 May 2026 22:48:43 +0800 Subject: [PATCH 185/377] ublk: reject max_sectors smaller than PAGE_SECTORS in parameter validation blk_validate_limits() requires max_hw_sectors >= PAGE_SECTORS and fires a WARN_ON_ONCE if this invariant is violated. ublk_validate_params() only checked the upper bound of max_sectors against max_io_buf_bytes, allowing userspace to pass small values (including zero) that trigger the warning when blk_mq_alloc_disk() is called from ublk_ctrl_start_dev(). Before 494ea040bcb5, ublk used blk_queue_max_hw_sectors() which silently clamped small values up to PAGE_SECTORS. The conversion to passing queue_limits directly to blk_mq_alloc_disk() lost that clamping and now hits blk_validate_limits()'s WARN_ON_ONCE instead. Validate that max_sectors is at least PAGE_SECTORS in ublk_validate_params() so invalid values are rejected early with -EINVAL instead of reaching the block layer. Fixes: 494ea040bcb5 ("ublk: pass queue_limits to blk_mq_alloc_disk") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260510144843.769031-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6d13f1481de0..6c041eaebdb9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -920,6 +920,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + if (p->max_sectors < PAGE_SECTORS) + return -EINVAL; + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) return -EINVAL; } else From 2997606dd17729404cef9821ce66dd037b6019eb Mon Sep 17 00:00:00 2001 From: Paolo Pisati Date: Fri, 8 May 2026 09:09:56 +0200 Subject: [PATCH 186/377] platform/x86: asus-nb-wmi: add DMI quirk for ASUS Zenbook Duo UX8407AA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the existing zenbook duo keyboard quirk for the UX8407AA model too. Signed-off-by: Paolo Pisati Reviewed-by: Denis Benato Link: https://patch.msgid.link/20260508070956.62201-1-p.pisati@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-nb-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index b4677c5bba5b..8005c088e9ee 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -544,6 +544,15 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_zenbook_duo_kbd, }, + { + .callback = dmi_matched, + .ident = "ASUS Zenbook Duo UX8407AA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUS"), + DMI_MATCH(DMI_PRODUCT_NAME, "Zenbook Duo UX8407AA"), + }, + .driver_data = &quirk_asus_zenbook_duo_kbd, + }, { .callback = dmi_matched, .ident = "ASUS ROG Z13", From 91840be8f710370607f949a627e070896faeddb8 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 30 Mar 2026 15:32:29 +0800 Subject: [PATCH 187/377] irq_work: Fix use-after-free in irq_work_single() on PREEMPT_RT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On PREEMPT_RT, non-HARD irq_work runs in per-CPU kthreads via run_irq_workd(), so irq_work_sync() uses rcuwait() to wait for BUSY==0. After irq_work_single() clears BUSY via atomic_cmpxchg(), it still dereferences @work for irq_work_is_hard() and rcuwait_wake_up(). An irq_work_sync() caller on another CPU that enters after BUSY is cleared can observe BUSY==0 immediately, return, and free the work before those accesses complete — causing a use-after-free. Fix this by wrapping run_irq_workd() in guard(rcu)() so that the entire irq_work_single() execution is within an RCU read-side critical section. Then add synchronize_rcu() in irq_work_sync() after rcuwait_wait_event() to ensure the caller waits for the RCU grace period before returning, preventing premature frees. Fixes: 810979682ccc ("irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ support.") Suggested-by: Sebastian Andrzej Siewior Suggested-by: Steven Rostedt Signed-off-by: Jiayuan Chen Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260330073234.303732-1-jiayuan.chen@linux.dev --- kernel/irq_work.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 120fd7365fbe..f7e2dc2c30c6 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work) !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); + /* + * Ensure irq_work_single() does not access @work + * after removing IRQ_WORK_BUSY. It is always + * accessed within a RCU-read section. + */ + synchronize_rcu(); return; } @@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { + guard(rcu)(); irq_work_run_list(this_cpu_ptr(&lazy_list)); } From ce28b772c3c28fb1c62a81533045ae90ed3b496c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 May 2026 06:05:14 -0700 Subject: [PATCH 188/377] nvme: make prp passthrough usage less scary The warning is a bit alarming, and it only prints for the very first non-sgl capable device that receives a passthrough command. Just log an informational message on initial discovery for every device. Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/ioctl.c | 13 ++----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc388e24caad..1e7e42b43aa3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3749,6 +3749,10 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) ret = nvme_hwmon_init(ctrl); if (ret == -EINTR) return ret; + + if (!nvme_ctrl_sgl_supported(ctrl)) + dev_info(ctrl->device, + "passthrough uses implicit buffer lengths\n"); } clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9597a87cf05d..e232188ccd02 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -120,21 +120,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); - struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; int ret; - if (!nvme_ctrl_sgl_supported(ctrl)) - dev_warn_once(ctrl->device, "using unchecked data buffer\n"); - if (has_metadata) { - if (!supports_metadata) - return -EINVAL; - - if (!nvme_ctrl_meta_sgl_supported(ctrl)) - dev_warn_once(ctrl->device, - "using unchecked metadata buffer\n"); - } + if (has_metadata && !supports_metadata) + return -EINVAL; if (iter) ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL); From 2279cd9c61a330e5de4d6eb0bc422820dd6fdf36 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 May 2026 06:16:02 -0700 Subject: [PATCH 189/377] nvme: fix bio leak on mapping failure The local bio is always NULL, so we'd leak the bio if the integrity mapping failed. Just get it directly from the request. Fixes: d0d1d522316e91f ("blk-map: provide the bdev to bio if one exists") Reviewed-by: Sagi Grimberg Reviewed-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e232188ccd02..08889b20e5d8 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -121,7 +121,6 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct block_device *bdev = ns ? ns->disk->part0 : NULL; bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); bool has_metadata = meta_buffer && meta_len; - struct bio *bio = NULL; int ret; if (has_metadata && !supports_metadata) @@ -145,8 +144,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, return ret; out_unmap: - if (bio) - blk_rq_unmap_user(bio); + if (req->bio) + blk_rq_unmap_user(req->bio); return ret; } From a891962ae5e48fb5476c23631df759482e62ab16 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 30 Apr 2026 15:22:32 +0200 Subject: [PATCH 190/377] nvmet-auth: Do not print DH-HMAC-CHAP secrets From a security standpoint we should not allow to print out the DH-HMAC-CHAP secrets, but at the same time having them is useful for debugging authentication failures. So add a Kconfig option NVME_TARGET_AUTH_DEBUG to only enable debugging if explictly requested at build time. Reviewed-by: Sagi Grimberg Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/Kconfig | 9 +++++++++ drivers/nvme/target/auth.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4904097dfd49..69bde270115e 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -117,6 +117,15 @@ config NVME_TARGET_AUTH If unsure, say N. +config NVME_TARGET_AUTH_DEBUG + bool "NVMe over Fabrics In-band Authentication debug messages" + depends on NVME_TARGET_AUTH + help + This enables additional debug messages including the generated + DH-HMAC-CHAP secrets to help debugging authentication failures. + + If unsure, say N. + config NVME_TARGET_PCI_EPF tristate "NVMe PCI Endpoint Function target support" depends on NVME_TARGET && PCI_ENDPOINT diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 9a2eccdc8b13..edb9627d97b0 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -144,7 +144,6 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) goto out_unlock; list_for_each_entry(p, &ctrl->subsys->hosts, entry) { - pr_debug("check %s\n", nvmet_host_name(p->host)); if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn)) continue; host = p->host; @@ -189,11 +188,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) ctrl->host_key = NULL; goto out_free_hash; } +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: using hash %s key %*ph\n", __func__, ctrl->host_key->hash > 0 ? nvme_auth_hmac_name(ctrl->host_key->hash) : "none", (int)ctrl->host_key->len, ctrl->host_key->key); - +#endif nvme_auth_free_key(ctrl->ctrl_key); if (!host->dhchap_ctrl_secret) { ctrl->ctrl_key = NULL; @@ -207,11 +207,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) ctrl->ctrl_key = NULL; goto out_free_hash; } +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: using ctrl hash %s key %*ph\n", __func__, ctrl->ctrl_key->hash > 0 ? nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none", (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key); - +#endif out_free_hash: if (ret) { if (ctrl->host_key) { @@ -317,7 +318,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, if (ret) goto out_free_challenge; } - pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); @@ -434,8 +434,10 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req, ret = -EINVAL; } else { memcpy(buf, ctrl->dh_key, buf_size); +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: ctrl %d public key %*ph\n", __func__, ctrl->cntlid, (int)buf_size, buf); +#endif } return ret; @@ -458,11 +460,12 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, ctrl->shash_id); if (ret) pr_debug("failed to compute session key, err %d\n", ret); +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG else pr_debug("%s: session key %*ph\n", __func__, (int)req->sq->dhchap_skey_len, req->sq->dhchap_skey); - +#endif return ret; } From b35a13036755c5803168a7cb93bc66035c3e65b8 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 29 Apr 2026 16:11:16 +0800 Subject: [PATCH 191/377] nvme-pci: fix use-after-free in nvme_free_host_mem() nvme_free_host_mem() frees dev->hmb_sgt via dma_free_noncontiguous() but never clears the pointer afterward. This leads to a use-after-free if nvme_free_host_mem() is called twice in the same error path. This can happen during nvme_probe() when nvme_setup_host_mem() succeeds in allocating the HMB (setting dev->hmb_sgt) but nvme_set_host_mem() fails with an I/O error: nvme_setup_host_mem() nvme_alloc_host_mem_single() -> sets dev->hmb_sgt nvme_set_host_mem() -> fails with -EIO nvme_free_host_mem() -> frees hmb_sgt, but does NOT NULL it return error nvme_probe() error path: nvme_free_host_mem() -> dev->hmb_sgt is stale, use-after-free The second call dereferences the freed sgt, causing a NULL pointer dereference in iommu_dma_free_noncontiguous() when it accesses sgt->sgl->dma_address (the backing memory has been freed and zeroed). This is reproducible on Thunderbolt-attached NVMe devices (e.g., OWC Envoy Express behind a Dell WD22TB4 dock) where the device intermittently returns I/O errors during HMB setup due to PCIe link instability. BUG: kernel NULL pointer dereference, address: 0000000000000010 RIP: 0010:iommu_dma_free_noncontiguous+0x22/0x80 Call Trace: dma_free_noncontiguous+0x3b/0x130 nvme_free_host_mem+0x30/0xf0 [nvme] nvme_probe.cold+0xcc/0x275 [nvme] local_pci_probe+0x43/0xa0 pci_device_probe+0xeea/0x290 really_probe+0xf9/0x3b0 __driver_probe_device+0x8b/0x170 driver_probe_device+0x24/0xd0 __driver_attach_async_helper+0x6b/0x110 async_run_entry_fn+0x37/0x170 process_one_work+0x1ac/0x3d0 worker_thread+0x1b8/0x360 kthread+0xf7/0x130 ret_from_fork+0x2d8/0x3a0 ret_from_fork_asm+0x1a/0x30 Fix this by setting dev->hmb_sgt to NULL after freeing it, so the second call takes the multi-descriptor path which safely handles the already-cleaned-up state. Fixes: 63a5c7a4b4c4 ("nvme-pci: use dma_alloc_noncontigous if possible") Signed-off-by: Chia-Lin Kao (AceLan) Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9fd04cd7c5cb..94c423bed947 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2533,11 +2533,13 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev) static void nvme_free_host_mem(struct nvme_dev *dev) { - if (dev->hmb_sgt) + if (dev->hmb_sgt) { dma_free_noncontiguous(dev->dev, dev->host_mem_size, dev->hmb_sgt, DMA_BIDIRECTIONAL); - else + dev->hmb_sgt = NULL; + } else { nvme_free_host_mem_multi(dev); + } dma_free_coherent(dev->dev, dev->host_mem_descs_size, dev->host_mem_descs, dev->host_mem_descs_dma); From dbbd07d0a7020b80f6a7028e561908f7b83b3d5a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 10 May 2026 23:30:29 +0300 Subject: [PATCH 192/377] nvmet-tcp: Fix potential UAF when ddgst mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shivam Kumar found via vulnerability testing: When data digest is enabled on an NVMe/TCP connection and a digest mismatch occurs on a non-final H2C_DATA PDU during an R2T-based data transfer, the digest error handler in nvmet_tcp_try_recv_ddgst() calls nvmet_req_uninit() — which performs percpu_ref_put() on the submission queue — but does NOT mark the command as completed. It does not set cqe->status, does not modify rbytes_done, and does not clear any flag. When the subsequent fatal error triggers queue teardown, nvmet_tcp_uninit_data_in_cmds() iterates all commands, checks nvmet_tcp_need_data_in() for each one, and finds that the already-uninited command still appears to need data (because rbytes_done < transfer_len and cqe->status == 0). It therefore calls nvmet_req_uninit() a second time on the same command — a double percpu_ref_put against a single percpu_ref_get. Reported-by: Shivam Kumar Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 164a564ba3b4..20f150d17a96 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1321,8 +1321,10 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) queue->idx, cmd->req.cmd->common.command_id, queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), le32_to_cpu(cmd->exp_ddgst)); - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) { + cmd->req.cqe->status = NVME_SC_CMD_SEQ_ERROR; nvmet_req_uninit(&cmd->req); + } nvmet_tcp_free_cmd_buffers(cmd); ret = -EPROTO; goto out; From 40df2172853ffc0f84cca3906f5c4d9a6131195d Mon Sep 17 00:00:00 2001 From: AlanCui4080 Date: Fri, 8 May 2026 17:59:12 +0800 Subject: [PATCH 193/377] Revert "nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808" This reverts commit 7f991e3f9b8f044640bcb5fa8570350a68932843 ("nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808") The incorrect implementations of SUBNQN is a known issue in a massive number of NVMe units. However, the warning "nvme nvmex: missing or invalid SUBNQN field." is usually appropriate and will not affect performance or behavior etc. That is because the support for SUBNQN is mandatory if the controller supports NVMe revision 1.2.1 or greater, and it reported itself without a SUBNQN field which breaks compliance with the specification. It should be not quirked by the Linux Kernel. Signed-off-by: Alan Cui Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 94c423bed947..139a10cd687f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4109,8 +4109,6 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0555), /* Memblaze Pblaze5 adapter */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, - { PCI_DEVICE(0x144d, 0xa808), /* Samsung PM981/983 */ - .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ From c1fa0bb633e4a6b11e83ffc57fa5abe8ebb87891 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 11 May 2026 08:55:11 -0700 Subject: [PATCH 194/377] exit: prevent preemption of oopsing TASK_DEAD task When an already-exiting task oopses, make_task_dead() currently calls do_task_dead() with preemption enabled. That is forbidden: do_task_dead() calls __schedule(), which has a comment saying "WARNING: must be called with preemption disabled!". If an oopsing task is preempted in do_task_dead(), between becoming TASK_DEAD and entering the scheduler explicitly, bad things happen: finish_task_switch() assumes that once the scheduler has switched away from a TASK_DEAD task, the task can never run again and its stack is no longer needed; but that assumption apparently doesn't hold if the dead task was preempted (the SM_PREEMPT case). This means that the scheduler ends up repeatedly dropping references on the dead task's stack, which can lead to use-after-free or double-free of the entire task stack; in other words, two tasks can end up running on the same stack, resulting in various kinds of memory corruption. (This does not just affect "recursively oopsing" tasks; it is enough to oops once during task exit, for example in a file_operations::release handler) Fixes: 7f80a2fd7db9 ("exit: Stop poorly open coding do_task_dead in make_task_dead") Cc: stable@kernel.org Signed-off-by: Jann Horn Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e7..9a909993ab1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1073,6 +1073,7 @@ void __noreturn make_task_dead(int signr) futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); + preempt_disable(); do_task_dead(); } From e4865a56d013e86e46ea6acea15bb6eae01898ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 May 2026 20:04:33 +0200 Subject: [PATCH 195/377] ACPI: driver: Check ACPI_COMPANION() against NULL during probe Since every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), platform drivers that rely on the existence of a device's ACPI companion object should verify its presence. Accordingly, add requisite ACPI_COMPANION() or ACPI_HANDLE() checks against NULL to 13 platform drivers handling core ACPI devices. Also change the value returned by the ACPI thermal zone driver when the device's ACPI companion is not present to -ENODEV for consistency with the other drivers. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/4516068.ejJDZkT8p0@rafael.j.wysocki Cc: 7.0+ # 7.0+ --- drivers/acpi/ac.c | 6 +++++- drivers/acpi/acpi_pad.c | 6 +++++- drivers/acpi/acpi_tad.c | 6 +++++- drivers/acpi/battery.c | 6 +++++- drivers/acpi/button.c | 9 +++++++-- drivers/acpi/ec.c | 6 +++++- drivers/acpi/hed.c | 6 +++++- drivers/acpi/nfit/core.c | 6 +++++- drivers/acpi/pfr_telemetry.c | 6 +++++- drivers/acpi/pfr_update.c | 6 +++++- drivers/acpi/sbs.c | 6 +++++- drivers/acpi/sbshc.c | 6 +++++- drivers/acpi/thermal.c | 2 +- drivers/acpi/tiny-power-button.c | 6 +++++- 14 files changed, 68 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index e9e970fd8f33..27f31744f29e 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -192,11 +192,15 @@ static const struct dmi_system_id ac_dmi_table[] __initconst = { static int acpi_ac_probe(struct platform_device *pdev) { - struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); struct power_supply_config psy_cfg = {}; + struct acpi_device *adev; struct acpi_ac *ac; int result; + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; + ac = kzalloc_obj(struct acpi_ac); if (!ac) return -ENOMEM; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 0a8e02bc8c8b..ec94b09bb747 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -423,7 +423,11 @@ static void acpi_pad_notify(acpi_handle handle, u32 event, void *data) static int acpi_pad_probe(struct platform_device *pdev) { - struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct acpi_device *adev; + + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; return acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_pad_notify, adev); diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index cac07e997028..386fc1abcbdc 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -815,12 +815,16 @@ static void acpi_tad_remove(void *data) static int acpi_tad_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - acpi_handle handle = ACPI_HANDLE(dev); struct acpi_tad_driver_data *dd; + acpi_handle handle; acpi_status status; unsigned long long caps; int ret; + handle = ACPI_HANDLE(dev); + if (!handle) + return -ENODEV; + /* * Initialization failure messages are mostly about firmware issues, so * print them at the "info" level. diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index b4c25474f42f..d4ae21a71007 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -1214,10 +1214,14 @@ static void sysfs_battery_cleanup(struct acpi_battery *battery) static int acpi_battery_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); struct acpi_battery *battery; + struct acpi_device *device; int result; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (device->dep_unmet) return -EPROBE_DEFER; diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index dc064a388c23..b47301ee4c8a 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -531,15 +531,20 @@ static int acpi_lid_input_open(struct input_dev *input) static int acpi_button_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); acpi_notify_handler handler; + struct acpi_device *device; struct acpi_button *button; struct input_dev *input; - const char *hid = acpi_device_hid(device); acpi_status status; char *name, *class; + const char *hid; int error = 0; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + + hid = acpi_device_hid(device); if (!strcmp(hid, ACPI_BUTTON_HID_LID) && lid_init_state == ACPI_BUTTON_LID_INIT_DISABLED) return -ENODEV; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 45204538ed87..64ad4cfa6208 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1676,10 +1676,14 @@ static int acpi_ec_setup(struct acpi_ec *ec, struct acpi_device *device, bool ca static int acpi_ec_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; struct acpi_ec *ec; int ret; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (boot_ec && (boot_ec->handle == device->handle || !strcmp(acpi_device_hid(device), ACPI_ECDT_HID))) { /* Fast path: this device corresponds to the boot EC. */ diff --git a/drivers/acpi/hed.c b/drivers/acpi/hed.c index 4d5e12ed6f3c..060e8d670f5d 100644 --- a/drivers/acpi/hed.c +++ b/drivers/acpi/hed.c @@ -50,9 +50,13 @@ static void acpi_hed_notify(acpi_handle handle, u32 event, void *data) static int acpi_hed_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; int err; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + /* Only one hardware error device */ if (hed_handle) return -EINVAL; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index d13264fb9e02..9304ac996d41 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3341,12 +3341,16 @@ static int acpi_nfit_probe(struct platform_device *pdev) struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_nfit_desc *acpi_desc; struct device *dev = &pdev->dev; - struct acpi_device *adev = ACPI_COMPANION(dev); struct acpi_table_header *tbl; + struct acpi_device *adev; acpi_status status = AE_OK; acpi_size sz; int rc = 0; + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; + rc = acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify, dev); if (rc) diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 32bdf8cbe8f2..2387376832a1 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -360,10 +360,14 @@ static void pfrt_log_put_idx(void *data) static int acpi_pfrt_log_probe(struct platform_device *pdev) { - acpi_handle handle = ACPI_HANDLE(&pdev->dev); struct pfrt_log_device *pfrt_log_dev; + acpi_handle handle; int ret; + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + if (!acpi_has_method(handle, "_DSM")) { dev_dbg(&pdev->dev, "Missing _DSM\n"); return -ENODEV; diff --git a/drivers/acpi/pfr_update.c b/drivers/acpi/pfr_update.c index 11b1c2828005..6283105bb0e8 100644 --- a/drivers/acpi/pfr_update.c +++ b/drivers/acpi/pfr_update.c @@ -538,10 +538,14 @@ static void pfru_put_idx(void *data) static int acpi_pfru_probe(struct platform_device *pdev) { - acpi_handle handle = ACPI_HANDLE(&pdev->dev); struct pfru_device *pfru_dev; + acpi_handle handle; int ret; + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + if (!acpi_has_method(handle, "_DSM")) { dev_dbg(&pdev->dev, "Missing _DSM\n"); return -ENODEV; diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 440f1d69aca8..86b7c7975852 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -629,11 +629,15 @@ static void acpi_sbs_callback(void *context) static int acpi_sbs_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; struct acpi_sbs *sbs; int result = 0; int id; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + sbs = kzalloc_obj(struct acpi_sbs); if (!sbs) { result = -ENOMEM; diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index f413270415b6..c0ffa267f96c 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -237,11 +237,15 @@ static int smbus_alarm(void *context) static int acpi_smbus_hc_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; int status; unsigned long long val; struct acpi_smb_hc *hc; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + status = acpi_evaluate_integer(device->handle, "_EC", NULL, &val); if (ACPI_FAILURE(status)) { pr_err("error obtaining _EC.\n"); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index b8b487d89d25..dfc7daa809b5 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -789,7 +789,7 @@ static int acpi_thermal_probe(struct platform_device *pdev) int i; if (!device) - return -EINVAL; + return -ENODEV; tz = kzalloc_obj(struct acpi_thermal); if (!tz) diff --git a/drivers/acpi/tiny-power-button.c b/drivers/acpi/tiny-power-button.c index 531e65b01bcb..92516ef84b02 100644 --- a/drivers/acpi/tiny-power-button.c +++ b/drivers/acpi/tiny-power-button.c @@ -38,9 +38,13 @@ static u32 acpi_tiny_power_button_event(void *not_used) static int acpi_tiny_power_button_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; acpi_status status; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) { status = acpi_install_fixed_event_handler(ACPI_EVENT_POWER_BUTTON, acpi_tiny_power_button_event, From 37953cec775ed34e59cf9a7d7bb9b0610daa3f3e Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 8 May 2026 15:33:29 +0200 Subject: [PATCH 196/377] nvme: fix race condition between connected uevent and STARTED_ONCE flag When a controller connects, nvme_start_ctrl() emits the "NVME_EVENT=connected" uevent and sets the NVME_CTRL_STARTED_ONCE flag. Currently, the uevent is emitted before the flag is set. This creates a race condition for userspace tools (like udev rules) that might rely on the "connected" event to configure other attributes. Swap the order of operations in nvme_start_ctrl() so that the NVME_CTRL_STARTED_ONCE flag is set before the uevent is sent. This guarantees that the admin_timeout can already be changed when userspace is notified. Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1e7e42b43aa3..c3032d6ad6b1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5045,8 +5045,8 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_mpath_update(ctrl); } - nvme_change_uevent(ctrl, "NVME_EVENT=connected"); set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags); + nvme_change_uevent(ctrl, "NVME_EVENT=connected"); } EXPORT_SYMBOL_GPL(nvme_start_ctrl); From 20c39819a27646573dfa0ac0d01c38895298a6f6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:38 -0600 Subject: [PATCH 197/377] io_uring: hold uring_lock when walking link chain in io_wq_free_work() io_wq_free_work() calls io_req_find_next() from io-wq worker context, which reads and clears req->link without holding any lock. This can potentially race with other paths that mutate the same chain under ctx->uring_lock. Take ctx->uring_lock around the io_req_find_next() call. Only requests with IO_REQ_LINK_FLAGS reach this path, which is not the hot path. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4ed998d60c09..2ebb0ba37c4f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1452,8 +1452,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test_atomic(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) + if (req->flags & IO_REQ_LINK_FLAGS) { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); nxt = io_req_find_next(req); + mutex_unlock(&ctx->uring_lock); + } io_free_req(req); } return nxt ? &nxt->work : NULL; From 49ae66eb8c27375075ffa308cfd4bf25af335d41 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:50 -0600 Subject: [PATCH 198/377] io_uring: defer linked-timeout chain splice out of hrtimer context io_link_timeout_fn() is the hrtimer callback that fires when a linked timeout expires. It currently calls io_remove_next_linked(prev) under ctx->timeout_lock to splice the timeout request out of the link chain. This is the only chain-mutation site that runs without ctx->uring_lock, because hrtimer callbacks cannot take a mutex. Defer the splicing until the task_work callback. Signed-off-by: Jens Axboe --- io_uring/timeout.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e2595cae2b07..6353a4d979dc 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); + + /* If this is NULL, then timer already claimed it and will complete it */ + if (!timeout->head) + return NULL; timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); @@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) int ret; if (prev) { + /* + * splice the linked timeout out of prev's chain if the regular + * completion path didn't already do it. + */ + if (prev->link == req) + prev->link = req->link; + req->link = NULL; + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, @@ -401,10 +413,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) /* * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. + * race with the completion of the linked work. Splice of prev is + * done in io_req_task_link_timeout(), if needed. */ if (prev) { - io_remove_next_linked(prev); if (!req_ref_inc_not_zero(prev)) prev = NULL; } From a65855ec34aed84e1e5b4aea0323cc1745f83a5c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:56 -0600 Subject: [PATCH 199/377] io_uring: hold uring_lock across io_kill_timeouts() in cancel path io_uring_try_cancel_requests() dropped ctx->uring_lock before calling io_kill_timeouts(), which walks each timeout's link chain via io_match_task() to test REQ_F_INFLIGHT. With chain mutation now serialized by ctx->uring_lock, that walk needs the lock too. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 5e5eb9cfc7cd..4aa3103ba9c3 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); - mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); if (tctx) ret |= io_run_task_work() > 0; else From 35d0ed82d03e5ee77ea4f31f20e29562a7721649 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 5 May 2026 11:08:12 +0200 Subject: [PATCH 200/377] libceph: Fix potential out-of-bounds access in osdmap_decode() When decoding osd_state and osd_weight from an incoming osdmap in osdmap_decode(), both are decoded for each osd, i.e., map->max_osd times. The ceph_decode_need() check only accounts for sizeof(*map->osd_weight) once. This can potentially result in an out-of-bounds memory access if the incoming message is corrupted such that the max_osd value exceeds the actual content of the osdmap message. This patch fixes the issue by changing the corresponding part in the ceph_decode_need() check to account for map->max_osd*sizeof(*map->osd_weight). Cc: stable@vger.kernel.org Fixes: dcbc919a5dc8 ("libceph: switch osdmap decoding to use ceph_decode_entity_addr") Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 669348d883f0..2095e73ccf6c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1705,7 +1705,7 @@ static int osdmap_decode(void **p, void *end, bool msgr2, ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + - sizeof(*map->osd_weight), e_inval); + map->max_osd*sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; From 5dd74441cbf42c22e874450eb6a6bbb19390a216 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:31 +0800 Subject: [PATCH 201/377] cgroup/cpuset: Reserve DL bandwidth only for root-domain moves cpuset_can_attach() currently adds the bandwidth of all migrating SCHED_DEADLINE tasks to sum_migrate_dl_bw. If the source and destination cpuset effective CPU masks do not overlap, the whole sum is then reserved in the destination root domain. set_cpus_allowed_dl(), however, subtracts bandwidth from the source root domain only when the affinity change really moves the task between root domains. A DL task can move between cpusets that are still in the same root domain, so including that task in sum_migrate_dl_bw can reserve destination bandwidth without a matching source-side subtraction. Share the root-domain move test with set_cpus_allowed_dl(). Keep nr_migrate_dl_tasks counting all migrating deadline tasks for cpuset DL task accounting, but add to sum_migrate_dl_bw only for tasks that need a root-domain bandwidth move. Keep using the destination cpuset effective CPU mask and leave the broader can_attach()/attach() transaction model unchanged. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Acked-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Tejun Heo --- include/linux/sched/deadline.h | 9 +++++++++ kernel/cgroup/cpuset-internal.h | 1 + kernel/cgroup/cpuset.c | 35 ++++++++++++++++++--------------- kernel/sched/deadline.c | 13 +++++++++--- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea30..f7aaf01f7cd5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. */ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3fbf6e7f68c3..e84e801e22cf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2993,7 +2993,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,29 +3038,32 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - ret = -EINVAL; - goto out_unlock; - } - - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) - goto out_unlock; - - cs->dl_bw_cpu = cpu; + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; } + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; + + cs->dl_bw_cpu = cpu; + out_success: /* * Mark attach is in progress. This makes validate_change() fail diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..7db4c87df83b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { From 92f3403a7ca5d186b2bad342603410cc6adc95a3 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 May 2026 18:50:27 +0530 Subject: [PATCH 202/377] drm/xe/madvise: Track purgeability with BO-local counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xe_bo_recompute_purgeable_state() walks all VMAs of a BO to determine whether the BO can be made purgeable. This makes VMA create/destroy and madvise updates O(n) in the number of mappings. Replace the walk with BO-local counters protected by the BO dma-resv lock: - vma_count tracks the number of VMAs mapping the BO. - willneed_count tracks active WILLNEED holders, including WILLNEED VMAs and active dma-buf exports for non-imported BOs. A DONTNEED BO is promoted back to WILLNEED on a 0->1 transition of willneed_count. A BO is demoted to DONTNEED on a 1->0 transition only when it still has VMAs, preserving the previous behaviour where a BO with no mappings keeps its current madvise state. PURGED remains terminal, preserving the existing "once purged, always purged" rule. Fixes: 4f44961eab84 ("drm/xe/vm: Prevent binding of purged buffer objects") v2: - Use early return for imported BOs in all four helpers to avoid nesting (Matt B). - Group purgeability state into a purgeable sub-struct on struct xe_bo (Matt B). - Reword xe_bo_willneed_put_locked() kernel-doc to explain that a 1->0 transition means all remaining active VMAs are DONTNEED (Matt B). v3: - Move DONTNEED/PURGED reject from vma_lock_and_validate() into xe_vma_create(), gated on attr->purgeable_state == WILLNEED. Fixes vm_bind bypass and partial-unbind rejection on DONTNEED BOs (Matt B). - Drop .check_purged from MAP and REMAP; keep it for PREFETCH and add a comment why (Matt B). - Skip BO validation in vma_lock_and_validate() for non-WILLNEED VMA remnants so cleanup/remap paths do not repopulate DONTNEED/PURGED BOs. Suggested-by: Thomas Hellström Cc: Matthew Brost Cc: Thomas Hellström Cc: Himal Prasad Ghimiray Signed-off-by: Arvind Yadav Reviewed-by: Matthew Brost Link: https://patch.msgid.link/20260506132027.2556046-1-arvind.yadav@intel.com Signed-off-by: Himal Prasad Ghimiray (cherry picked from commit 23fb2ea56cb4fa2587bc072b04e4e698687a48e4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_bo.c | 6 +- drivers/gpu/drm/xe/xe_bo.h | 88 +++++++++++++++- drivers/gpu/drm/xe/xe_bo_types.h | 28 ++++- drivers/gpu/drm/xe/xe_dma_buf.c | 28 ++++- drivers/gpu/drm/xe/xe_vm.c | 51 +++++++-- drivers/gpu/drm/xe/xe_vm_madvise.c | 162 ++--------------------------- drivers/gpu/drm/xe/xe_vm_madvise.h | 2 - 7 files changed, 190 insertions(+), 175 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 4075edf97421..6b518858538f 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -897,10 +897,10 @@ void xe_bo_set_purgeable_state(struct xe_bo *bo, new_state == XE_MADV_PURGEABLE_PURGED); /* Once purged, always purged - cannot transition out */ - xe_assert(xe, !(bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED && + xe_assert(xe, !(bo->purgeable.state == XE_MADV_PURGEABLE_PURGED && new_state != XE_MADV_PURGEABLE_PURGED)); - bo->madv_purgeable = new_state; + bo->purgeable.state = new_state; xe_bo_set_purgeable_shrinker(bo, new_state); } @@ -2368,7 +2368,7 @@ struct xe_bo *xe_bo_init_locked(struct xe_device *xe, struct xe_bo *bo, INIT_LIST_HEAD(&bo->vram_userfault_link); /* Initialize purge advisory state */ - bo->madv_purgeable = XE_MADV_PURGEABLE_WILLNEED; + bo->purgeable.state = XE_MADV_PURGEABLE_WILLNEED; drm_gem_private_object_init(&xe->drm, &bo->ttm.base, size); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index 68dea7d25a6b..6340317f7d2e 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -251,7 +251,7 @@ static inline bool xe_bo_is_protected(const struct xe_bo *bo) static inline bool xe_bo_is_purged(struct xe_bo *bo) { xe_bo_assert_held(bo); - return bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED; + return bo->purgeable.state == XE_MADV_PURGEABLE_PURGED; } /** @@ -268,11 +268,95 @@ static inline bool xe_bo_is_purged(struct xe_bo *bo) static inline bool xe_bo_madv_is_dontneed(struct xe_bo *bo) { xe_bo_assert_held(bo); - return bo->madv_purgeable == XE_MADV_PURGEABLE_DONTNEED; + return bo->purgeable.state == XE_MADV_PURGEABLE_DONTNEED; } void xe_bo_set_purgeable_state(struct xe_bo *bo, enum xe_madv_purgeable_state new_state); +/** + * xe_bo_willneed_get_locked() - Acquire a WILLNEED holder on a BO + * @bo: Buffer object + * + * Increments willneed_count and, on a 0->1 transition, promotes the BO + * from DONTNEED to WILLNEED. PURGED is terminal and is never modified. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_willneed_get_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + /* Imported BOs are owned externally; do not track purgeability. */ + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + if (bo->purgeable.willneed_count++ == 0 && xe_bo_madv_is_dontneed(bo)) + xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_WILLNEED); +} + +/** + * xe_bo_willneed_put_locked() - Release a WILLNEED holder on a BO + * @bo: Buffer object + * + * Decrements willneed_count and, on a 1->0 transition, marks the BO + * DONTNEED only if it still has VMAs (implying all active VMAs are + * DONTNEED). If the last VMA is being removed, preserve the current BO + * state to match the previous VMA-walk semantics. + * + * PURGED is terminal and the BO state is never modified. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_willneed_put_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + xe_assert(xe_bo_device(bo), bo->purgeable.willneed_count > 0); + if (--bo->purgeable.willneed_count == 0 && bo->purgeable.vma_count > 0 && + !xe_bo_is_purged(bo)) + xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_DONTNEED); +} + +/** + * xe_bo_vma_count_inc_locked() - Account a new VMA on a BO + * @bo: Buffer object + * + * Increments vma_count. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_vma_count_inc_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + bo->purgeable.vma_count++; +} + +/** + * xe_bo_vma_count_dec_locked() - Account a VMA removal on a BO + * @bo: Buffer object + * + * Decrements vma_count. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_vma_count_dec_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + xe_assert(xe_bo_device(bo), bo->purgeable.vma_count > 0); + bo->purgeable.vma_count--; +} + static inline void xe_bo_unpin_map_no_vm(struct xe_bo *bo) { if (likely(bo)) { diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h index 9d19940b8fc0..077e35b4cdce 100644 --- a/drivers/gpu/drm/xe/xe_bo_types.h +++ b/drivers/gpu/drm/xe/xe_bo_types.h @@ -111,10 +111,32 @@ struct xe_bo { u64 min_align; /** - * @madv_purgeable: user space advise on BO purgeability, protected - * by BO's dma-resv lock. + * @purgeable: Purgeability state and accounting. + * + * All fields are protected by the BO's dma-resv lock. */ - u32 madv_purgeable; + struct { + /** + * @purgeable.state: BO purgeability state + * (WILLNEED/DONTNEED/PURGED). + */ + u32 state; + + /** + * @purgeable.vma_count: Number of VMAs currently mapping this BO. + */ + u32 vma_count; + + /** + * @purgeable.willneed_count: Number of active WILLNEED holders. + * + * Counts WILLNEED VMAs plus active dma-buf exports for + * non-imported BOs. The BO flips to DONTNEED on a 1->0 + * transition only when VMAs still exist; if the last VMA is + * removed, the previous BO state is preserved. + */ + u32 willneed_count; + } purgeable; }; #endif diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index b9828da15897..855d32ba314d 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -193,6 +193,18 @@ static int xe_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, return 0; } +static void xe_dma_buf_release(struct dma_buf *dmabuf) +{ + struct drm_gem_object *obj = dmabuf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + + xe_bo_lock(bo, false); + xe_bo_willneed_put_locked(bo); + xe_bo_unlock(bo); + + drm_gem_dmabuf_release(dmabuf); +} + static const struct dma_buf_ops xe_dmabuf_ops = { .attach = xe_dma_buf_attach, .detach = xe_dma_buf_detach, @@ -200,7 +212,7 @@ static const struct dma_buf_ops xe_dmabuf_ops = { .unpin = xe_dma_buf_unpin, .map_dma_buf = xe_dma_buf_map, .unmap_dma_buf = xe_dma_buf_unmap, - .release = drm_gem_dmabuf_release, + .release = xe_dma_buf_release, .begin_cpu_access = xe_dma_buf_begin_cpu_access, .mmap = drm_gem_dmabuf_mmap, .vmap = drm_gem_dmabuf_vmap, @@ -241,18 +253,26 @@ struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags) ret = -EINVAL; goto out_unlock; } + + xe_bo_willneed_get_locked(bo); xe_bo_unlock(bo); ret = ttm_bo_setup_export(&bo->ttm, &ctx); if (ret) - return ERR_PTR(ret); + goto out_put; buf = drm_gem_prime_export(obj, flags); - if (!IS_ERR(buf)) - buf->ops = &xe_dmabuf_ops; + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_put; + } + buf->ops = &xe_dmabuf_ops; return buf; +out_put: + xe_bo_lock(bo, false); + xe_bo_willneed_put_locked(bo); out_unlock: xe_bo_unlock(bo); return ERR_PTR(ret); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index a717a2b8dea3..ab6cc1f0a789 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1120,6 +1120,25 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm, xe_bo_assert_held(bo); + /* + * Reject only WILLNEED mappings on DONTNEED/PURGED BOs. This + * gates new vm_bind ioctls (user supplies WILLNEED) while + * still allowing partial-unbind / remap splits whose new VMAs + * inherit the parent's DONTNEED attr. It must also run before + * xe_bo_willneed_get_locked() below so a 0->1 holder bump + * cannot silently promote DONTNEED back to WILLNEED. + */ + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) { + if (xe_bo_madv_is_dontneed(bo)) { + xe_vma_free(vma); + return ERR_PTR(-EBUSY); + } + if (xe_bo_is_purged(bo)) { + xe_vma_free(vma); + return ERR_PTR(-EINVAL); + } + } + vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base); if (IS_ERR(vm_bo)) { xe_vma_free(vma); @@ -1131,6 +1150,10 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm, vma->gpuva.gem.offset = bo_offset_or_userptr; drm_gpuva_link(&vma->gpuva, vm_bo); drm_gpuvm_bo_put(vm_bo); + + xe_bo_vma_count_inc_locked(bo); + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) + xe_bo_willneed_get_locked(bo); } else /* userptr or null */ { if (!is_null && !is_cpu_addr_mirror) { struct xe_userptr_vma *uvma = to_userptr_vma(vma); @@ -1208,7 +1231,10 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) xe_bo_assert_held(bo); drm_gpuva_unlink(&vma->gpuva); - xe_bo_recompute_purgeable_state(bo); + + xe_bo_vma_count_dec_locked(bo); + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) + xe_bo_willneed_put_locked(bo); } xe_vm_assert_held(vm); @@ -3016,7 +3042,7 @@ static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm, * @res_evict: Allow evicting resources during validation * @validate: Perform BO validation * @request_decompress: Request BO decompression - * @check_purged: Reject operation if BO is purged + * @check_purged: Reject operation if BO is DONTNEED or PURGED */ struct xe_vma_lock_and_validate_flags { u32 res_evict : 1; @@ -3030,6 +3056,7 @@ static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma, { struct xe_bo *bo = xe_vma_bo(vma); struct xe_vm *vm = xe_vma_vm(vma); + bool validate_bo = flags.validate; int err = 0; if (bo) { @@ -3044,7 +3071,11 @@ static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma, err = -EINVAL; /* BO already purged */ } - if (!err && flags.validate) + /* Don't validate the BO for DONTNEED/PURGED remap remnants. */ + if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_WILLNEED) + validate_bo = false; + + if (!err && validate_bo) err = xe_bo_validate(bo, vm, xe_vm_allow_vm_eviction(vm) && flags.res_evict, exec); @@ -3152,7 +3183,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, op->map.immediate, .request_decompress = op->map.request_decompress, - .check_purged = true, + .check_purged = false, }); break; case DRM_GPUVA_OP_REMAP: @@ -3174,7 +3205,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, .res_evict = res_evict, .validate = true, .request_decompress = false, - .check_purged = true, + .check_purged = false, }); if (!err && op->remap.next) err = vma_lock_and_validate(exec, op->remap.next, @@ -3182,7 +3213,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, .res_evict = res_evict, .validate = true, .request_decompress = false, - .check_purged = true, + .check_purged = false, }); break; case DRM_GPUVA_OP_UNMAP: @@ -3211,9 +3242,11 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, } /* - * Prefetch attempts to migrate BO's backing store without - * repopulating it first. Purged BOs have no backing store - * to migrate, so reject the operation. + * PREFETCH is the only op that still gates on BO purge state. + * MAP/REMAP handle this inside xe_vma_create() so partial + * unbind on a DONTNEED BO still works. PREFETCH skips + * xe_vma_create() and would migrate a BO with no backing + * store, so reject DONTNEED/PURGED here. */ err = vma_lock_and_validate(exec, gpuva_to_vma(op->base.prefetch.va), diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c index c78906dea82b..c4fb29004195 100644 --- a/drivers/gpu/drm/xe/xe_vm_madvise.c +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c @@ -185,147 +185,6 @@ static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm, } } -/** - * xe_bo_is_dmabuf_shared() - Check if BO is shared via dma-buf - * @bo: Buffer object - * - * Prevent marking imported or exported dma-bufs as purgeable. - * For imported BOs, Xe doesn't own the backing store and cannot - * safely reclaim pages (exporter or other devices may still be - * using them). For exported BOs, external devices may have active - * mappings we cannot track. - * - * Return: true if BO is imported or exported, false otherwise - */ -static bool xe_bo_is_dmabuf_shared(struct xe_bo *bo) -{ - struct drm_gem_object *obj = &bo->ttm.base; - - /* Imported: exporter owns backing store */ - if (drm_gem_is_imported(obj)) - return true; - - /* Exported: external devices may be accessing */ - if (obj->dma_buf) - return true; - - return false; -} - -/** - * enum xe_bo_vmas_purge_state - VMA purgeable state aggregation - * - * Distinguishes whether a BO's VMAs are all DONTNEED, have at least - * one WILLNEED, or have no VMAs at all. - * - * Enum values align with XE_MADV_PURGEABLE_* states for consistency. - */ -enum xe_bo_vmas_purge_state { - /** @XE_BO_VMAS_STATE_WILLNEED: At least one VMA is WILLNEED */ - XE_BO_VMAS_STATE_WILLNEED = 0, - /** @XE_BO_VMAS_STATE_DONTNEED: All VMAs are DONTNEED */ - XE_BO_VMAS_STATE_DONTNEED = 1, - /** @XE_BO_VMAS_STATE_NO_VMAS: BO has no VMAs */ - XE_BO_VMAS_STATE_NO_VMAS = 2, -}; - -/* - * xe_bo_recompute_purgeable_state() casts between xe_bo_vmas_purge_state and - * xe_madv_purgeable_state. Enforce that WILLNEED=0 and DONTNEED=1 match across - * both enums so the single-line cast is always valid. - */ -static_assert(XE_BO_VMAS_STATE_WILLNEED == (int)XE_MADV_PURGEABLE_WILLNEED, - "VMA purge state WILLNEED must equal madv purgeable WILLNEED"); -static_assert(XE_BO_VMAS_STATE_DONTNEED == (int)XE_MADV_PURGEABLE_DONTNEED, - "VMA purge state DONTNEED must equal madv purgeable DONTNEED"); - -/** - * xe_bo_all_vmas_dontneed() - Determine BO VMA purgeable state - * @bo: Buffer object - * - * Check all VMAs across all VMs to determine aggregate purgeable state. - * Shared BOs require unanimous DONTNEED state from all mappings. - * - * Caller must hold BO dma-resv lock. - * - * Return: XE_BO_VMAS_STATE_DONTNEED if all VMAs are DONTNEED, - * XE_BO_VMAS_STATE_WILLNEED if at least one VMA is not DONTNEED, - * XE_BO_VMAS_STATE_NO_VMAS if BO has no VMAs - */ -static enum xe_bo_vmas_purge_state xe_bo_all_vmas_dontneed(struct xe_bo *bo) -{ - struct drm_gpuvm_bo *vm_bo; - struct drm_gpuva *gpuva; - struct drm_gem_object *obj = &bo->ttm.base; - bool has_vmas = false; - - xe_bo_assert_held(bo); - - /* Shared dma-bufs cannot be purgeable */ - if (xe_bo_is_dmabuf_shared(bo)) - return XE_BO_VMAS_STATE_WILLNEED; - - drm_gem_for_each_gpuvm_bo(vm_bo, obj) { - drm_gpuvm_bo_for_each_va(gpuva, vm_bo) { - struct xe_vma *vma = gpuva_to_vma(gpuva); - - has_vmas = true; - - /* Any non-DONTNEED VMA prevents purging */ - if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_DONTNEED) - return XE_BO_VMAS_STATE_WILLNEED; - } - } - - /* - * No VMAs => preserve existing BO purgeable state. - * Avoids incorrectly flipping DONTNEED -> WILLNEED when last VMA unmapped. - */ - if (!has_vmas) - return XE_BO_VMAS_STATE_NO_VMAS; - - return XE_BO_VMAS_STATE_DONTNEED; -} - -/** - * xe_bo_recompute_purgeable_state() - Recompute BO purgeable state from VMAs - * @bo: Buffer object - * - * Walk all VMAs to determine if BO should be purgeable or not. - * Shared BOs require unanimous DONTNEED state from all mappings. - * If the BO has no VMAs the existing state is preserved. - * - * Locking: Caller must hold BO dma-resv lock. When iterating GPUVM lists, - * VM lock must also be held (write) to prevent concurrent VMA modifications. - * This is satisfied at both call sites: - * - xe_vma_destroy(): holds vm->lock write - * - madvise_purgeable(): holds vm->lock write (from madvise ioctl path) - * - * Return: nothing - */ -void xe_bo_recompute_purgeable_state(struct xe_bo *bo) -{ - enum xe_bo_vmas_purge_state vma_state; - - if (!bo) - return; - - xe_bo_assert_held(bo); - - /* - * Once purged, always purged. Cannot transition back to WILLNEED. - * This matches i915 semantics where purged BOs are permanently invalid. - */ - if (bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED) - return; - - vma_state = xe_bo_all_vmas_dontneed(bo); - - if (vma_state != (enum xe_bo_vmas_purge_state)bo->madv_purgeable && - vma_state != XE_BO_VMAS_STATE_NO_VMAS) - xe_bo_set_purgeable_state(bo, (enum xe_madv_purgeable_state)vma_state); -} - /** * madvise_purgeable - Handle purgeable buffer object advice * @xe: XE device @@ -359,12 +218,6 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, /* BO must be locked before modifying madv state */ xe_bo_assert_held(bo); - /* Skip shared dma-bufs - no PTEs to zap */ - if (xe_bo_is_dmabuf_shared(bo)) { - vmas[i]->skip_invalidation = true; - continue; - } - /* * Once purged, always purged. Cannot transition back to WILLNEED. * This matches i915 semantics where purged BOs are permanently invalid. @@ -377,13 +230,14 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, switch (op->purge_state_val.val) { case DRM_XE_VMA_PURGEABLE_STATE_WILLNEED: - vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED; vmas[i]->skip_invalidation = true; - - xe_bo_recompute_purgeable_state(bo); + /* Only act on a real DONTNEED -> WILLNEED transition. */ + if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_DONTNEED) { + vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED; + xe_bo_willneed_get_locked(bo); + } break; case DRM_XE_VMA_PURGEABLE_STATE_DONTNEED: - vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED; /* * Don't zap PTEs at DONTNEED time -- pages are still * alive. The zap happens in xe_bo_move_notify() right @@ -391,7 +245,11 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, */ vmas[i]->skip_invalidation = true; - xe_bo_recompute_purgeable_state(bo); + /* Only act on a real WILLNEED -> DONTNEED transition. */ + if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) { + vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED; + xe_bo_willneed_put_locked(bo); + } break; default: /* Should never hit - values validated in madvise_args_are_sane() */ diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.h b/drivers/gpu/drm/xe/xe_vm_madvise.h index 39acd2689ca0..a3078f634c7e 100644 --- a/drivers/gpu/drm/xe/xe_vm_madvise.h +++ b/drivers/gpu/drm/xe/xe_vm_madvise.h @@ -13,6 +13,4 @@ struct xe_bo; int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file); -void xe_bo_recompute_purgeable_state(struct xe_bo *bo); - #endif From 4568dfbb8363a153e7cef384d23cc6d6563ff316 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Thu, 7 May 2026 14:00:15 -0700 Subject: [PATCH 203/377] drm/xe: Make decision to use Xe2-style blitter instructions a feature flag The blitter engines' MEM_COPY and MEM_SET instructions were added as part of the same hardware change that introduced service copy engines (i.e., BCS1-BCS8) which is why the driver checks for service copy engine presence when deciding whether to use these instructions or the older XY_* instructions. However when making this decision the driver should consider which engines are part of the hardware architecture, not which engines are present/usable on the current device. For graphics IP versions that architecturally include service copy engines (i.e., everything Xe2 and later, plus PVC's Xe_HPC) we should use MEM_SET and MEM_COPY even in if all of the service copy engines wind up getting fused off. I.e., we need to decide based on whether the platform's graphics descriptor contains these engines, rather than whether the usable engine mask contains them. This logic got broken when gt->info.__engine_mask was removed, although in practice that mistake has been harmless so far because there haven't been any hardware SKUs that fuse off all of the service copy engines yet. Replace the incorrect has_service_copy_support() function with a GT feature flag that tracks more accurately whether the new blitter instructions are usable. In addition to fixing incorrect logic if all service copies are fused off, the flag also makes it more obvious what the calling code is trying to do; previously it wasn't terribly obvious why "has service copy engines" was being used as the condition for using different instructions on all copy engine types. The new feature flag is named 'has_xe2_blt_instructions' because we expect this flag to be set for all Xe2 and later platforms (i.e., everything officially supported by the Xe driver). Technically there's also one Xe1-era platform (PVC) that supports these engines/instructions and will set this flag, but this still seems to be the most clear and understandable name for the flag. Fixes: 61549a2ee594 ("drm/xe: Drop __engine_mask") Cc: Balasubramani Vivekanandan Reviewed-by: Balasubramani Vivekanandan Link: https://patch.msgid.link/20260507-xe2_copy-v1-1-26506381b821@intel.com Signed-off-by: Matt Roper (cherry picked from commit 09b399842907565a64e351fb22da790b4c673ffb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_types.h | 7 +++++++ drivers/gpu/drm/xe/xe_migrate.c | 18 ++---------------- drivers/gpu/drm/xe/xe_pci.c | 9 +++++++++ 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 8b55cf25a75f..fffb5d631b69 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -144,6 +144,13 @@ struct xe_gt { u8 id; /** @info.has_indirect_ring_state: GT has indirect ring state support */ u8 has_indirect_ring_state:1; + /** + * @info.has_xe2_blt_instructions: GT supports Xe2-style MEM_SET + * and MEM_COPY blitter functionality. Note that despite the + * name, some Xe1 platforms may also support this "Xe2-style" + * feature. + */ + u8 has_xe2_blt_instructions:1; /** * @info.num_geometry_xecore_fuse_regs: Number of 32b-bit fuse * registers the geometry XeCore mask spans. diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 5fdc89ed5256..a22413f892a0 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1524,23 +1524,9 @@ static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, bb->len += len; } -static bool has_service_copy_support(struct xe_gt *gt) -{ - /* - * What we care about is whether the architecture was designed with - * service copy functionality (specifically the new MEM_SET / MEM_COPY - * instructions) so check the architectural engine list rather than the - * actual list since these instructions are usable on BCS0 even if - * all of the actual service copy engines (BCS1-BCS8) have been fused - * off. - */ - return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8, - XE_HW_ENGINE_BCS1); -} - static u32 emit_clear_cmd_len(struct xe_gt *gt) { - if (has_service_copy_support(gt)) + if (gt->info.has_xe2_blt_instructions) return PVC_MEM_SET_CMD_LEN_DW; else return XY_FAST_COLOR_BLT_DW; @@ -1549,7 +1535,7 @@ static u32 emit_clear_cmd_len(struct xe_gt *gt) static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, u32 size, u32 pitch, bool is_vram) { - if (has_service_copy_support(gt)) + if (gt->info.has_xe2_blt_instructions) emit_clear_link_copy(gt, bb, src_ofs, size, pitch); else emit_clear_main_copy(gt, bb, src_ofs, size, pitch, diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 9f98d0334164..c2ecd27ec770 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -851,6 +851,15 @@ static struct xe_gt *alloc_primary_gt(struct xe_tile *tile, gt->info.num_geometry_xecore_fuse_regs = graphics_desc->num_geometry_xecore_fuse_regs; gt->info.num_compute_xecore_fuse_regs = graphics_desc->num_compute_xecore_fuse_regs; + /* + * Even if the service copy engines wind up being fused off, their + * presence in the IP descriptor indicates that the platform supports + * Xe2-style MEM_SET and MEM_COPY functionality. + */ + if (graphics_desc->hw_engine_mask & GENMASK(XE_HW_ENGINE_BCS8, + XE_HW_ENGINE_BCS1)) + gt->info.has_xe2_blt_instructions = true; + /* * Before media version 13, the media IP was part of the primary GT * so we need to add the media engines to the primary GT's engine list. From 981bedbbe61364fcc3a3b87ebaf648a66cd07108 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 8 May 2026 11:26:36 +0100 Subject: [PATCH 204/377] drm/xe/dma-buf: handle empty bo and UAF races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There look to be some nasty races here when triggering the invalidate_mappings hook: 1) We do xe_bo_alloc() followed by the attach, before the actual full bo init step in xe_dma_buf_init_obj(). However the bo is visible on the attachments list after the attach. This is bad since exporter driver, say amdgpu, can at any time call back into our invalidate_mappings hook, with an empty/bogus bo, leading to potential bugs/crashes. 2) Similar to 1) but here we get a UAF, when the invalidate_mappings hook is triggered. For example, we get as far as xe_bo_init_locked() but this fails in some way. But here the bo will be freed on error, but we still have it attached from dma-buf pov, so if the invalidate_mappings is now triggered then the bo we access is gone and we trigger UAF and more bugs/crashes. To fix this, move the attach step until after we actually have a fully set up buffer object. Note that the bo is not published to userspace until later, so not sure what the comment "Don't publish the bo until we have a valid attachment", is referring to. We have at least two different customers reporting hitting a NULL ptr deref in evict_flags when importing something from amdgpu, followed by triggering the evict flow. Hit rate is also pretty low, which would hint at some kind of race, so something like 1) or 2) might explain this. v2: - Shuffle the order of the ops slightly (no functional change) - Improve the comment to better explain the ordering (Matt B) Assisted-by: Gemini:gemini-3 #debug Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/work_items/7903 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/work_items/4055 Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Acked-by: Thomas Hellström Link: https://patch.msgid.link/20260508102635.149172-3-matthew.auld@intel.com (cherry picked from commit af1f2ad0c59fe4e2f924c526f66e968289d77971) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_dma_buf.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 855d32ba314d..f7b47e9d1a4c 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -377,15 +377,25 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, } } - /* - * Don't publish the bo until we have a valid attachment, and a - * valid attachment needs the bo address. So pre-create a bo before - * creating the attachment and publish. - */ bo = xe_bo_alloc(); if (IS_ERR(bo)) return ERR_CAST(bo); + /* + * xe_dma_buf_init_obj() takes ownership of the raw bo, so do not touch + * on fail, since it will already take care of cleanup. On success we + * still need to drop the ref, if something later fails. + * + * In addition this needs to happen before the attach, since + * it will create a new attachment for this, and add it to the list of + * attachments, at which point it is globally visible, and at any point + * the export side can call into on invalidate_mappings callback, which + * require a working object. + */ + obj = xe_dma_buf_init_obj(dev, bo, dma_buf); + if (IS_ERR(obj)) + return obj; + attach_ops = &xe_dma_buf_attach_ops; #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) if (test) @@ -398,21 +408,12 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, goto out_err; } - /* - * xe_dma_buf_init_obj() takes ownership of bo on both success - * and failure, so we must not touch bo after this call. - */ - obj = xe_dma_buf_init_obj(dev, bo, dma_buf); - if (IS_ERR(obj)) { - dma_buf_detach(dma_buf, attach); - return obj; - } get_dma_buf(dma_buf); obj->import_attach = attach; return obj; out_err: - xe_bo_free(bo); + xe_bo_put(bo); return obj; } From 155a372a1cc50fa93387c5d3cdfd614a61e1afd1 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 8 May 2026 11:26:37 +0100 Subject: [PATCH 205/377] drm/xe/dma-buf: fix UAF with retry loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retry doesn't work here, since bo will be freed on error, leading to UAF. However, now that we do the alloc & init before the attach, we can now combine this as one unit and have the init do the alloc for us. This should make the retry safe. Reported by Sashiko. v2: Fix up the error unwind (CI) Closes: https://sashiko.dev/#/patchset/20260506184332.86743-2-matthew.auld%40intel.com Fixes: eb289a5f6cc6 ("drm/xe: Convert xe_dma_buf.c for exhaustive eviction") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.18+ Reviewed-by: Thomas Hellström Link: https://patch.msgid.link/20260508102635.149172-4-matthew.auld@intel.com (cherry picked from commit 479669418253e0f27f8cf5db01a731352ea592e7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_dma_buf.c | 49 ++++++++------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index f7b47e9d1a4c..8a920e58245c 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -278,16 +278,8 @@ struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags) return ERR_PTR(ret); } -/* - * Takes ownership of @storage: on success it is transferred to the returned - * drm_gem_object; on failure it is freed before returning the error. - * This matches the contract of xe_bo_init_locked() which frees @storage on - * its error paths, so callers need not (and must not) free @storage after - * this call. - */ static struct drm_gem_object * -xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, - struct dma_buf *dma_buf) +xe_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf) { struct dma_resv *resv = dma_buf->resv; struct xe_device *xe = to_xe_device(dev); @@ -298,10 +290,8 @@ xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, int ret = 0; dummy_obj = drm_gpuvm_resv_object_alloc(&xe->drm); - if (!dummy_obj) { - xe_bo_free(storage); + if (!dummy_obj) return ERR_PTR(-ENOMEM); - } dummy_obj->resv = resv; xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, ret) { @@ -310,8 +300,7 @@ xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, if (ret) break; - /* xe_bo_init_locked() frees storage on error */ - bo = xe_bo_init_locked(xe, storage, NULL, resv, NULL, dma_buf->size, + bo = xe_bo_init_locked(xe, NULL, NULL, resv, NULL, dma_buf->size, 0, /* Will require 1way or 2way for vm_bind */ ttm_bo_type_sg, XE_BO_FLAG_SYSTEM, &exec); drm_exec_retry_on_contention(&exec); @@ -362,7 +351,6 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, const struct dma_buf_attach_ops *attach_ops; struct dma_buf_attachment *attach; struct drm_gem_object *obj; - struct xe_bo *bo; if (dma_buf->ops == &xe_dmabuf_ops) { obj = dma_buf->priv; @@ -377,22 +365,14 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, } } - bo = xe_bo_alloc(); - if (IS_ERR(bo)) - return ERR_CAST(bo); - /* - * xe_dma_buf_init_obj() takes ownership of the raw bo, so do not touch - * on fail, since it will already take care of cleanup. On success we - * still need to drop the ref, if something later fails. - * - * In addition this needs to happen before the attach, since - * it will create a new attachment for this, and add it to the list of - * attachments, at which point it is globally visible, and at any point - * the export side can call into on invalidate_mappings callback, which - * require a working object. + * This needs to happen before the attach, since it will create a new + * attachment for this, and add it to the list of attachments, at which + * point it is globally visible, and at any point the export side can + * call into on invalidate_mappings callback, which require a working + * object. */ - obj = xe_dma_buf_init_obj(dev, bo, dma_buf); + obj = xe_dma_buf_create_obj(dev, dma_buf); if (IS_ERR(obj)) return obj; @@ -402,20 +382,15 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, attach_ops = test->attach_ops; #endif - attach = dma_buf_dynamic_attach(dma_buf, dev->dev, attach_ops, &bo->ttm.base); + attach = dma_buf_dynamic_attach(dma_buf, dev->dev, attach_ops, obj); if (IS_ERR(attach)) { - obj = ERR_CAST(attach); - goto out_err; + xe_bo_put(gem_to_xe_bo(obj)); + return ERR_CAST(attach); } get_dma_buf(dma_buf); obj->import_attach = attach; return obj; - -out_err: - xe_bo_put(bo); - - return obj; } #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) From d5971c5c34303a00bf841a902ca00a703602c500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 20:18:43 +0200 Subject: [PATCH 206/377] drm/amdgpu: remove deadlocks from amdgpu_userq_pre_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of a GPU reset is to make sure that fence can be signaled again and the signal and resume workers can make progress again. So waiting for the resume worker or any fence in the GPU reset path is just utterly nonsense. Signed-off-by: Christian König Reviewed-by: Prike Liang Signed-off-by: Alex Deucher (cherry picked from commit fcd5f065eab46993af43442fd77ee8d9eb9c5bdf) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 26 +++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index de140a8ed135..692f7e3513df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -1504,23 +1504,21 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev) { const struct amdgpu_userq_funcs *userq_funcs; struct amdgpu_usermode_queue *queue; - struct amdgpu_userq_mgr *uqm; unsigned long queue_id; + /* TODO: We probably need a new lock for the queue state */ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { - uqm = queue->userq_mgr; - cancel_delayed_work_sync(&uqm->resume_work); - if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { - amdgpu_userq_wait_for_last_fence(queue); - userq_funcs = adev->userq_funcs[queue->queue_type]; - userq_funcs->unmap(queue); - /* just mark all queues as hung at this point. - * if unmap succeeds, we could map again - * in amdgpu_userq_post_reset() if vram is not lost - */ - queue->state = AMDGPU_USERQ_STATE_HUNG; - amdgpu_userq_fence_driver_force_completion(queue); - } + if (queue->state != AMDGPU_USERQ_STATE_MAPPED) + continue; + + userq_funcs = adev->userq_funcs[queue->queue_type]; + userq_funcs->unmap(queue); + /* just mark all queues as hung at this point. + * if unmap succeeds, we could map again + * in amdgpu_userq_post_reset() if vram is not lost + */ + queue->state = AMDGPU_USERQ_STATE_HUNG; + amdgpu_userq_fence_driver_force_completion(queue); } } From 44e5bc73bdcbec115cfe1c4b4394d25eb3deaff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 16 Apr 2026 15:32:11 +0200 Subject: [PATCH 207/377] drm/amdgpu: rework amdgpu_userq_signal_ioctl v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This one was fortunately not looking so bad as the wait ioctl path, but there were still a few things which could be fixed/improved: 1. Allocating with GFP_ATOMIC was quite unnecessary, we can do that before taking the userq_lock. 2. Use a new mutex as protection for the fence_drv_xa so that we can do memory allocations while holding it. 3. Starting the reset timer is unnecessary when the fence is already signaled when we create it. 4. Cleanup error handling, avoid trying to free the queue when we don't even got one. v2: fix incorrect usage of xa_find, destroy the new mutex on error v3: cleanup ref ordering Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 1609eb0f81a609d350169839128cecf298c84e7a) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 12 + .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 228 ++++++++---------- 3 files changed, 120 insertions(+), 122 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 692f7e3513df..2d4f159f3362 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -800,6 +800,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) } queue->doorbell_index = index; + mutex_init(&queue->fence_drv_lock); xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); r = amdgpu_userq_fence_driver_alloc(adev, &queue->fence_drv); if (r) { @@ -873,6 +874,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) amdgpu_bo_reserve(fpriv->vm.root.bo, true); amdgpu_userq_buffer_vas_list_cleanup(adev, queue); amdgpu_bo_unreserve(fpriv->vm.root.bo); + mutex_destroy(&queue->fence_drv_lock); free_queue: kfree(queue); err_pm_runtime: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 8b8f345b60b6..843ea8ecc5d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -66,6 +66,18 @@ struct amdgpu_usermode_queue { struct amdgpu_userq_obj db_obj; struct amdgpu_userq_obj fw_obj; struct amdgpu_userq_obj wptr_obj; + + /** + * @fence_drv_lock: Protecting @fence_drv_xa. + */ + struct mutex fence_drv_lock; + + /** + * @fence_drv_xa: + * + * References to the external fence drivers returned by wait_ioctl. + * Dropped on the next signaled dma_fence or queue destruction. + */ struct xarray fence_drv_xa; struct amdgpu_userq_fence_driver *fence_drv; struct dma_fence *last_fence; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index e2d5f04296e1..c9dbf03c4eea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -121,6 +121,7 @@ amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq) userq->last_fence = NULL; amdgpu_userq_walk_and_drop_fence_drv(&userq->fence_drv_xa); xa_destroy(&userq->fence_drv_xa); + mutex_destroy(&userq->fence_drv_lock); /* Drop the queue's ownership reference to fence_drv explicitly */ amdgpu_userq_fence_driver_put(userq->fence_drv); } @@ -209,80 +210,84 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv) kref_put(&fence_drv->refcount, amdgpu_userq_fence_driver_destroy); } -static int amdgpu_userq_fence_alloc(struct amdgpu_userq_fence **userq_fence) +static int amdgpu_userq_fence_alloc(struct amdgpu_usermode_queue *userq, + struct amdgpu_userq_fence **pfence) { - *userq_fence = kmalloc(sizeof(**userq_fence), GFP_KERNEL); - return *userq_fence ? 0 : -ENOMEM; + struct amdgpu_userq_fence_driver *fence_drv = userq->fence_drv; + struct amdgpu_userq_fence *userq_fence; + void *entry; + + userq_fence = kmalloc(sizeof(*userq_fence), GFP_KERNEL); + if (!userq_fence) + return -ENOMEM; + + /* + * Get the next unused entry, since we fill from the start this can be + * used as size to allocate the array. + */ + mutex_lock(&userq->fence_drv_lock); + XA_STATE(xas, &userq->fence_drv_xa, 0); + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, XA_FREE_MARK); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + userq_fence->fence_drv_array = kvmalloc_array(xas.xa_index, + sizeof(fence_drv), + GFP_KERNEL); + if (!userq_fence->fence_drv_array) { + mutex_unlock(&userq->fence_drv_lock); + kfree(userq_fence); + return -ENOMEM; + } + + userq_fence->fence_drv_array_count = xas.xa_index; + xa_extract(&userq->fence_drv_xa, (void **)userq_fence->fence_drv_array, + 0, ULONG_MAX, xas.xa_index, XA_PRESENT); + xa_destroy(&userq->fence_drv_xa); + + mutex_unlock(&userq->fence_drv_lock); + + amdgpu_userq_fence_driver_get(fence_drv); + userq_fence->fence_drv = fence_drv; + + *pfence = userq_fence; + return 0; } -static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, - struct amdgpu_userq_fence *userq_fence, - u64 seq, struct dma_fence **f) +static void amdgpu_userq_fence_init(struct amdgpu_usermode_queue *userq, + struct amdgpu_userq_fence *fence, + u64 seq) { - struct amdgpu_userq_fence_driver *fence_drv; - struct dma_fence *fence; + struct amdgpu_userq_fence_driver *fence_drv = userq->fence_drv; unsigned long flags; bool signaled = false; - fence_drv = userq->fence_drv; - if (!fence_drv) - return -EINVAL; - - spin_lock_init(&userq_fence->lock); - INIT_LIST_HEAD(&userq_fence->link); - fence = &userq_fence->base; - userq_fence->fence_drv = fence_drv; - - dma_fence_init64(fence, &amdgpu_userq_fence_ops, &userq_fence->lock, + spin_lock_init(&fence->lock); + dma_fence_init64(&fence->base, &amdgpu_userq_fence_ops, &fence->lock, fence_drv->context, seq); - amdgpu_userq_fence_driver_get(fence_drv); - dma_fence_get(fence); + /* Make sure the fence is visible to the hang detect worker */ + dma_fence_put(userq->last_fence); + userq->last_fence = dma_fence_get(&fence->base); - if (!xa_empty(&userq->fence_drv_xa)) { - struct amdgpu_userq_fence_driver *stored_fence_drv; - unsigned long index, count = 0; - int i = 0; - - xa_lock(&userq->fence_drv_xa); - xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) - count++; - - userq_fence->fence_drv_array = - kvmalloc_objs(struct amdgpu_userq_fence_driver *, count, - GFP_ATOMIC); - - if (userq_fence->fence_drv_array) { - xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) { - userq_fence->fence_drv_array[i] = stored_fence_drv; - __xa_erase(&userq->fence_drv_xa, index); - i++; - } - } - - userq_fence->fence_drv_array_count = i; - xa_unlock(&userq->fence_drv_xa); - } else { - userq_fence->fence_drv_array = NULL; - userq_fence->fence_drv_array_count = 0; - } - - /* Check if hardware has already processed the job */ + /* Check if hardware has already processed the fence */ spin_lock_irqsave(&fence_drv->fence_list_lock, flags); - if (!dma_fence_is_signaled(fence)) { - list_add_tail(&userq_fence->link, &fence_drv->fences); + if (!dma_fence_is_signaled(&fence->base)) { + dma_fence_get(&fence->base); + list_add_tail(&fence->link, &fence_drv->fences); } else { + INIT_LIST_HEAD(&fence->link); signaled = true; - dma_fence_put(fence); } spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); if (signaled) - amdgpu_userq_fence_put_fence_drv_array(userq_fence); - - *f = fence; - - return 0; + amdgpu_userq_fence_put_fence_drv_array(fence); + else + amdgpu_userq_start_hang_detect_work(userq); } static const char *amdgpu_userq_fence_get_driver_name(struct dma_fence *f) @@ -403,11 +408,6 @@ static int amdgpu_userq_fence_read_wptr(struct amdgpu_device *adev, return r; } -static void amdgpu_userq_fence_cleanup(struct dma_fence *fence) -{ - dma_fence_put(fence); -} - static void amdgpu_userq_fence_driver_set_error(struct amdgpu_userq_fence *fence, int error) @@ -451,13 +451,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, const unsigned int num_read_bo_handles = args->num_bo_read_handles; struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_userq_mgr *userq_mgr = &fpriv->userq_mgr; + struct drm_gem_object **gobj_write, **gobj_read; u32 *syncobj_handles, num_syncobj_handles; - struct amdgpu_userq_fence *userq_fence; - struct amdgpu_usermode_queue *queue = NULL; - struct drm_syncobj **syncobj = NULL; - struct dma_fence *fence; + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_fence *fence; + struct drm_syncobj **syncobj; struct drm_exec exec; + void __user *ptr; int r, i, entry; u64 wptr; @@ -469,13 +470,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, return -EINVAL; num_syncobj_handles = args->num_syncobj_handles; - syncobj_handles = memdup_array_user(u64_to_user_ptr(args->syncobj_handles), - num_syncobj_handles, sizeof(u32)); + ptr = u64_to_user_ptr(args->syncobj_handles); + syncobj_handles = memdup_array_user(ptr, num_syncobj_handles, + sizeof(u32)); if (IS_ERR(syncobj_handles)) return PTR_ERR(syncobj_handles); - /* Array of pointers to the looked up syncobjs */ - syncobj = kmalloc_array(num_syncobj_handles, sizeof(*syncobj), GFP_KERNEL); + syncobj = kmalloc_array(num_syncobj_handles, sizeof(*syncobj), + GFP_KERNEL); if (!syncobj) { r = -ENOMEM; goto free_syncobj_handles; @@ -489,21 +491,17 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - r = drm_gem_objects_lookup(filp, - u64_to_user_ptr(args->bo_read_handles), - num_read_bo_handles, - &gobj_read); + ptr = u64_to_user_ptr(args->bo_read_handles); + r = drm_gem_objects_lookup(filp, ptr, num_read_bo_handles, &gobj_read); if (r) goto free_syncobj; - r = drm_gem_objects_lookup(filp, - u64_to_user_ptr(args->bo_write_handles), - num_write_bo_handles, + ptr = u64_to_user_ptr(args->bo_write_handles); + r = drm_gem_objects_lookup(filp, ptr, num_write_bo_handles, &gobj_write); if (r) goto put_gobj_read; - /* Retrieve the user queue */ queue = amdgpu_userq_get(userq_mgr, args->queue_id); if (!queue) { r = -ENOENT; @@ -512,73 +510,61 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, r = amdgpu_userq_fence_read_wptr(adev, queue, &wptr); if (r) - goto put_gobj_write; + goto put_queue; - r = amdgpu_userq_fence_alloc(&userq_fence); + r = amdgpu_userq_fence_alloc(queue, &fence); if (r) - goto put_gobj_write; + goto put_queue; /* We are here means UQ is active, make sure the eviction fence is valid */ amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr); - /* Create a new fence */ - r = amdgpu_userq_fence_create(queue, userq_fence, wptr, &fence); - if (r) { - mutex_unlock(&userq_mgr->userq_mutex); - kfree(userq_fence); - goto put_gobj_write; - } + /* Create the new fence */ + amdgpu_userq_fence_init(queue, fence, wptr); - dma_fence_put(queue->last_fence); - queue->last_fence = dma_fence_get(fence); - amdgpu_userq_start_hang_detect_work(queue); mutex_unlock(&userq_mgr->userq_mutex); + /* + * This needs to come after the fence is created since + * amdgpu_userq_ensure_ev_fence() can't be called while holding the resv + * locks. + */ drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, (num_read_bo_handles + num_write_bo_handles)); - /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { - r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1); + r = drm_exec_prepare_array(&exec, gobj_read, + num_read_bo_handles, 1); drm_exec_retry_on_contention(&exec); - if (r) { - amdgpu_userq_fence_cleanup(fence); + if (r) goto exec_fini; - } - r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1); + r = drm_exec_prepare_array(&exec, gobj_write, + num_write_bo_handles, 1); drm_exec_retry_on_contention(&exec); - if (r) { - amdgpu_userq_fence_cleanup(fence); + if (r) goto exec_fini; - } } - for (i = 0; i < num_read_bo_handles; i++) { - if (!gobj_read || !gobj_read[i]->resv) - continue; - - dma_resv_add_fence(gobj_read[i]->resv, fence, + /* And publish the new fence in the BOs and syncobj */ + for (i = 0; i < num_read_bo_handles; i++) + dma_resv_add_fence(gobj_read[i]->resv, &fence->base, DMA_RESV_USAGE_READ); - } - for (i = 0; i < num_write_bo_handles; i++) { - if (!gobj_write || !gobj_write[i]->resv) - continue; - - dma_resv_add_fence(gobj_write[i]->resv, fence, + for (i = 0; i < num_write_bo_handles; i++) + dma_resv_add_fence(gobj_write[i]->resv, &fence->base, DMA_RESV_USAGE_WRITE); - } - /* Add the created fence to syncobj/BO's */ for (i = 0; i < num_syncobj_handles; i++) - drm_syncobj_replace_fence(syncobj[i], fence); - - /* drop the reference acquired in fence creation function */ - dma_fence_put(fence); + drm_syncobj_replace_fence(syncobj[i], &fence->base); exec_fini: + /* drop the reference acquired in fence creation function */ + dma_fence_put(&fence->base); + drm_exec_fini(&exec); +put_queue: + amdgpu_userq_put(queue); put_gobj_write: for (i = 0; i < num_write_bo_handles; i++) drm_gem_object_put(gobj_write[i]); @@ -589,15 +575,11 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, kvfree(gobj_read); free_syncobj: while (entry-- > 0) - if (syncobj[entry]) - drm_syncobj_put(syncobj[entry]); + drm_syncobj_put(syncobj[entry]); kfree(syncobj); free_syncobj_handles: kfree(syncobj_handles); - if (queue) - amdgpu_userq_put(queue); - return r; } @@ -872,8 +854,10 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp, * Otherwise, we would gather those references until we don't * have any more space left and crash. */ + mutex_lock(&waitq->fence_drv_lock); r = xa_alloc(&waitq->fence_drv_xa, &index, fence_drv, xa_limit_32b, GFP_KERNEL); + mutex_unlock(&waitq->fence_drv_lock); if (r) goto put_waitq; From d0053441ad7eaf7920b71e2263097ece53c5af34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 15:13:57 +0200 Subject: [PATCH 208/377] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Well the reset handling seems broken on multiple levels. As first step of fixing this remove most calls to the hang detection. That function should only be called after we run into a timeout! And *NOT* as random check spread over the code in multiple places. Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 38 +++++++++-------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 2d4f159f3362..ba03d2a42e1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { r = userq_funcs->preempt(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_PREEMPTED; } } - - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue) @@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || - (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + r = userq_funcs->unmap(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_UNMAPPED; } } - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) @@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { r = userq_funcs->map(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - amdgpu_userq_detect_and_reset_queues(uq_mgr); + return r; } else { queue->state = AMDGPU_USERQ_STATE_MAPPED; } } - return r; + return 0; } static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue) @@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que #if defined(CONFIG_DEBUG_FS) debugfs_remove_recursive(queue->debugfs_queue); #endif - amdgpu_userq_detect_and_reset_queues(uq_mgr); r = amdgpu_userq_unmap_helper(queue); atomic_dec(&uq_mgr->userq_count[queue->queue_type]); amdgpu_userq_cleanup(queue); @@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) unsigned long queue_id; int ret = 0, r; - amdgpu_userq_detect_and_reset_queues(uq_mgr); /* Try to unmap all the queues in this process ctx */ xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { r = amdgpu_userq_preempt_helper(queue); @@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) ret = r; } - if (ret) + if (ret) { drm_file_err(uq_mgr->file, "Couldn't unmap all the queues, eviction failed ret=%d\n", ret); + amdgpu_userq_detect_and_reset_queues(uq_mgr); + } return ret; } @@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) uqm = queue->userq_mgr; cancel_delayed_work_sync(&uqm->resume_work); guard(mutex)(&uqm->userq_mutex); - amdgpu_userq_detect_and_reset_queues(uqm); if (adev->in_s0ix) r = amdgpu_userq_preempt_helper(queue); else @@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && (queue->xcp_id == idx)) { - amdgpu_userq_detect_and_reset_queues(uqm); r = amdgpu_userq_preempt_helper(queue); if (r) ret = r; From 0071e01c61aa73f22edd5252f0a3e2c2eff744d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 16:08:35 +0200 Subject: [PATCH 209/377] drm/amdgpu: fix userq hang detection and reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix lock inversions pointed out by Prike and Sunil. The hang detection timeout *CAN'T* grab locks under which we wait for fences, especially not the userq_mutex lock. Then instead of this completely broken handling with the hang_detect_fence just cancel the work when fences are processed and re-start if necessary. Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 1b62077f045ac6ffde7c97005c6659569ac5c1ec) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 65 ++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 1 - .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 17 +++-- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 2 +- 4 files changed, 40 insertions(+), 45 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index ba03d2a42e1e..70d74f04d2dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -106,9 +106,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) int r = 0; int i; - /* Warning if current process mutex is not held */ - WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex)); - if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "userq reset disabled by debug mask\n"); return 0; @@ -127,9 +124,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) */ for (i = 0; i < num_queue_types; i++) { int ring_type = queue_types[i]; - const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type]; + const struct amdgpu_userq_funcs *funcs = + adev->userq_funcs[ring_type]; - if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE)) + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, + AMDGPU_RESET_TYPE_PER_QUEUE)) continue; if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && @@ -150,38 +149,22 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) static void amdgpu_userq_hang_detect_work(struct work_struct *work) { - struct amdgpu_usermode_queue *queue = container_of(work, - struct amdgpu_usermode_queue, - hang_detect_work.work); - struct dma_fence *fence; - struct amdgpu_userq_mgr *uq_mgr; + struct amdgpu_usermode_queue *queue = + container_of(work, struct amdgpu_usermode_queue, + hang_detect_work.work); - if (!queue->userq_mgr) - return; - - uq_mgr = queue->userq_mgr; - fence = READ_ONCE(queue->hang_detect_fence); - /* Fence already signaled – no action needed */ - if (!fence || dma_fence_is_signaled(fence)) - return; - - mutex_lock(&uq_mgr->userq_mutex); - amdgpu_userq_detect_and_reset_queues(uq_mgr); - mutex_unlock(&uq_mgr->userq_mutex); + amdgpu_userq_detect_and_reset_queues(queue->userq_mgr); } /* * Start hang detection for a user queue fence. A delayed work will be scheduled - * to check if the fence is still pending after the timeout period. -*/ + * to reset the queues when the fence doesn't signal in time. + */ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) { struct amdgpu_device *adev; unsigned long timeout_ms; - if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev) - return; - adev = queue->userq_mgr->adev; /* Determine timeout based on queue type */ switch (queue->queue_type) { @@ -199,8 +182,6 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) break; } - /* Store the fence to monitor and schedule hang detection */ - WRITE_ONCE(queue->hang_detect_fence, queue->last_fence); schedule_delayed_work(&queue->hang_detect_work, msecs_to_jiffies(timeout_ms)); } @@ -210,18 +191,24 @@ void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell) struct xarray *xa = &adev->userq_doorbell_xa; struct amdgpu_usermode_queue *queue; unsigned long flags; + int r; xa_lock_irqsave(xa, flags); queue = xa_load(xa, doorbell); - if (queue) - amdgpu_userq_fence_driver_process(queue->fence_drv); - xa_unlock_irqrestore(xa, flags); -} + if (queue) { + r = amdgpu_userq_fence_driver_process(queue->fence_drv); + /* + * We are in interrupt context here, this *can't* wait for + * reset work to finish. + */ + if (r >= 0) + cancel_delayed_work(&queue->hang_detect_work); -static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue) -{ - INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work); - queue->hang_detect_fence = NULL; + /* Restart the timer when there are still fences pending */ + if (r == 1) + amdgpu_userq_start_hang_detect_work(queue); + } + xa_unlock_irqrestore(xa, flags); } static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue, @@ -640,7 +627,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que amdgpu_bo_unreserve(vm->root.bo); mutex_lock(&uq_mgr->userq_mutex); - queue->hang_detect_fence = NULL; amdgpu_userq_wait_for_last_fence(queue); #if defined(CONFIG_DEBUG_FS) @@ -847,7 +833,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) up_read(&adev->reset_domain->sem); amdgpu_debugfs_userq_init(filp, queue, qid); - amdgpu_userq_init_hang_detect_work(queue); + INIT_DELAYED_WORK(&queue->hang_detect_work, + amdgpu_userq_hang_detect_work); args->out.queue_id = qid; atomic_inc(&uq_mgr->userq_count[queue->queue_type]); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 843ea8ecc5d7..85f460e7c31b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -85,7 +85,6 @@ struct amdgpu_usermode_queue { int priority; struct dentry *debugfs_queue; struct delayed_work hang_detect_work; - struct dma_fence *hang_detect_fence; struct kref refcount; struct list_head userq_va_list; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index c9dbf03c4eea..53a8944bab05 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -135,7 +135,14 @@ amdgpu_userq_fence_put_fence_drv_array(struct amdgpu_userq_fence *userq_fence) userq_fence->fence_drv_array_count = 0; } -void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) +/* + * Returns: + * -ENOENT when no fences were processes + * 1 when more fences are pending + * 0 when no fences are pending any more + */ +int +amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) { struct amdgpu_userq_fence *userq_fence, *tmp; LIST_HEAD(to_be_signaled); @@ -143,9 +150,6 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d unsigned long flags; u64 rptr; - if (!fence_drv) - return; - spin_lock_irqsave(&fence_drv->fence_list_lock, flags); rptr = amdgpu_userq_fence_read(fence_drv); @@ -158,6 +162,9 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d &userq_fence->link); spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); + if (list_empty(&to_be_signaled)) + return -ENOENT; + list_for_each_entry_safe(userq_fence, tmp, &to_be_signaled, link) { fence = &userq_fence->base; list_del_init(&userq_fence->link); @@ -169,6 +176,8 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d dma_fence_put(fence); } + /* That doesn't need to be accurate so no locking */ + return list_empty(&fence_drv->fences) ? 0 : 1; } void amdgpu_userq_fence_driver_destroy(struct kref *ref) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h index d355a0eecc07..0bd51616cef1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h @@ -63,7 +63,7 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv); int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_userq_fence_driver **fence_drv_req); void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq); -void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv); +int amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv); void amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq); void amdgpu_userq_fence_driver_destroy(struct kref *ref); int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, From 183182235f6d53bac62c6c39014738a54a68dfa6 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Tue, 5 May 2026 09:05:37 +0800 Subject: [PATCH 210/377] drm/amd/display: Wrap DCN32 phantom-plane allocation in DC_RUN_WITH_PREEMPTION_ENABLED [Why] dcn32_validate_bandwidth() wraps dcn32_internal_validate_bw() with DC_FP_START()/DC_FP_END(). In x86 non-RT, DC_FP_START takes fpregs_lock(), which disables local softirqs. The DML1 path through dcn32_enable_phantom_plane() calls kvzalloc() to allocate ~335 KiB for dc_plane_state. This triggers the vmalloc path, which calls BUG_ON(in_interrupt()) because it's invoked within the FPU-enabled (softirq disabled) region, leading to a kernel crash. [How] Wrap the dc_state_create_phantom_plane() call with the DC_RUN_WITH_PREEMPTION_ENABLED() macro to allow preemption during this memory allocation. Fixes: 235c67634230 ("drm/amd/display: add DCN32/321 specific files for Display Core") Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4470 Reviewed-by: Aurabindo Pillai Signed-off-by: Mikhail Gavrilov Signed-off-by: James Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 885ccbef7b94a8b38f69c4211c679021aa27ad11) Cc: stable@vger.kernel.org --- .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index 82f81b586986..3751f7a94a05 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -92,9 +92,14 @@ #include "dml/dcn32/dcn32_fpu.h" #include "dc_state_priv.h" +#include "dc_fpu.h" #include "dml2_0/dml2_wrapper.h" +#if !defined(DC_RUN_WITH_PREEMPTION_ENABLED) +#define DC_RUN_WITH_PREEMPTION_ENABLED(code) code +#endif + #define DC_LOGGER_INIT(logger) enum dcn32_clk_src_array_id { @@ -1684,7 +1689,8 @@ static void dcn32_enable_phantom_plane(struct dc *dc, if (curr_pipe->top_pipe && curr_pipe->top_pipe->plane_state == curr_pipe->plane_state) phantom_plane = prev_phantom_plane; else - phantom_plane = dc_state_create_phantom_plane(dc, context, curr_pipe->plane_state); + DC_RUN_WITH_PREEMPTION_ENABLED(phantom_plane = + dc_state_create_phantom_plane(dc, context, curr_pipe->plane_state)); if (!phantom_plane) continue; From 6bbede02dc62a1021aeeae87ab243bd7a93c61d2 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Thu, 7 May 2026 20:56:15 +0800 Subject: [PATCH 211/377] drm/amd/ras: Fix CPER ring debugfs read overflow The legacy CPER debugfs reader can reach the payload path without a valid pointer snapshot. The remaining user byte count is also treated as the ring occupancy in dwords, so reads past the header can copy more than requested. Take the CPER lock before sampling pointers. Resample rptr/wptr for payload reads, bound the payload copy by available dwords and the remaining user size, and advance the file position for each dword copied. Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher (cherry picked from commit 1e40ef87ffdc291e05ccdade8b9170cc9c1c4249) --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 66e8a2f7afcf..d6bee5c30073 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -552,8 +552,9 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { struct amdgpu_ring *ring = file_inode(f)->i_private; - uint32_t value, result, early[3]; + u32 value, result, early[3] = { 0 }; uint64_t p; + u32 avail_dw, start_dw, read_dw; loff_t i; int r; @@ -565,10 +566,10 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, result = 0; - if (*pos < 12) { - if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) - mutex_lock(&ring->adev->cper.ring_lock); + if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) + mutex_lock(&ring->adev->cper.ring_lock); + if (*pos < 12) { early[0] = amdgpu_ring_get_rptr(ring) & ring->buf_mask; early[1] = amdgpu_ring_get_wptr(ring) & ring->buf_mask; early[2] = ring->wptr & ring->buf_mask; @@ -600,13 +601,24 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, *pos += 4; } } else { + early[0] = amdgpu_ring_get_rptr(ring) & ring->buf_mask; + early[1] = amdgpu_ring_get_wptr(ring) & ring->buf_mask; + p = early[0]; if (early[0] <= early[1]) - size = (early[1] - early[0]); + avail_dw = early[1] - early[0]; else - size = ring->ring_size - (early[0] - early[1]); + avail_dw = ring->buf_mask + 1 - (early[0] - early[1]); - while (size) { + start_dw = (*pos > 12) ? ((*pos - 12) >> 2) : 0; + if (start_dw >= avail_dw) + goto out; + + p = (p + start_dw) & ring->ptr_mask; + avail_dw -= start_dw; + read_dw = min_t(u32, avail_dw, size >> 2); + + while (read_dw) { if (p == early[1]) goto out; @@ -619,9 +631,10 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, buf += 4; result += 4; - size--; + read_dw--; p++; p &= ring->ptr_mask; + *pos += 4; } } From 5d08559c910cc37673b965a0d4e8d004444d0332 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Fri, 3 Apr 2026 15:58:31 +0800 Subject: [PATCH 212/377] drm/amdgpu/gfx_v12_0: set gfx.rs64_enable from PFP header on GFX12 gfx_v12_0_init_microcode() always loads RS64 CP ucode but never set adev->gfx.rs64_enable, so it stayed false and code that branches on it (e.g. MEC pipe reset) used the legacy CP_MEC_CNTL path incorrectly. Match GFX11: derive RS64 mode from the PFP firmware header (v2.0) via amdgpu_ucode_hdr_version(). Log at debug when RS64 is enabled. Reviewed-by: Alex Deucher Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit b03d53598b0d2048e8fa7303b8d0784768ec4fa6) --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 0e0b1e5b88fc..c35372e21261 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -602,6 +602,13 @@ static int gfx_v12_0_init_microcode(struct amdgpu_device *adev) "amdgpu/%s_pfp.bin", ucode_prefix); if (err) goto out; + + adev->gfx.rs64_enable = amdgpu_ucode_hdr_version( + (union amdgpu_firmware_header *) + adev->gfx.pfp_fw->data, 2, 0); + if (adev->gfx.rs64_enable) + dev_dbg(adev->dev, "CP RS64 enable\n"); + amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_PFP); amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_PFP_P0_STACK); From 9a415cc53711f2238e0f0ca8a6bcc796c003b127 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 12:05:48 -1000 Subject: [PATCH 213/377] sched_ext: Avoid UAF in scx_root_enable_workfn() init failure path In scx_root_enable_workfn(), put_task_struct(p) is called before scx_error() dereferences p->comm and p->pid. If the iterator's reference is the last drop, the task is freed synchronously and the deref becomes a UAF. Move put_task_struct() past scx_error(). Reported-by: Sashiko Closes: https://lore.kernel.org/all/20260511214031.AF5E9C2BCB0@smtp.kernel.org/ Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1efd5d82b08b..9354da79e162 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6973,10 +6973,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (scx_get_task_state(p) != SCX_TASK_DEAD) scx_set_task_state(p, SCX_TASK_NONE); task_rq_unlock(rq, p, &rf); - put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); + put_task_struct(p); goto err_disable_unlock_all; } From e174929793195e0cd6a4adb0cad731b39f9019b4 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 5 May 2026 16:43:36 -0700 Subject: [PATCH 214/377] net/rds: reset op_nents when zerocopy page pin fails When iov_iter_get_pages2() fails in rds_message_zcopy_from_user(), the pinned pages are released with put_page(), and rm->data.op_mmp_znotifier is cleared. But we fail to properly clear rm->data.op_nents. Later when rds_message_purge() is called from rds_sendmsg() the cleanup loop iterates over the incorrectly non zero number of op_nents and frees them again. Fix this by properly resetting op_nents when it should be in rds_message_zcopy_from_user(). Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.") Signed-off-by: Allison Henderson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260505234336.2132721-1-achender@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/message.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rds/message.c b/net/rds/message.c index 25fedcb3cd00..7feb0eb6537d 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -448,6 +448,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * for (i = 0; i < rm->data.op_nents; i++) put_page(sg_page(&rm->data.op_sg[i])); + rm->data.op_nents = 0; mmp = &rm->data.op_mmp_znotifier->z_mmp; mm_unaccount_pinned_pages(mmp); ret = -EFAULT; From 24a08d7d6218d60c033015cf4870b6096446e734 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Thu, 7 May 2026 00:35:15 +0000 Subject: [PATCH 215/377] net: ena: PHC: Check return code before setting timestamp output ena_phc_gettimex64() is setting the output parameter regardless of whether ena_com_phc_get_timestamp() succeeded or failed. When ena_com_phc_get_timestamp() returns an error, the timestamp parameter may contain uninitialized stack memory (e.g., when PHC is disabled or in blocked state) or invalid hardware values. Passing these to userspace via the PTP ioctl is both a security issue (information leak) and a correctness bug. Fix by checking the return code after releasing the lock and only setting the output timestamp on success. Fixes: e0ea34158ee8 ("net: ena: Add PHC support in the ENA driver") Cc: stable@vger.kernel.org Signed-off-by: Arthur Kiyanovski Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260507003518.22554-1-akiyano@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_phc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_phc.c b/drivers/net/ethernet/amazon/ena/ena_phc.c index 7867e893fd15..c2a3ff1ef645 100644 --- a/drivers/net/ethernet/amazon/ena/ena_phc.c +++ b/drivers/net/ethernet/amazon/ena/ena_phc.c @@ -46,9 +46,12 @@ static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, spin_unlock_irqrestore(&phc_info->lock, flags); + if (rc) + return rc; + *ts = ns_to_timespec64(timestamp_nsec); - return rc; + return 0; } static int ena_phc_settime64(struct ptp_clock_info *clock_info, From 03cb001ef87b3f8d859cf7f96329acf3d6235d29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 8 May 2026 12:08:46 +0000 Subject: [PATCH 216/377] tcp: Fix out-of-bounds access for twsk in tcp_ao_established_key(). lockdep_sock_is_held() was added in tcp_ao_established_key() by the cited commit. It can be called from tcp_v[46]_timewait_ack() with twsk. Since it does not have sk->sk_lock, the lockdep annotation results in out-of-bound access. $ pahole -C tcp_timewait_sock vmlinux | grep size /* size: 288, cachelines: 5, members: 8 */ $ pahole -C sock vmlinux | grep sk_lock socket_lock_t sk_lock; /* 440 192 */ Let's not use lockdep_sock_is_held() for TCP_TIME_WAIT. Fixes: 6b2d11e2d8fc ("net/tcp: Add missing lockdep annotations for TCP-AO hlist traversals") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260508120853.4098365-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index a97cdf3e6af4..0a4b38b315fe 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -116,7 +116,8 @@ struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { + hlist_for_each_entry_rcu(key, &ao->head, node, + sk_fullsock(sk) && lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; From f8e64e956a635b054227f56e9586a1997add0646 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 8 May 2026 19:05:10 +0200 Subject: [PATCH 217/377] net/sched: dualpi2: initialize timer earlier in dualpi2_init() 'pi2_timer' needs to be initialized in all error paths of dualpi2_init(): otherwise, a failure in qdisc_create_dflt() causes the following crash in dualpi2_destroy(): # tc qdisc add dev crash0 handle 1: root dualpi2 BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 4 UID: 0 PID: 471 Comm: tc Tainted: G E 7.1.0-rc1-virtme #2 PREEMPT(full) Tainted: [E]=UNSIGNED_MODULE Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:hrtimer_active+0x39/0x60 Code: f9 eb 23 0f b6 41 38 3c 01 0f 87 87 64 c0 ff 83 e0 01 75 33 48 39 4a 50 74 28 44 3b 42 10 75 06 48 3b 51 30 74 21 48 8b 51 30 <44> 8b 42 10 41 f6 c0 01 74 cf f3 90 44 8b 42 10 41 f6 c0 01 74 c3 RSP: 0018:ffffd0db80b93620 EFLAGS: 00010282 RAX: ffffffffc0400320 RBX: ffff8cf24a4c86b8 RCX: ffff8cf24a4c86b8 RDX: 0000000000000000 RSI: ffff8cf2429c2ab0 RDI: ffff8cf24a4c86b8 RBP: 00000000fffffff4 R08: 0000000000000003 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8cf24a39c500 R12: ffff8cf24822c000 R13: ffffd0db80b936c0 R14: ffffffffc02cf360 R15: 00000000ffffffff FS: 00007fbc01706580(0000) GS:ffff8cf2dc759000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000008e02003 CR4: 0000000000172ef0 Call Trace: hrtimer_cancel+0x15/0x40 dualpi2_destroy+0x20/0x40 [sch_dualpi2] qdisc_create+0x230/0x570 tc_modify_qdisc+0x716/0xc10 rtnetlink_rcv_msg+0x188/0x780 netlink_rcv_skb+0xcd/0x150 netlink_unicast+0x1ba/0x290 netlink_sendmsg+0x242/0x4d0 ____sys_sendmsg+0x39e/0x3e0 ___sys_sendmsg+0xe1/0x130 __sys_sendmsg+0xad/0x110 do_syscall_64+0x14f/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbc0188b08e Code: 4d 89 d8 e8 94 bd 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 c3 83 e2 39 83 fa 08 75 e7 e8 03 ff ff ff 0f 1f 00 f3 0f 1e fa RSP: 002b:00007fff593260e0 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbc0188b08e RDX: 0000000000000000 RSI: 00007fff59326190 RDI: 0000000000000003 RBP: 00007fff593260f0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 000055f06124f260 R13: 0000000069fca043 R14: 000055f061255640 R15: 000055f06124d3f8 Modules linked in: sch_dualpi2(E) CR2: 0000000000000010 [1] https://lore.kernel.org/netdev/2e78e01c504c633ebdff18d041833cf2e079a3a4.1607020450.git.dcaratti@redhat.com/ [2] https://lore.kernel.org/netdev/20200725201707.16909-1-xiyou.wangcong@gmail.com/ v2: - rebased on top of latest net.git Fixes: 320d031ad6e4 ("sched: Struct definition and parsing of dualpi2 qdisc") Signed-off-by: Davide Caratti Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/1faca91179702b31da5d87653e1e036543e32722.1778259798.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_dualpi2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 241e6a46bd00..a22489c14458 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -938,6 +938,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, int err; sch->flags |= TCQ_F_DEQUEUE_DROPS; + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1), extack); @@ -950,8 +952,6 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); From be48e5fe51a5864566307998286a699d6b986934 Mon Sep 17 00:00:00 2001 From: Evgenii Burenchev Date: Thu, 7 May 2026 17:55:17 +0300 Subject: [PATCH 218/377] qed: fix division by zero in qed_init_wfq_param when all vports are configured In qed_init_wfq_param(), variable non_requested_count can become zero when the number of vports with the configured flag set (including the current vport being configured) equals total num_vports. This happens when configuring the last unconfigured vport or when re-configuring an already configured vport. The function then calculates left_rate_per_vp = total_left_rate / non_requested_count, which causes division by zero. Fix this by skipping the division when non_requested_count is zero. In that case, there is no remaining bandwidth to distribute, so just record the configuration for the current vport and return success. Fixes: bcd197c81f63 ("qed: Add vport WFQ configuration APIs") Signed-off-by: Evgenii Burenchev Link: https://patch.msgid.link/20260507145520.23106-1-evg28bur@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 42c6dcfb1f0f..dd75c47758e1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -5103,6 +5103,13 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn, return -EINVAL; } + /* All vports are already or become configured, nothing to distribute */ + if (non_requested_count == 0) { + p_hwfn->qm_info.wfq_data[vport_id].min_speed = req_rate; + p_hwfn->qm_info.wfq_data[vport_id].configured = true; + return 0; + } + total_left_rate = min_pf_rate - total_req_min_rate; left_rate_per_vp = total_left_rate / non_requested_count; From 53597deca0e38c30e6cd4ba2114fa42d2bcd85bb Mon Sep 17 00:00:00 2001 From: Guangshuo Li Date: Thu, 7 May 2026 18:06:03 +0800 Subject: [PATCH 219/377] drm/bridge: imx8qxp-pxl2dpi: avoid ERR_PTR with device_node cleanup imx8qxp_pxl2dpi_get_available_ep_from_port() returns ERR_PTR() on errors. imx8qxp_pxl2dpi_find_next_bridge() stores its return value in a __free(device_node) variable before checking IS_ERR(). When the function returns on the error path, the cleanup action calls of_node_put() on the ERR_PTR() value. Do not let a device_node cleanup variable hold error pointers. Change imx8qxp_pxl2dpi_get_available_ep_from_port() to return an int and pass the endpoint node through an output argument. Initialize the output argument to NULL so callers hold either NULL on error paths or a valid device_node pointer on successful path. Fixes: ceea3f7806a10 ("drm/bridge: imx8qxp-pxl2dpi: simplify put of device_node pointers") Cc: stable@vger.kernel.org Reviewed-by: Liu Ying Signed-off-by: Guangshuo Li Link: https://patch.msgid.link/20260507100604.667731-1-lgs201920130244@gmail.com Signed-off-by: Liu Ying --- drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c | 40 +++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c b/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c index 441fd32dc91c..d64e328bf542 100644 --- a/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c +++ b/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c @@ -222,52 +222,58 @@ static const struct drm_bridge_funcs imx8qxp_pxl2dpi_bridge_funcs = { imx8qxp_pxl2dpi_bridge_atomic_get_output_bus_fmts, }; -static struct device_node * +static int imx8qxp_pxl2dpi_get_available_ep_from_port(struct imx8qxp_pxl2dpi *p2d, - u32 port_id) + u32 port_id, + struct device_node **ep) { - struct device_node *port, *ep; + struct device_node *port; + int ret = 0; int ep_cnt; + *ep = NULL; + port = of_graph_get_port_by_id(p2d->dev->of_node, port_id); if (!port) { DRM_DEV_ERROR(p2d->dev, "failed to get port@%u\n", port_id); - return ERR_PTR(-ENODEV); + return -ENODEV; } ep_cnt = of_get_available_child_count(port); if (ep_cnt == 0) { DRM_DEV_ERROR(p2d->dev, "no available endpoints of port@%u\n", port_id); - ep = ERR_PTR(-ENODEV); + ret = -ENODEV; goto out; } else if (ep_cnt > 1) { DRM_DEV_ERROR(p2d->dev, "invalid available endpoints of port@%u\n", port_id); - ep = ERR_PTR(-EINVAL); + ret = -EINVAL; goto out; } - ep = of_get_next_available_child(port, NULL); - if (!ep) { + *ep = of_get_next_available_child(port, NULL); + if (!*ep) { DRM_DEV_ERROR(p2d->dev, "failed to get available endpoint of port@%u\n", port_id); - ep = ERR_PTR(-ENODEV); + ret = -ENODEV; goto out; } out: of_node_put(port); - return ep; + return ret; } static int imx8qxp_pxl2dpi_find_next_bridge(struct imx8qxp_pxl2dpi *p2d) { - struct device_node *ep __free(device_node) = - imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 1); - if (IS_ERR(ep)) - return PTR_ERR(ep); + struct device_node *ep __free(device_node) = NULL; + int ret; + + ret = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 1, &ep); + if (ret) + return ret; struct device_node *remote __free(device_node) = of_graph_get_remote_port_parent(ep); if (!remote || !of_device_is_available(remote)) { @@ -291,9 +297,9 @@ static int imx8qxp_pxl2dpi_set_pixel_link_sel(struct imx8qxp_pxl2dpi *p2d) struct of_endpoint endpoint; int ret; - ep = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 0); - if (IS_ERR(ep)) - return PTR_ERR(ep); + ret = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 0, &ep); + if (ret) + return ret; ret = of_graph_parse_endpoint(ep, &endpoint); if (ret) { From c21b90f77687075115d989e53a8ec5e2bb427ab1 Mon Sep 17 00:00:00 2001 From: Prathyushi Nangia Date: Tue, 9 Dec 2025 10:01:33 -0600 Subject: [PATCH 220/377] x86/CPU/AMD: Prevent improper isolation of shared resources in Zen2's op cache Make sure resources are not improperly shared in the op cache and cause instruction corruption this way. Signed-off-by: Prathyushi Nangia Co-developed-by: Borislav Petkov (AMD) Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/msr-index.h | 3 ++- arch/x86/kernel/cpu/amd.c | 3 +++ tools/arch/x86/include/asm/msr-index.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a14a0f43e04a..86554de9a3f5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -803,9 +803,10 @@ #define MSR_AMD64_LBR_SELECT 0xc000010e /* Zen4 */ -#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG 0xc001102e #define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +#define MSR_ZEN2_BP_CFG_BUG_FIX_BIT 33 /* Fam 19h MSRs */ #define MSR_F19H_UMC_PERF_CTL 0xc0010800 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d9ae6ab1701..2f8e8ff2d000 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -989,6 +989,9 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) /* Correct misconfigured CPUID on some clients. */ clear_cpu_cap(c, X86_FEATURE_INVLPGB); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN2_BP_CFG_BUG_FIX_BIT); } static void init_amd_zen3(struct cpuinfo_x86 *c) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 6673601246b3..eff29645719b 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -793,9 +793,10 @@ #define MSR_AMD64_LBR_SELECT 0xc000010e /* Zen4 */ -#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG 0xc001102e #define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +#define MSR_ZEN2_BP_CFG_BUG_FIX_BIT 33 /* Fam 19h MSRs */ #define MSR_F19H_UMC_PERF_CTL 0xc0010800 From 8d57bb61734b23f6342e9de781173f1d83f90d3a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 5 May 2026 20:47:56 +0200 Subject: [PATCH 221/377] powerpc/g5: Enable all windfarms by default The G5 defconfig is clearly intended for the G5 Powermac series, and that should enable all the available windfarm drivers, or the machine will overheat a short while after booting and shut itself down, which is annoying. Signed-off-by: Linus Walleij Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260505-powermac-g5-config-v3-1-7747bf72f874@kernel.org --- arch/powerpc/configs/g5_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index e9996711b362..5ca1676e6058 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -85,6 +85,8 @@ CONFIG_PMAC_SMU=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_WINDFARM=y CONFIG_WINDFARM_PM81=y +CONFIG_WINDFARM_PM72=y +CONFIG_WINDFARM_RM31=y CONFIG_WINDFARM_PM91=y CONFIG_WINDFARM_PM112=y CONFIG_WINDFARM_PM121=y From acd1e47db03d4b528fd5efb8565dd0de1c79f62a Mon Sep 17 00:00:00 2001 From: Ally Heev Date: Sun, 16 Nov 2025 19:55:44 +0530 Subject: [PATCH 222/377] powerpc: 82xx: fix uninitialized pointers with free attribute Uninitialized pointers with `__free` attribute can cause undefined behavior as the memory allocated to the pointer is freed automatically when the pointer goes out of scope. powerpc/km82xx doesn't have any bugs related to this as of now, but, it is better to initialize and assign pointers with `__free` attribute in one statement to ensure proper scope-based cleanup Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aPiG_F5EBQUjZqsl@stanley.mountain/ Signed-off-by: Ally Heev Fixes: 4aa5cc1e0012 ("powerpc-km82xx.c: replace of_node_put() with __free") Reviewed-by: Christophe Leroy (CS GROUP) Reviewed-by: Krzysztof Kozlowski Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251116-aheev-uninitialized-free-attr-km82xx-v2-1-4307e2b5300d@gmail.com --- arch/powerpc/platforms/82xx/km82xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/82xx/km82xx.c b/arch/powerpc/platforms/82xx/km82xx.c index 99f0f0f41876..4ad223525e89 100644 --- a/arch/powerpc/platforms/82xx/km82xx.c +++ b/arch/powerpc/platforms/82xx/km82xx.c @@ -27,8 +27,8 @@ static void __init km82xx_pic_init(void) { - struct device_node *np __free(device_node); - np = of_find_compatible_node(NULL, NULL, "fsl,pq2-pic"); + struct device_node *np __free(device_node) = of_find_compatible_node(NULL, + NULL, "fsl,pq2-pic"); if (!np) { pr_err("PIC init: can not find cpm-pic node\n"); From 108d7f951271cbd36ca36efc5e5d106966f5180c Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Sun, 16 Nov 2025 10:44:11 +0800 Subject: [PATCH 223/377] powerpc/warp: Fix error handling in pika_dtm_thread pika_dtm_thread() acquires client through of_find_i2c_device_by_node() but fails to release it in error handling path. This could result in a reference count leak, preventing proper cleanup and potentially leading to resource exhaustion. Add put_device() to release the reference in the error handling path. Found by code review. Cc: stable@vger.kernel.org Fixes: 3984114f0562 ("powerpc/warp: Platform fix for i2c change") Signed-off-by: Ma Ke Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251116024411.21968-1-make24@iscas.ac.cn --- arch/powerpc/platforms/44x/warp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index a5001d32f978..6f674f86dc85 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -293,6 +293,8 @@ static int pika_dtm_thread(void __iomem *fpga) schedule_timeout(HZ); } + put_device(&client->dev); + return 0; } From f98020c22ad9be07d6feefd0496e70f9f36a2f63 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Mar 2026 10:47:42 -0700 Subject: [PATCH 224/377] powerpc/powermac: Remove pmac_low_i2c_{lock,unlock}() Commit a28d3af2a26c ("[PATCH] 2/5 powerpc: Rework PowerMac i2c part 2") removed the last calls to the pmac_low_i2c_{lock,unlock}() functions. Hence, remove these two functions. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bart Van Assche Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260316174747.3871924-1-bvanassche@acm.org --- arch/powerpc/include/asm/pmac_low_i2c.h | 4 --- arch/powerpc/platforms/powermac/low_i2c.c | 34 ----------------------- 2 files changed, 38 deletions(-) diff --git a/arch/powerpc/include/asm/pmac_low_i2c.h b/arch/powerpc/include/asm/pmac_low_i2c.h index 21bd7297c87f..fead8fae08ab 100644 --- a/arch/powerpc/include/asm/pmac_low_i2c.h +++ b/arch/powerpc/include/asm/pmac_low_i2c.h @@ -79,10 +79,6 @@ extern int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter); -/* (legacy) Locking functions exposed to i2c-keywest */ -extern int pmac_low_i2c_lock(struct device_node *np); -extern int pmac_low_i2c_unlock(struct device_node *np); - /* Access functions for platform code */ extern int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled); extern void pmac_i2c_close(struct pmac_i2c_bus *bus); diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 73b7f4e8c047..da72a30ab865 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -1058,40 +1058,6 @@ int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter) } EXPORT_SYMBOL_GPL(pmac_i2c_match_adapter); -int pmac_low_i2c_lock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { - if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - return pmac_i2c_open(bus, 0); -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_lock); - -int pmac_low_i2c_unlock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { - if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - pmac_i2c_close(bus); - return 0; -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_unlock); - - int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled) { int rc; From aef656a0e6c01796190bb5bd2bdba1c644ed7811 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Sun, 5 Apr 2026 17:15:45 +0100 Subject: [PATCH 225/377] powerpc: fix dead default for GUEST_STATE_BUFFER_TEST The GUEST_STATE_BUFFER_TEST config option should default to KUNIT_ALL_TESTS so that if all tests are enabled then it is included, but currently the 'default KUNIT_ALL_TESTS' statement is shadowed by 'def_tristate n', meaning that this second default statement is currently dead code. It looks to me like the commit 6ccbbc33f06a ("KVM: PPC: Add helper library for Guest State Buffers") intended to set the default to KUNIT_ALL_TESTS, but mistakenly missed the def_tristate. This dead code was found by kconfirm, a static analysis tool for Kconfig. Fixes: 6ccbbc33f06a ("KVM: PPC: Add helper library for Guest State Buffers") Signed-off-by: Julian Braha Tested-by: Gautam Menghani Reviewed-by: Amit Machhiwal Reviewed-by: Harsh Prateek Bora Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260405161545.161006-1-julianbraha@gmail.com --- arch/powerpc/Kconfig.debug | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index f15e5920080b..e8718bc13eeb 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -83,11 +83,10 @@ config MSI_BITMAP_SELFTEST depends on DEBUG_KERNEL config GUEST_STATE_BUFFER_TEST - def_tristate n + def_tristate KUNIT_ALL_TESTS prompt "Enable Guest State Buffer unit tests" depends on KUNIT depends on KVM_BOOK3S_HV_POSSIBLE - default KUNIT_ALL_TESTS help The Guest State Buffer is a data format specified in the PAPR. It is by hcalls to communicate the state of L2 guests between From dbc30a57bd8e026995e9fa8e8c31cffd18542c01 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Fri, 8 May 2026 09:42:56 +0530 Subject: [PATCH 226/377] powerpc/hv-gpci: fix preempt count leak in sysfs show paths Four sysfs show() callbacks in hv-gpci take get_cpu_var(hv_gpci_reqb) (which calls preempt_disable()) but only call the matching put_cpu_var() on the error path under the 'out:' label. Every successful read leaks one preempt_disable(): processor_bus_topology_show() processor_config_show() affinity_domain_via_virtual_processor_show() affinity_domain_via_domain_show() (affinity_domain_via_partition_show() was already correct.) On a CONFIG_PREEMPT=y kernel, repeated reads raise preempt_count and eventually return to userspace with preemption still disabled. The next user-mode page fault then hits faulthandler_disabled() == 1, gets forced to SIGSEGV, and the resulting coredump trips 'BUG: scheduling while atomic' in call_usermodehelper_exec -> wait_for_completion_state -> schedule: BUG: scheduling while atomic: //0x00000004 ... __schedule_bug+0x6c/0x90 __schedule+0x58c/0x13a0 schedule+0x48/0x1a0 schedule_timeout+0x104/0x170 wait_for_completion_state+0x16c/0x330 call_usermodehelper_exec+0x254/0x2d0 vfs_coredump+0x1050/0x2590 get_signal+0xb9c/0xc80 do_notify_resume+0xf8/0x470 Add an out_success label that calls put_cpu_var() before returning the byte count, mirroring affinity_domain_via_partition_show(). Fixes: 71f1c39647d8 ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show processor bus topology information") Fixes: 1a160c2a13c6 ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show processor config information") Fixes: 71a7ccb478fc ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show affinity domain via virtual processor information") Fixes: a69a57cac1ec ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show affinity domain via domain information") Signed-off-by: Aboorva Devarajan Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260508041256.3447113-1-aboorvad@linux.ibm.com --- arch/powerpc/perf/hv-gpci.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 5cac2cf3bd1e..10c82cf8f5b3 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -210,7 +210,7 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -244,12 +244,14 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -278,7 +280,7 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -312,12 +314,14 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -346,7 +350,7 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -382,12 +386,14 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, starting_index, secondary_index, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -416,7 +422,7 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -448,12 +454,14 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: From 4cfe4c0efbdcde742a47813180cc69b132d7598e Mon Sep 17 00:00:00 2001 From: Sebastian Brzezinka Date: Thu, 16 Apr 2026 13:31:18 +0200 Subject: [PATCH 227/377] drm/i915: skip __i915_request_skip() for already signaled requests After a GPU reset the HWSP is zeroed, so previously completed requests appear incomplete. If such a request is picked up during reset_rewind() and marked guilty, i915_request_set_error_once() returns early (fence already signaled), leaving fence.error without a fatal error code. The subsequent __i915_request_skip() then hits: ``` GEM_BUG_ON(!fatal_error(rq->fence.error)) ``` Fixes a kernel BUG observed on Sandy Bridge (Gen6) during heartbeat-triggered engine resets. ``` kernel BUG at drivers/gpu/drm/i915/i915_request.c:556! RIP: __i915_request_skip+0x15e/0x1d0 [i915] ... __i915_request_reset+0x212/0xa70 [i915] reset_rewind+0xe4/0x280 [i915] intel_gt_reset+0x30d/0x5b0 [i915] heartbeat+0x516/0x530 [i915] ``` Guard __i915_request_skip() with i915_request_signaled(), if the fence is already signaled, the ring content is committed and there is nothing left to skip. Fixes: 36e191f0644b ("drm/i915: Apply i915_request_skip() on submission") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/work_items/13729 Signed-off-by: Sebastian Brzezinka Cc: stable@vger.kernel.org # v5.7+ Reviewed-by: Krzysztof Karas Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/fe76921d35b6ae85aa651822726d0d9815aa5362.1776339012.git.sebastian.brzezinka@intel.com (cherry picked from commit 5ba54393dcd7adf75a9f39f5a933b1538349cad5) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_reset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 984d0056c01c..adff482a6c9c 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -132,7 +132,8 @@ void __i915_request_reset(struct i915_request *rq, bool guilty) rcu_read_lock(); /* protect the GEM context */ if (guilty) { i915_request_set_error_once(rq, -EIO); - __i915_request_skip(rq); + if (!i915_request_signaled(rq)) + __i915_request_skip(rq); banned = mark_guilty(rq); } else { i915_request_set_error_once(rq, -EAGAIN); From 1ae15b6c7965d137eef21f2cc7d367b29cb88369 Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Tue, 5 May 2026 14:39:20 +0530 Subject: [PATCH 228/377] drm/i915/dp: Fix VSC dynamic range signaling for RGB formats For RGB, set dynamic_range to CTA or VESA based on crtc_state->limited_color_range so sinks apply correct quantization. YCbCr remains limited (CTA) range. (DP v1.4, Table 5-1) v2: - Added Reported-by and Tested-by tags v3: - Add back YCbCr comment(Suraj) Cc: stable@vger.kernel.org #v5.8+ Reported-by: DeepChirp Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/work_items/15874 Tested-by: DeepChirp Fixes: 9799c4c3b76e ("drm/i915/dp: Add compute routine for DP VSC SDP") Assisted-by: GitHub-Copilot:GPT-5.4 Signed-off-by: Chaitanya Kumar Borah Reviewed-by: Suraj Kandpal Signed-off-by: Suraj Kandpal Link: https://patch.msgid.link/20260505090920.2479112-1-chaitanya.kumar.borah@intel.com (cherry picked from commit 38e10ddae6f8d42a2e8437fcd25a1cac51106c64) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_dp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 4955bd8b11d7..50d34b4fdf82 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -3119,8 +3119,13 @@ static void intel_dp_compute_vsc_colorimetry(const struct intel_crtc_state *crtc drm_WARN_ON(display->drm, vsc->bpc == 6 && vsc->pixelformat != DP_PIXELFORMAT_RGB); - /* all YCbCr are always limited range */ - vsc->dynamic_range = DP_DYNAMIC_RANGE_CTA; + /* All YCbCr formats are always limited range. */ + if (vsc->pixelformat == DP_PIXELFORMAT_RGB) + vsc->dynamic_range = crtc_state->limited_color_range ? + DP_DYNAMIC_RANGE_CTA : DP_DYNAMIC_RANGE_VESA; + else + vsc->dynamic_range = DP_DYNAMIC_RANGE_CTA; + vsc->content_type = DP_CONTENT_TYPE_NOT_DEFINED; } From 911f54771ca97947cfdca360e9e9b4147a330740 Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Fri, 8 May 2026 20:46:36 +0800 Subject: [PATCH 229/377] net: hsr: fix NULL pointer dereference in hsr_get_node_data() In the HSR (High-availability Seamless Redundancy) protocol, node information is maintained in the node_db. When a supervision frame is received, node->addr_B_port is updated to track the receiving port type (e.g., HSR_PT_SLAVE_B). If the underlying physical interface associated with this slave port is removed (e.g., via `ip link del`), hsr_del_port() frees the hsr_port object. However, the stale node->addr_B_port reference is kept in the node_db until the node ages out. Subsequently, if userspace queries the node status via the Netlink command HSR_C_GET_NODE_STATUS, the kernel calls hsr_get_node_data(). This function unconditionally dereferences the pointer returned by hsr_port_get_hsr(): if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; // <-- NULL deref } If the slave port has been deleted, hsr_port_get_hsr() returns NULL, resulting in a kernel panic. Oops: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:hsr_get_node_data+0x7b6/0x9e0 Call Trace: hsr_get_node_status+0x445/0xa40 Fix this by adding a proper NULL pointer check. If the port lookup fails due to a stale port type, gracefully treat it as if no valid port exists and assign -1 to the interface index. Steps to reproduce: 1. Create an HSR interface with two slave devices. 2. Receive a supervision frame to populate node_db with addr_B_port assigned to SLAVE_B. 3. Delete the underlying slave device B. 4. Send an HSR_C_GET_NODE_STATUS Netlink message. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Link: https://patch.msgid.link/20260508124636.1462346-1-2022090917019@std.uestc.edu.cn Signed-off-by: Paolo Abeni --- net/hsr/hsr_framereg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index d09875b33588..124619920d38 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -889,7 +889,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); - *addr_b_ifindex = port->dev->ifindex; + if (port) + *addr_b_ifindex = port->dev->ifindex; + else + *addr_b_ifindex = -1; } else { *addr_b_ifindex = -1; } From 5f344d809e015fba3709e5219428c00b8ac5d7df Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 8 May 2026 18:44:10 +0200 Subject: [PATCH 230/377] vsock/virtio: fix length and offset in tap skb for split packets virtio_transport_build_skb() builds a new skb to be delivered to the vsockmon tap device. To build the new skb, it uses the original skb data length as payload length, but as the comment notes, the original packet stored in the skb may have been split in multiple packets, so we need to use the length in the header, which is correctly updated before the packet is delivered to the tap, and the offset for the data. This was also similar to what we did before commit 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") where we probably missed something during the skb conversion. Also update the comment above, which was left stale by the skb conversion and still mentioned a buffer pointer that no longer exists. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Reviewed-by: Arseniy Krasnov Link: https://patch.msgid.link/20260508164411.261440-2-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9b8014516f4f..a678d5d75704 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -166,12 +166,12 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct sk_buff *skb; size_t payload_len; - /* A packet could be split to fit the RX buffer, so we can retrieve - * the payload length from the header and the buffer pointer taking - * care of the offset in the original packet. + /* A packet could be split to fit the RX buffer, so we use + * the payload length from the header, which has been updated + * by the sender to reflect the fragment size. */ pkt_hdr = virtio_vsock_hdr(pkt); - payload_len = pkt->len; + payload_len = le32_to_cpu(pkt_hdr->len); skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); @@ -219,7 +219,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); } else { - skb_put_data(skb, pkt->data, payload_len); + skb_put_data(skb, pkt->data + VIRTIO_VSOCK_SKB_CB(pkt)->offset, + payload_len); } } From 3a3e3d90cbc79600544536723911657730759af3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 8 May 2026 18:44:11 +0200 Subject: [PATCH 231/377] vsock/virtio: fix empty payload in tap skb for non-linear buffers For non-linear skbs, virtio_transport_build_skb() goes through virtio_transport_copy_nonlinear_skb() to copy the original payload in the new skb to be delivered to the vsockmon tap device. This manually initializes an iov_iter but does not set iov_iter.count. Since the iov_iter is zero-initialized, the copy length is zero and no payload is actually copied to the monitor interface, leaving data un-initialized. Fix this by removing the linear vs non-linear split and using skb_copy_datagram_iter() with iov_iter_kvec() for all cases, as vhost-vsock already does. This handles both linear and non-linear skbs, properly initializes the iov_iter, and removes the now unused virtio_transport_copy_nonlinear_skb(). While touching this code, let's also check the return value of skb_copy_datagram_iter(), even though it's unlikely to fail. Fixes: 4b0bf10eb077 ("vsock/virtio: non-linear skb handling for tap") Reported-by: Yiqi Sun Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Reviewed-by: Arseniy Krasnov Link: https://patch.msgid.link/20260508164411.261440-3-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 38 +++++++------------------ 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a678d5d75704..989cc252d3d3 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -136,27 +136,6 @@ static void virtio_transport_init_hdr(struct sk_buff *skb, hdr->fwd_cnt = cpu_to_le32(0); } -static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, - void *dst, - size_t len) -{ - struct iov_iter iov_iter = { 0 }; - struct kvec kvec; - size_t to_copy; - - kvec.iov_base = dst; - kvec.iov_len = len; - - iov_iter.iter_type = ITER_KVEC; - iov_iter.kvec = &kvec; - iov_iter.nr_segs = 1; - - to_copy = min_t(size_t, len, skb->len); - - skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, - &iov_iter, to_copy); -} - /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { @@ -214,13 +193,18 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { - if (skb_is_nonlinear(pkt)) { - void *data = skb_put(skb, payload_len); + struct iov_iter iov_iter; + struct kvec kvec; + void *data = skb_put(skb, payload_len); - virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); - } else { - skb_put_data(skb, pkt->data + VIRTIO_VSOCK_SKB_CB(pkt)->offset, - payload_len); + kvec.iov_base = data; + kvec.iov_len = payload_len; + iov_iter_kvec(&iov_iter, ITER_DEST, &kvec, 1, payload_len); + + if (skb_copy_datagram_iter(pkt, VIRTIO_VSOCK_SKB_CB(pkt)->offset, + &iov_iter, payload_len)) { + kfree_skb(skb); + return NULL; } } From 2cb156213093a62b80cf40b1ec71738e93491971 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:36 +0200 Subject: [PATCH 232/377] net: ethernet: cortina: No mapping is a dropped rx Increase stats.rx_dropped++ even if this is the first fragment (skb == NULL) so we are doing proper accounting. Fixes: b266bacba796 ("net: ethernet: cortina: Drop half-assembled SKB") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-1-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 065cbbf52686..466445c9e08b 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1491,9 +1491,9 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) gpage = gmac_get_queue_page(geth, port, mapping + PAGE_SIZE); if (!gpage) { dev_err(geth->dev, "could not find mapping\n"); + port->stats.rx_dropped++; if (skb) { napi_free_frags(&port->napi); - port->stats.rx_dropped++; skb = NULL; } continue; From 06937db21ee311ed07eba47954447245041a982d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:37 +0200 Subject: [PATCH 233/377] net: ethernet: cortina: Make RX SKB per-port The SKB used to assemble packets from fragments in gmac_rx() is static local, but the Gemini has two ethernet ports, meaning there can be races between the ports on a bad day if a device is using both. Make the RX SKB a per-port variable and carry it over between invocations in the port struct instead. Zero the pointer once we call napi_gro_frags(), on error (after calling napi_free_frags()) or if the port is stopped. Zero it in some place where not strictly necessary just to emphasize what is going on. This was found by Sashiko during normal patch review. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-2-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 466445c9e08b..d5a56366fb43 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -122,6 +122,8 @@ struct gemini_ethernet_port { struct napi_struct napi; struct hrtimer rx_coalesce_timer; unsigned int rx_coalesce_nsecs; + struct sk_buff *rx_skb; + unsigned int freeq_refill; struct gmac_txq txq[TX_QUEUE_NUM]; unsigned int txq_order; @@ -1442,10 +1444,10 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short m = (1 << port->rxq_order) - 1; struct gemini_ethernet *geth = port->geth; void __iomem *ptr_reg = port->rxq_rwptr; + struct sk_buff *skb = port->rx_skb; unsigned int frame_len, frag_len; struct gmac_rxdesc *rx = NULL; struct gmac_queue_page *gpage; - static struct sk_buff *skb; union gmac_rxdesc_0 word0; union gmac_rxdesc_1 word1; union gmac_rxdesc_3 word3; @@ -1504,6 +1506,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); port->stats.rx_dropped++; + skb = NULL; } skb = gmac_skb_if_good_frame(port, word0, frame_len); @@ -1554,6 +1557,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) port->stats.rx_dropped++; } + port->rx_skb = skb; writew(r, ptr_reg); return budget; } @@ -1881,6 +1885,7 @@ static int gmac_stop(struct net_device *netdev) gmac_disable_tx_rx(netdev); gmac_stop_dma(port); napi_disable(&port->napi); + port->rx_skb = NULL; gmac_enable_irq(netdev, 0); gmac_cleanup_rxq(netdev); From ebd8ec2b309e3a447851b456ccaf8fb39f3661e7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:38 +0200 Subject: [PATCH 234/377] net: ethernet: cortina: Carry over frag counter The gmac_rx() NAPI poll function assembles packets in an SKB from a ring buffer. If the ring buffer gets completely emptied during a poll cycle, we exit gmac_rx(), but the packet is not yet completely assembled in the SKB, yet the fragment counter frag_nr is reset to zero on the next invocation. Solve this by making the RX fragment counter a part of the port struct, and carry it over between invocations. Reset the fragment counter only right after calling napi_gro_frags(), on error (after calling napi_free_frags()) or if stopping the port. Reset it in some place where not strictly necessary just to emphasize what is going on. This was found by Sashiko during normal patch review. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-3-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index d5a56366fb43..4c762229ce42 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -123,6 +123,7 @@ struct gemini_ethernet_port { struct hrtimer rx_coalesce_timer; unsigned int rx_coalesce_nsecs; struct sk_buff *rx_skb; + unsigned int rx_frag_nr; unsigned int freeq_refill; struct gmac_txq txq[TX_QUEUE_NUM]; @@ -1444,6 +1445,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short m = (1 << port->rxq_order) - 1; struct gemini_ethernet *geth = port->geth; void __iomem *ptr_reg = port->rxq_rwptr; + unsigned int frag_nr = port->rx_frag_nr; struct sk_buff *skb = port->rx_skb; unsigned int frame_len, frag_len; struct gmac_rxdesc *rx = NULL; @@ -1457,7 +1459,6 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short r, w; union dma_rwptr rw; dma_addr_t mapping; - int frag_nr = 0; spin_lock_irqsave(&geth->irq_lock, flags); rw.bits32 = readl(ptr_reg); @@ -1497,6 +1498,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); skb = NULL; + frag_nr = 0; } continue; } @@ -1507,6 +1509,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) napi_free_frags(&port->napi); port->stats.rx_dropped++; skb = NULL; + frag_nr = 0; } skb = gmac_skb_if_good_frame(port, word0, frame_len); @@ -1541,6 +1544,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (word3.bits32 & EOF_BIT) { napi_gro_frags(&port->napi); skb = NULL; + frag_nr = 0; --budget; } continue; @@ -1549,6 +1553,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); skb = NULL; + frag_nr = 0; } if (mapping) @@ -1558,6 +1563,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) } port->rx_skb = skb; + port->rx_frag_nr = frag_nr; writew(r, ptr_reg); return budget; } @@ -1886,6 +1892,7 @@ static int gmac_stop(struct net_device *netdev) gmac_stop_dma(port); napi_disable(&port->napi); port->rx_skb = NULL; + port->rx_frag_nr = 0; gmac_enable_irq(netdev, 0); gmac_cleanup_rxq(netdev); From 36a8d04a8293afcb9304cf0cd3741f67698f2a1a Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Fri, 8 May 2026 19:37:28 -0700 Subject: [PATCH 235/377] net: ethernet: cs89x0: remove stale CONFIG_MACH_MX31ADS reference The legacy ARM board file for MACH_MX31ADS was removed in commit c93197b0041d ("ARM: imx: Remove i.MX31 board files"), but a reference to it remained in the cs89x0 driver. Drop this unused code. Signed-off-by: Ethan Nelson-Moore Fixes: c93197b0041d ("ARM: imx: Remove i.MX31 board files") Link: https://patch.msgid.link/20260509023732.42256-1-enelsonmoore@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cirrus/cs89x0.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index fa5857923db4..b4bfd6c174e7 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -1271,7 +1271,6 @@ static const struct net_device_ops net_ops = { static void __init reset_chip(struct net_device *dev) { -#if !defined(CONFIG_MACH_MX31ADS) struct net_local *lp = netdev_priv(dev); unsigned long reset_start_time; @@ -1298,7 +1297,6 @@ static void __init reset_chip(struct net_device *dev) while ((readreg(dev, PP_SelfST) & INIT_DONE) == 0 && time_before(jiffies, reset_start_time + 2)) ; -#endif /* !CONFIG_MACH_MX31ADS */ } /* This is the real probe routine. From 7cee43fcb0c3f71441d2faaa8c2202b6a88b6bef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:55 -0700 Subject: [PATCH 236/377] net: shaper: flip the polarity of the valid flag The usual way of inserting entries which are not yet fully ready into XArray is to have a VALID flag. The shaper code has a NOT_VALID flag. Since XArray code does not let us create entries with marks already set - the creation of entries is currently not atomic. Flip the polarity of the VALID flag. This closes the tiny race in net_shaper_pre_insert() of entries being created without the NOT_VALID flag. Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 1069fa4eb9f6..d2b8f1f951b1 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -275,11 +275,13 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* - * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as - * it's cleared by xa_store(). +/* MARK_0 is already in use due to XA_FLAGS_ALLOC. The VALID mark is set on + * an entry only after the device-side configuration has completed + * successfully (see net_shaper_commit()). Lookups and dumps must filter on + * this mark to avoid exposing tentative entries inserted by + * net_shaper_pre_insert() while the driver call is still in flight. */ -#define NET_SHAPER_NOT_VALID XA_MARK_1 +#define NET_SHAPER_VALID XA_MARK_1 static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, @@ -289,8 +291,8 @@ net_shaper_lookup(struct net_shaper_binding *binding, struct net_shaper_hierarchy *hierarchy; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID)) + if (!hierarchy || !xa_get_mark(&hierarchy->shapers, index, + NET_SHAPER_VALID)) return NULL; return xa_load(&hierarchy->shapers, index); @@ -370,13 +372,10 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, goto free_id; } - /* Mark 'tentative' shaper inside the hierarchy container. - * xa_set_mark is a no-op if the previous store fails. + /* Insert as 'tentative' (no VALID mark). The mark will be set by + * net_shaper_commit() once the driver-side configuration succeeds. */ - xa_lock(&hierarchy->shapers); - prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); - xa_unlock(&hierarchy->shapers); + prev = xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); @@ -413,8 +412,7 @@ static void net_shaper_commit(struct net_shaper_binding *binding, /* Successful update: drop the tentative mark * and update the hierarchy container. */ - __xa_clear_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID); + __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); *cur = shapers[i]; } xa_unlock(&hierarchy->shapers); @@ -431,8 +429,9 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) return; xa_lock(&hierarchy->shapers); - xa_for_each_marked(&hierarchy->shapers, index, cur, - NET_SHAPER_NOT_VALID) { + xa_for_each(&hierarchy->shapers, index, cur) { + if (xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_VALID)) + continue; __xa_erase(&hierarchy->shapers, index); kfree(cur); } @@ -836,7 +835,8 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, XA_PRESENT)); ctx->start_index++) { + U32_MAX, NET_SHAPER_VALID)); + ctx->start_index++) { ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; From 235fb5376139c3419f2218349f1fa2f06f24f7ad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:56 -0700 Subject: [PATCH 237/377] net: shaper: fix trivial ordering issue in net_shaper_commit() We should update the entry before we mark it as valid. Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index d2b8f1f951b1..86319ddbf290 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -295,6 +295,10 @@ net_shaper_lookup(struct net_shaper_binding *binding, NET_SHAPER_VALID)) return NULL; + /* Pairs with smp_wmb() in net_shaper_commit(): if the entry is + * valid, its contents must be visible too. + */ + smp_rmb(); return xa_load(&hierarchy->shapers, index); } @@ -412,8 +416,9 @@ static void net_shaper_commit(struct net_shaper_binding *binding, /* Successful update: drop the tentative mark * and update the hierarchy container. */ - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); *cur = shapers[i]; + smp_wmb(); + __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); } xa_unlock(&hierarchy->shapers); } @@ -837,6 +842,10 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, U32_MAX, NET_SHAPER_VALID)); ctx->start_index++) { + /* Pairs with smp_wmb() in net_shaper_commit(): the entry + * is marked VALID, so its contents must be visible too. + */ + smp_rmb(); ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; From a9a2fa1da619f276580b0d4c5d12efac89e8642b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:57 -0700 Subject: [PATCH 238/377] net: shaper: reject duplicate leaves in GROUP request net_shaper_nl_group_doit() does not deduplicate NET_SHAPER_A_LEAVES entries. When userspace supplies the same leaf handle twice, the same old-parent pointer lands twice in old_nodes[]. The cleanup loop double frees the parent. Of course the same parent may still be in old_nodes[] twice if we are moving multiple of its leaves. Note that this patch also implicitly fixes the fact that the i >= leaves_count path forgets to set ret. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-4-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 60 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 86319ddbf290..c8960821cf23 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -941,6 +941,46 @@ static int net_shaper_handle_cmp(const struct net_shaper_handle *a, return memcmp(a, b, sizeof(*a)); } +static int net_shaper_parse_leaves(struct net_shaper_binding *binding, + struct genl_info *info, + const struct net_shaper *node, + struct net_shaper *leaves, + int leaves_count) +{ + struct nlattr *attr; + int i, j, ret, rem; + + i = 0; + nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (WARN_ON_ONCE(i >= leaves_count)) + return -EINVAL; + + ret = net_shaper_parse_leaf(binding, attr, info, + node, &leaves[i]); + if (ret) + return ret; + + /* Reject duplicates */ + for (j = 0; j < i; j++) { + if (net_shaper_handle_cmp(&leaves[i].handle, + &leaves[j].handle)) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "Duplicate leaf shaper %d:%d", + leaves[i].handle.scope, + leaves[i].handle.id); + return -EINVAL; + } + + i++; + } + + return 0; +} + static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, @@ -1197,10 +1237,9 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; - int i, ret, rem, leaves_count; + int i, ret, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; - struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; @@ -1228,19 +1267,10 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_leaves; - i = 0; - nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - if (WARN_ON_ONCE(i >= leaves_count)) - goto free_leaves; - - ret = net_shaper_parse_leaf(binding, attr, info, - &node, &leaves[i]); - if (ret) - goto free_leaves; - i++; - } + ret = net_shaper_parse_leaves(binding, info, &node, + leaves, leaves_count); + if (ret) + goto free_leaves; /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. From 6e8ae9d805d4b9ecec49bb9e457d9bae0b21b540 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:58 -0700 Subject: [PATCH 239/377] selftests: drv-net: add shaper test for duplicate leaves Add test exercising duplicate leaves. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-5-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/shaper.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/shaper.py b/tools/testing/selftests/drivers/net/shaper.py index 11310f19bfa0..e39d270e688d 100755 --- a/tools/testing/selftests/drivers/net/shaper.py +++ b/tools/testing/selftests/drivers/net/shaper.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true, KsftSkipEx +import errno + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_raises, ksft_true, KsftSkipEx from lib.py import EthtoolFamily, NetshaperFamily from lib.py import NetDrvEnv from lib.py import NlError @@ -438,6 +441,21 @@ def queue_update(cfg, nl_shaper) -> None: nl_shaper.delete({'ifindex': cfg.ifindex, 'handle': {'scope': 'queue', 'id': i}}) +def dup_leaves(cfg, nl_shaper) -> None: + """ Ensure that the kernel rejects duplicate leaves. """ + if not cfg.groups: + raise KsftSkipEx("device does not support node scope") + + with ksft_raises(NlError) as cm: + nl_shaper.group({ + 'ifindex': cfg.ifindex, + 'leaves':[{'handle': {'scope': 'queue', 'id': 0}}, + {'handle': {'scope': 'queue', 'id': 0}}], + 'handle': {'scope':'node'}, + 'metric': 'bps', + 'bw-max': 10000}) + ksft_eq(cm.exception.error, errno.EINVAL) + def main() -> None: with NetDrvEnv(__file__, queue_count=4) as cfg: cfg.queues = False @@ -453,7 +471,9 @@ def main() -> None: basic_groups, qgroups, delegation, - queue_update], args=(cfg, NetshaperFamily())) + dup_leaves, + queue_update], + args=(cfg, NetshaperFamily())) ksft_exit() From 8054f85b83f42a37d482fc77ea7c9ff06a9407d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:59 -0700 Subject: [PATCH 240/377] net: shaper: set ret to -ENOMEM when genlmsg_new() fails in group_doit genlmsg_new() alloc failure path in net_shaper_nl_group_doit() forgets to set ret before jumping to error handling. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-6-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index c8960821cf23..12e5e0c18643 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1276,8 +1276,10 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) * rollback on allocation failure. */ msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); - if (!msg) + if (!msg) { + ret = -ENOMEM; goto free_leaves; + } hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { From 0f9a857e34d0f8c018a3e4435c6f0e92e8d2f38c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:00 -0700 Subject: [PATCH 241/377] net: shaper: fix undersized reply skb allocation in GROUP command net_shaper_group_send_reply() writes both the NET_SHAPER_A_IFINDEX attribute (via net_shaper_fill_binding()) and the nested NET_SHAPER_A_HANDLE attribute (via net_shaper_fill_handle()), but the reply skb at the call site in net_shaper_nl_group_doit() is allocated using net_shaper_handle_size(), which only accounts for the nested handle. The allocation is therefore short by nla_total_size(sizeof(u32)) (8 bytes) for the IFINDEX attribute. In practice the slab allocator rounds up the small allocation so the bug is latent, but the size accounting is wrong and could bite if the reply grew further. Introduce net_shaper_group_reply_size() that accounts for the full reply payload and use it both at the genlmsg_new() call site and in the defensive WARN_ONCE message. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-7-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 12e5e0c18643..08fde2d9e8aa 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -90,6 +90,12 @@ static int net_shaper_handle_size(void) nla_total_size(sizeof(u32))); } +static int net_shaper_group_reply_size(void) +{ + return nla_total_size(sizeof(u32)) + /* NET_SHAPER_A_IFINDEX */ + net_shaper_handle_size(); /* NET_SHAPER_A_HANDLE */ +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -1227,7 +1233,7 @@ static int net_shaper_group_send_reply(struct net_shaper_binding *binding, free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", - net_shaper_handle_size()); + net_shaper_group_reply_size()); nlmsg_free(msg); return -EMSGSIZE; } @@ -1275,7 +1281,7 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ - msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); + msg = genlmsg_new(net_shaper_group_reply_size(), GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto free_leaves; From fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:01 -0700 Subject: [PATCH 242/377] tools: ynl: add scope qualifier for definitions Using definitions in kernel policies is awkward right now. On one hand we want defines for max values and such. On the other we don't have a way of adding kernel-only defines. Adding unnecessary defines to uAPI is a bad idea, we won't be able to delete them. And when it comes to policy user space should just query it via the policy dump, not use hard coded defines. Add a "scope" property to definitions, which will let us tell the codegen that a definition is for kernel use only. Support following values: - uapi: render into the uAPI header (default, today's behavior) - kernel: render to kernel header only - user: same as kernel but for the user-side generated header Definitions may have a header property (definition is "external", provided by existing header). Extend the scope to headers, too. If definition has both scope and header properties we will only generate the includes in the right scope. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-8-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/genetlink-c.yaml | 9 ++++++ Documentation/netlink/genetlink-legacy.yaml | 9 ++++++ Documentation/netlink/genetlink.yaml | 9 ++++++ Documentation/netlink/netlink-raw.yaml | 9 ++++++ tools/net/ynl/pyynl/ynl_gen_c.py | 31 +++++++++++++++++++-- 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 57f59fe23e3f..4ea31e8fc4d1 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -69,6 +69,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags ] doc: diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 66fb8653a344..f9c44747729a 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -83,6 +83,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags, struct ] # Trim doc: diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index a1194d5d93fc..d3f3f3399ddf 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -55,6 +55,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags ] doc: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index dd98dda55bd0..4c436b59a34b 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -87,6 +87,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags, struct ] # Trim doc: diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 0e1e486c1185..cdc3646f2642 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -3212,6 +3212,8 @@ def render_uapi(family, cw): for const in family['definitions']: if const.get('header'): continue + if const.get('scope', 'uapi') != 'uapi': + continue if const['type'] != 'const': cw.writes_defines(defines) @@ -3339,6 +3341,25 @@ def render_uapi(family, cw): cw.p(f'#endif /* {hdr_prot} */') +def render_scoped_consts(family, cw, scope): + defines = [] + for const in family['definitions']: + if const['type'] != 'const': + continue + if const.get('header'): + continue + if const.get('scope') != scope: + continue + name_pfx = const.get('name-prefix', f"{family.ident_name}-") + defines.append([ + c_upper(family.get('c-define-name', + f"{name_pfx}{const['name']}")), + const['value']]) + if defines: + cw.writes_defines(defines) + cw.nl() + + def _render_user_ntf_entry(ri, op): if not ri.family.is_classic(): ri.cw.block_start(line=f"[{op.enum_name}] = ") @@ -3504,8 +3525,12 @@ def main(): cw.p('#include "ynl.h"') headers = [] for definition in parsed['definitions'] + parsed['attribute-sets']: - if 'header' in definition: - headers.append(definition['header']) + if 'header' not in definition: + continue + scope = definition.get('scope', 'uapi') + if scope != 'uapi' and scope != args.mode: + continue + headers.append(definition['header']) if args.mode == 'user': headers.append(parsed.uapi_header) seen_header = [] @@ -3522,6 +3547,7 @@ def main(): for one in args.user_header: cw.p(f'#include "{one}"') else: + render_scoped_consts(parsed, cw, 'user') cw.p('struct ynl_sock;') cw.nl() render_user_family(parsed, cw, True) @@ -3529,6 +3555,7 @@ def main(): if args.mode == "kernel": if args.header: + render_scoped_consts(parsed, cw, 'kernel') for _, struct in sorted(parsed.pure_nested_structs.items()): if struct.request: cw.p('/* Common nested types */') From 8d5806c600fddb907ebe378f9c366d4b52ac3a39 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:02 -0700 Subject: [PATCH 243/377] net: shaper: reject handle IDs exceeding internal bit-width net_shaper_parse_handle() reads the user-supplied handle ID via nla_get_u32(), accepting the full u32 range. However, the xarray key is built by net_shaper_handle_to_index() using FIELD_PREP(NET_SHAPER_ID_MASK, handle->id), where NET_SHAPER_ID_MASK is GENMASK(25, 0) - only 26 bits wide. FIELD_PREP silently masks off the upper bits at runtime. A user-supplied NODE id like 0x04000123 becomes id 0x123. Additionally, a user-supplied id equal to NET_SHAPER_ID_UNSPEC (0x03FFFFFF, which is NET_SHAPER_ID_MASK itself) would collide with the sentinel used internally by the group operation to signal "allocate a new NODE id". Reject user-supplied IDs >= NET_SHAPER_ID_MASK (i.e., >= 0x03FFFFFF) in the policy. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-9-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/net_shaper.yaml | 7 +++++++ net/shaper/shaper.c | 4 +++- net/shaper/shaper_nl_gen.c | 7 ++++++- net/shaper/shaper_nl_gen.h | 2 ++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml index 3f2ad772b64b..de01f922040a 100644 --- a/Documentation/netlink/specs/net_shaper.yaml +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -33,6 +33,11 @@ doc: | @cap-get operation. definitions: + - + type: const + name: max-handle-id + value: 0x3fffffe + scope: kernel - type: enum name: scope @@ -140,6 +145,8 @@ attribute-sets: - name: id type: u32 + checks: + max: max-handle-id doc: | Numeric identifier of a shaper. The id semantic depends on the scope. For @queue scope it's the queue id and for @node diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 08fde2d9e8aa..eb049847fed6 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -21,6 +21,8 @@ #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK +static_assert(NET_SHAPER_ID_UNSPEC == NET_SHAPER_MAX_HANDLE_ID + 1); + struct net_shaper_hierarchy { struct xarray shapers; }; @@ -360,7 +362,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; - handle->id = NET_SHAPER_ID_MASK - 1; + handle->id = NET_SHAPER_MAX_HANDLE_ID; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 9b29be3ef19a..76eff85ec66d 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -11,10 +11,15 @@ #include +/* Integer value ranges */ +static const struct netlink_range_validation net_shaper_a_handle_id_range = { + .max = NET_SHAPER_MAX_HANDLE_ID, +}; + /* Common nested types */ const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), - [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &net_shaper_a_handle_id_range), }; const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 42c46c52c775..2406652a9014 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -12,6 +12,8 @@ #include +#define NET_SHAPER_MAX_HANDLE_ID 67108862 + /* Common nested types */ extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; From b62b29e6de6711f5918940aa6ff2bbab6d6af502 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:03 -0700 Subject: [PATCH 244/377] net: shaper: enforce singleton NETDEV scope with id 0 The NETDEV scope represents a singleton root shaper in the per-device hierarchy. All code assumes NETDEV shapers have id 0: net_shaper_default_parent() hardcodes parent->id = 0 when returning the NETDEV parent for QUEUE/NODE children, and the UAPI documentation describes NETDEV scope as "the main shaper" (singular, not plural). Make sure we reject non-0 IDs. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-10-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index eb049847fed6..4ae3ee6764a0 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -482,6 +482,12 @@ static int net_shaper_parse_handle(const struct nlattr *attr, else if (handle->scope == NET_SHAPER_SCOPE_NODE) id = NET_SHAPER_ID_UNSPEC; + if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_ATTR(info->extack, id_attr, + "Netdev scope is a singleton, must use ID 0"); + return -EINVAL; + } + handle->id = id; return 0; } From ce372e869f9f492f3d5aa9a0ae75ed52c61d2d6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:04 -0700 Subject: [PATCH 245/377] net: shaper: reject QUEUE scope handle with missing id net_shaper_parse_handle() does not enforce that the user provides the handle ID. For NODE the ID defaults to UNSPEC for all other cases it defaults to 0. For NETDEV 0 is the only option. For QUEUE defaulting to 0 makes less intuitive sense. Specifically because the behavior should (IMHO) be the same for all cases where there may be more than one ID (QUEUE and NODE). We should either document this as intentional or reject. I picked the latter with no strong conviction. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 4ae3ee6764a0..b1c65110f04d 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -477,10 +477,15 @@ static int net_shaper_parse_handle(const struct nlattr *attr, * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; - if (id_attr) + if (id_attr) { id = nla_get_u32(id_attr); - else if (handle->scope == NET_SHAPER_SCOPE_NODE) + } else if (handle->scope == NET_SHAPER_SCOPE_NODE) { id = NET_SHAPER_ID_UNSPEC; + } else if (handle->scope == NET_SHAPER_SCOPE_QUEUE) { + NL_SET_ERR_ATTR_MISS(info->extack, attr, + NET_SHAPER_A_HANDLE_ID); + return -EINVAL; + } if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { NL_SET_ERR_MSG_ATTR(info->extack, id_attr, From 603ab5ea6482c723216b59cb733e8ba248619ee9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 May 2026 21:55:23 -0500 Subject: [PATCH 246/377] SMB3.1.1: add missing QUERY_DIR info levels New Infolevels for QUERY_DIR (and QUERY_INFO) levels 78 through 81 are now being used by Windows clients and were added to the documentation. Add defines for them (and correct some typos in documentation). See MS-SMB2 2.2.33 and MS-FSCC 2.4 Signed-off-by: Steve French --- fs/smb/common/fscc.h | 4 ++-- fs/smb/common/smb2pdu.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/common/fscc.h b/fs/smb/common/fscc.h index b4ccddca9256..bc3012cc295d 100644 --- a/fs/smb/common/fscc.h +++ b/fs/smb/common/fscc.h @@ -260,12 +260,12 @@ typedef struct { char FileName[]; } __packed FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ -/* See MS-FSCC 2.4.13 */ +/* See MS-FSCC 2.4.14 */ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ -/* See MS-FSCC 2.4.14 */ +/* See MS-FSCC 2.4.15 */ typedef struct { __le32 NextEntryOffset; __u32 FileIndex; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index a4b12eb8df81..aeb0a245c532 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1566,6 +1566,10 @@ struct validate_negotiate_info_rsp { #define FILE_STANDARD_LINK_INFORMATION 54 #define FILE_ID_INFORMATION 59 #define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */ +#define FileId64ExtdDirectoryInformation 78 /* also for QUERY_DIR */ +#define FileId64ExtdBothDirectoryInformation 79 /* also for QUERY_DIR */ +#define FileIdAllExtdDirectoryInformation 80 /* also for QUERY_DIR */ +#define FileIdAllExtdBothDirectoryInformation 81 /* also for QUERY_DIR */ /* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */ #define SMB_FIND_FILE_POSIX_INFO 0x064 From 17ee873dba04d05090dfc5b2b9e08cfc8e4f147f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 3 Mar 2026 10:48:54 +0100 Subject: [PATCH 247/377] HID: hid-sjoy: race between init and usage The driver uses an initial IO to set the device to a default state. That initialization is currently being done after the device node has been created. That means that the single buffer used for output can be altered while IO is in progress. Move the intialization before announcement to user space. Fixes: fac733f029251 ("HID: force feedback support for SmartJoy PLUS PS2/USB adapter") Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-sjoy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-sjoy.c b/drivers/hid/hid-sjoy.c index bab93d71b760..963c45113204 100644 --- a/drivers/hid/hid-sjoy.c +++ b/drivers/hid/hid-sjoy.c @@ -91,17 +91,17 @@ static int sjoyff_init(struct hid_device *hid) set_bit(FF_RUMBLE, dev->ffbit); - error = input_ff_create_memless(dev, sjoyff, hid_sjoyff_play); - if (error) { - kfree(sjoyff); - return error; - } - sjoyff->report = report; sjoyff->report->field[0]->value[0] = 0x01; sjoyff->report->field[0]->value[1] = 0x00; sjoyff->report->field[0]->value[2] = 0x00; hid_hw_request(hid, sjoyff->report, HID_REQ_SET_REPORT); + + error = input_ff_create_memless(dev, sjoyff, hid_sjoyff_play); + if (error) { + kfree(sjoyff); + return error; + } } hid_info(hid, "Force feedback for SmartJoy PLUS PS2/USB adapter\n"); From 637ad3a56a3b889527d1dacea6fea2a8bd648140 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Mon, 11 May 2026 22:51:51 +0100 Subject: [PATCH 248/377] block: don't overwrite bip_vcnt in bio_integrity_copy_user() bio_integrity_add_page() already sets bip_vcnt to 1 for the bounce segment. Overwriting it with nr_vecs breaks bip_vcnt <= bip_max_vcnt on WRITE (bip_max_vcnt is 1), so the gap-merge checks in block/blk.h read past the bip_vec[] flex array. On READ the read is in bounds but lands on a saved user bvec instead of the bounce. The line was added for split propagation, but bio_integrity_clone() doesn't copy bip_vcnt and BIP_CLONE_FLAGS excludes BIP_COPY_USER. Fixes: 3991657ae707 ("block: set bip_vcnt correctly") Signed-off-by: David Carlier Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260511215151.346228-1-devnexen@gmail.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index e54c6e06e1cb..78e678d4104b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -308,7 +308,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, } bip->bip_flags |= BIP_COPY_USER; - bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); From 2c6e6a18a37b905cb584eb0dda3ae482162a81ca Mon Sep 17 00:00:00 2001 From: Casey Chen Date: Mon, 11 May 2026 15:22:30 -0600 Subject: [PATCH 249/377] block: recompute nr_integrity_segments in blk_insert_cloned_request blk_insert_cloned_request() already recomputes nr_phys_segments against the bottom queue, because "the queue settings related to segment counting may differ from the original queue." The exact same reasoning applies to integrity segments: a stacked driver's underlying queue can have tighter virt_boundary_mask, seg_boundary_mask, or max_segment_size than the top queue, in which case blk_rq_count_integrity_sg() against the bottom queue produces a different count than the cached rq->nr_integrity_segments inherited from the source request by blk_rq_prep_clone(). When the cached count is lower than the bottom queue's actual count, blk_rq_map_integrity_sg() trips BUG_ON(segments > rq->nr_integrity_segments); on dispatch. The same families of stacked setups that motivated the existing nr_phys_segments recompute -- dm-multipath fanning out to nvme-rdma in particular -- can produce this. Mirror the nr_phys_segments handling: when the request carries integrity, recompute nr_integrity_segments against the bottom queue and reject the request if it exceeds the bottom queue's max_integrity_segments. blk_rq_count_integrity_sg() and queue_max_integrity_segments() are both already available via , which blk-mq.c includes. This closes a latent gap in the stacking contract and brings the integrity-segment accounting in line with the existing phys-segment accounting. Fixes: 76c313f658d2 ("blk-integrity: improved sg segment mapping") Signed-off-by: Casey Chen Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260511212230.27511-1-cachen@purestorage.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c5c16cce4f8..d0c37daf568f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3307,6 +3307,25 @@ blk_status_t blk_insert_cloned_request(struct request *rq) return BLK_STS_IOERR; } + /* + * Integrity segment counting depends on the same queue limits + * (virt_boundary_mask, seg_boundary_mask, max_segment_size) that + * vary across stacked queues, so recompute against the bottom + * queue just like nr_phys_segments above. + */ + if (blk_integrity_rq(rq) && rq->bio) { + unsigned short max_int_segs = queue_max_integrity_segments(q); + + rq->nr_integrity_segments = + blk_rq_count_integrity_sg(rq->q, rq->bio); + if (rq->nr_integrity_segments > max_int_segs) { + printk(KERN_ERR "%s: over max integrity segments limit. (%u > %u)\n", + __func__, rq->nr_integrity_segments, + max_int_segs); + return BLK_STS_IOERR; + } + } + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; From 5f90dcfa8dc32a488581b78e575cdd7808ba5c78 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 5 Feb 2026 09:11:31 +0100 Subject: [PATCH 250/377] HID: quirks: really enable the intended work around for appledisplay Commit c7fabe4ad921 ("HID: quirks: work around VID/PID conflict for appledisplay") intends to add a quirk for kernels built with Apple Cinema Display support, but it refers to the non-existing config option CONFIG_APPLEDISPLAY, whereas the config option for Apple Cinema Display support is named CONFIG_USB_APPLEDISPLAY. Refer to the intended config option CONFIG_USB_APPLEDISPLAY in the ifdef directive. Fixes: c7fabe4ad921 ("HID: quirks: work around VID/PID conflict for appledisplay") Signed-off-by: Lukas Bulwahn Signed-off-by: Jiri Kosina --- drivers/hid/hid-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 9e88c9d6c6dc..512049963978 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -235,7 +235,7 @@ static const struct hid_device_id hid_quirks[] = { * used as a driver. See hid_scan_report(). */ static const struct hid_device_id hid_have_special_driver[] = { -#if IS_ENABLED(CONFIG_APPLEDISPLAY) +#if IS_ENABLED(CONFIG_USB_APPLEDISPLAY) { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x9218) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x9219) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x921c) }, From 8582792cf23b3d94674d4d838f7cde9a28d0fcaf Mon Sep 17 00:00:00 2001 From: Sungwoo Kim Date: Tue, 12 May 2026 01:09:29 -0400 Subject: [PATCH 251/377] block: bio-integrity: Fix null-ptr-deref in bio_integrity_map_user() pin_user_pages_fast() can partially succeed and return the number of pages that were actually pinned. However, the bio_integrity_map_user() does not handle this partial pinning. This leads to a general protection fault since bvec_from_pages() dereferences an unpinned page address, which is 0. To fix this, add a check to verify that all requested memory is pinned. If partial pinning occurs, unpin the memory and return -EFAULT. Kernel Oops: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 UID: 0 PID: 1061 Comm: nvme-passthroug Not tainted 7.0.0-11783-g90957f9314e8-dirty #16 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 RIP: 0010:bio_integrity_map_user.cold+0x1b0/0x9d6 Fixes: 492c5d455969 ("block: bio-integrity: directly map user buffers") Acked-by: Chao Shi Acked-by: Weidong Zhu Acked-by: Dave Tian Signed-off-by: Sungwoo Kim Tested-by: Shin'ichiro Kawasaki Link: https://github.com/linux-blktests/blktests/pull/244 Link: https://patch.msgid.link/20260512050929.541397-2-iam@sung-woo.kim Signed-off-by: Jens Axboe --- block/bio-integrity.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 78e678d4104b..e796de1a749e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -402,6 +402,24 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) if (unlikely(ret < 0)) goto free_bvec; + /* + * Handle partial pinning. This can happen when pin_user_pages_fast() + * returns fewer pages than requested. + */ + if (user_backed_iter(iter) && unlikely(ret != bytes)) { + if (ret > 0) { + int npinned = DIV_ROUND_UP(offset + ret, PAGE_SIZE); + int i; + + for (i = 0; i < npinned; i++) + unpin_user_page(pages[i]); + } + if (pages != stack_pages) + kvfree(pages); + ret = -EFAULT; + goto free_bvec; + } + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset, &is_p2p); if (pages != stack_pages) From 11f152c0acaa924d93339000cb785d34e003aff5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 21 Apr 2026 16:27:01 +0200 Subject: [PATCH 252/377] xen/arm: Replace __ASSEMBLY__ with __ASSEMBLER__ in interface.h While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize now on the __ASSEMBLER__ macro that is provided by the compilers. Signed-off-by: Thomas Huth Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20260421142701.548978-1-thuth@redhat.com> --- include/xen/arm/interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xen/arm/interface.h b/include/xen/arm/interface.h index c3eada2642aa..61360b89da40 100644 --- a/include/xen/arm/interface.h +++ b/include/xen/arm/interface.h @@ -30,7 +30,7 @@ #define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Explicitly size integers that represent pfns in the interface with * Xen so that we can have one ABI that works for 32 and 64 bit guests. * Note that this means that the xen_pfn_t type may be capable of From f097d246677b03db814c5862f368cea341b76a00 Mon Sep 17 00:00:00 2001 From: Florian Pradines Date: Sat, 9 May 2026 09:45:17 +0000 Subject: [PATCH 253/377] HID: mcp2221: fix OOB write in mcp2221_raw_event() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mcp2221_raw_event() copies device-supplied data into mcp->rxbuf at offset rxbuf_idx without checking that the copy fits within the destination buffer. A device responding with up to 60 bytes to a small I2C/SMBus read can overflow the buffer. Add a rxbuf_size field to struct mcp2221, set it alongside rxbuf in mcp_i2c_smbus_read(), and check rxbuf_idx + data[3] <= rxbuf_size before the memcpy. Reported-by: Benoît Sevens Signed-off-by: Florian Pradines Signed-off-by: Jiri Kosina --- drivers/hid/hid-mcp2221.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c index be80970ab48e..e4ddd8e9293b 100644 --- a/drivers/hid/hid-mcp2221.c +++ b/drivers/hid/hid-mcp2221.c @@ -128,6 +128,7 @@ struct mcp2221 { u8 *rxbuf; u8 txbuf[64]; int rxbuf_idx; + int rxbuf_size; int status; u8 cur_i2c_clk_div; struct gpio_chip *gc; @@ -330,12 +331,14 @@ static int mcp_i2c_smbus_read(struct mcp2221 *mcp, mcp->txbuf[3] = (u8)(msg->addr << 1); total_len = msg->len; mcp->rxbuf = msg->buf; + mcp->rxbuf_size = msg->len; } else { mcp->txbuf[1] = smbus_len; mcp->txbuf[2] = 0; mcp->txbuf[3] = (u8)(smbus_addr << 1); total_len = smbus_len; mcp->rxbuf = smbus_buf; + mcp->rxbuf_size = smbus_len; } ret = mcp_send_data_req_status(mcp, mcp->txbuf, 4); @@ -919,6 +922,10 @@ static int mcp2221_raw_event(struct hid_device *hdev, mcp->status = -EINVAL; break; } + if (mcp->rxbuf_idx + data[3] > mcp->rxbuf_size) { + mcp->status = -EINVAL; + break; + } buf = mcp->rxbuf; memcpy(&buf[mcp->rxbuf_idx], &data[4], data[3]); mcp->rxbuf_idx = mcp->rxbuf_idx + data[3]; From d93ba918a185aca2594da63e92fdc5495b559c0f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 16 Apr 2026 14:16:54 +0100 Subject: [PATCH 254/377] HID: magicmouse: Prevent out-of-bounds (OOB) read during DOUBLE_REPORT_ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is currently possible for a malicious or misconfigured USB device to cause an out-of-bounds (OOB) read when submitting reports using DOUBLE_REPORT_ID by specifying a large report length and providing a smaller one. Let's prevent that by comparing the specified report length with the actual size of the data read in from userspace. If the actual data length ends up being smaller than specified, we'll politely warn the user and prevent any further processing. Signed-off-by: Lee Jones Reviewed-by: Günther Noack Signed-off-by: Jiri Kosina --- drivers/hid/hid-magicmouse.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c index e70bd3dc07ab..802a3479e24b 100644 --- a/drivers/hid/hid-magicmouse.c +++ b/drivers/hid/hid-magicmouse.c @@ -390,6 +390,10 @@ static int magicmouse_raw_event(struct hid_device *hdev, struct input_dev *input = msc->input; int x = 0, y = 0, ii, clicks = 0, npoints; + /* Protect against zero sized recursive calls from DOUBLE_REPORT_ID */ + if (size < 1) + return 0; + switch (data[0]) { case TRACKPAD_REPORT_ID: case TRACKPAD2_BT_REPORT_ID: @@ -490,6 +494,18 @@ static int magicmouse_raw_event(struct hid_device *hdev, /* Sometimes the trackpad sends two touch reports in one * packet. */ + + /* Ensure that we have at least 2 elements (report type and size) */ + if (size < 2) + return 0; + + if (size < data[1] + 2) { + hid_warn(hdev, + "received report length (%d) was smaller than specified (%d)", + size, data[1] + 2); + return 0; + } + magicmouse_raw_event(hdev, report, data + 2, data[1]); magicmouse_raw_event(hdev, report, data + 2 + data[1], size - 2 - data[1]); From cac61b58a3b6340c52afa06bb15eac033158db2f Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 17 Apr 2026 08:47:02 -0700 Subject: [PATCH 255/377] HID: playstation: Clamp num_touch_reports A device would never lie about the number of touch reports would it? If it does the loop in dualshock4_parse_report will read off the end of the touch_reports array, up to about 2 KiB for the maximum number of 256 loop iteraions. The data that is read is emitted via evdev if the DS4_TOUCH_POINT_INACTIVE bit happens to be set. Protect against this by clamping the num_touch_reports value provided by the device to the maximum size of the touch_reports array. Fixes: 752038248808 ("HID: playstation: add DualShock4 touchpad support.") Cc: stable@vger.kernel.org Reported-by: Xingyu Jin Signed-off-by: T.J. Mercier Signed-off-by: Jiri Kosina --- drivers/hid/hid-playstation.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index c43caac20b61..e48537331675 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -2384,7 +2384,8 @@ static int dualshock4_parse_report(struct ps_device *ps_dev, struct hid_report * } ds4_report = &usb->common; - num_touch_reports = usb->num_touch_reports; + num_touch_reports = min_t(u8, usb->num_touch_reports, + ARRAY_SIZE(usb->touch_reports)); touch_reports = usb->touch_reports; } else if (hdev->bus == BUS_BLUETOOTH && report->id == DS4_INPUT_REPORT_BT && size == DS4_INPUT_REPORT_BT_SIZE) { @@ -2404,7 +2405,8 @@ static int dualshock4_parse_report(struct ps_device *ps_dev, struct hid_report * } ds4_report = &bt->common; - num_touch_reports = bt->num_touch_reports; + num_touch_reports = min_t(u8, bt->num_touch_reports, + ARRAY_SIZE(bt->touch_reports)); touch_reports = bt->touch_reports; } else if (hdev->bus == BUS_BLUETOOTH && report->id == DS4_INPUT_REPORT_BT_MINIMAL && From 4db2af929279c799b5653a39eb0795c72baffca4 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Mon, 20 Apr 2026 14:13:17 +0900 Subject: [PATCH 256/377] HID: appletb-kbd: fix UAF in inactivity-timer cleanup path Commit 38224c472a03 ("HID: appletb-kbd: fix slab use-after-free bug in appletb_kbd_probe") added timer_delete_sync(&kbd->inactivity_timer) to both the probe close_hw error path and appletb_kbd_remove(), but the way it was wired in left the inactivity timer reachable during driver tear-down via two distinct windows. Window A -- put_device() before timer_delete_sync(): put_device(&kbd->backlight_dev->dev); timer_delete_sync(&kbd->inactivity_timer); The inactivity_timer softirq reads kbd->backlight_dev and calls backlight_device_set_brightness() -> mutex_lock(&ops_lock). If a concurrent hid_appletb_bl unbind drops the last devm reference between these two calls, the backlight_device is freed and the mutex_lock() touches freed memory. Window B -- backlight cleanup before hid_hw_stop(): if (kbd->backlight_dev) { timer_delete_sync(...); put_device(...); } hid_hw_close(hdev); hid_hw_stop(hdev); Even after Window A is closed, hid_hw_close()/hid_hw_stop() still run afterwards, so a late ".event" callback from the HID core (USB URB completion on real Apple hardware) can arrive after timer_delete_sync() drained the softirq but before put_device() drops the reference. That callback reaches reset_inactivity_timer(), which calls mod_timer() and re-arms the timer. The freshly re-armed timer can then fire on the about-to-be-freed backlight_device. Both windows produce the same KASAN slab-use-after-free: BUG: KASAN: slab-use-after-free in __mutex_lock+0x1aab/0x21c0 Read of size 8 at addr ffff88803ee9a108 by task swapper/0/0 Call Trace: __mutex_lock backlight_device_set_brightness appletb_inactivity_timer call_timer_fn run_timer_softirq handle_softirqs Allocated by task N: devm_backlight_device_register appletb_bl_probe Freed by task M: (concurrent hid_appletb_bl unbind path) Close both windows at once by reworking the tear-down in appletb_kbd_remove() and in the probe close_hw error path so that 1) hid_hw_close()/hid_hw_stop() run before the backlight cleanup, guaranteeing no further .event callback can fire and re-arm the timer, and 2) inside the "if (kbd->backlight_dev)" block, timer_delete_sync() runs before put_device(), so the softirq is drained before the final reference is dropped. Fixes: 38224c472a03 ("HID: appletb-kbd: fix slab use-after-free bug in appletb_kbd_probe") Cc: stable@vger.kernel.org Signed-off-by: Sangyun Kim Signed-off-by: Jiri Kosina --- drivers/hid/hid-appletb-kbd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c index 0fdc0968b9ef..8feac9e3589b 100644 --- a/drivers/hid/hid-appletb-kbd.c +++ b/drivers/hid/hid-appletb-kbd.c @@ -440,13 +440,13 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id unregister_handler: input_unregister_handler(&kbd->inp_handler); close_hw: - if (kbd->backlight_dev) { - put_device(&kbd->backlight_dev->dev); - timer_delete_sync(&kbd->inactivity_timer); - } hid_hw_close(hdev); stop_hw: hid_hw_stop(hdev); + if (kbd->backlight_dev) { + timer_delete_sync(&kbd->inactivity_timer); + put_device(&kbd->backlight_dev->dev); + } return ret; } @@ -457,13 +457,13 @@ static void appletb_kbd_remove(struct hid_device *hdev) appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_OFF); input_unregister_handler(&kbd->inp_handler); - if (kbd->backlight_dev) { - put_device(&kbd->backlight_dev->dev); - timer_delete_sync(&kbd->inactivity_timer); - } - hid_hw_close(hdev); hid_hw_stop(hdev); + + if (kbd->backlight_dev) { + timer_delete_sync(&kbd->inactivity_timer); + put_device(&kbd->backlight_dev->dev); + } } static int appletb_kbd_suspend(struct hid_device *hdev, pm_message_t msg) From 1654e53349d4e657b331de354313461f401f5063 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Mon, 20 Apr 2026 14:13:18 +0900 Subject: [PATCH 257/377] HID: appletb-kbd: run inactivity autodim from workqueues The autodim code in hid-appletb-kbd takes backlight_device->ops_lock via backlight_device_set_brightness() -> mutex_lock() from two different atomic contexts: * appletb_inactivity_timer() is a struct timer_list callback, so it runs in softirq context. Every expiry triggers BUG: sleeping function called from invalid context at kernel/locking/mutex.c:591 Call Trace: __might_resched __mutex_lock backlight_device_set_brightness appletb_inactivity_timer call_timer_fn run_timer_softirq * reset_inactivity_timer() is called from appletb_kbd_hid_event() and appletb_kbd_inp_event(). On real USB hardware these run in softirq/IRQ context (URB completion and input-event dispatch). When the Touch Bar has already been dimmed or turned off, the reset path calls backlight_device_set_brightness() directly to restore brightness, producing the same warning. Both call sites hit the same mutex_lock()-from-atomic bug. Fix them together by moving the blocking work onto the system workqueue: * Convert the inactivity timer from struct timer_list to struct delayed_work; the callback (appletb_inactivity_work) now runs in process context where mutex_lock() is legal. * Add a dedicated struct work_struct restore_brightness_work and have reset_inactivity_timer() schedule it instead of calling backlight_device_set_brightness() directly. Cancel both works synchronously during driver tear-down alongside the existing backlight reference drop. The semantics are unchanged (same delays, same state transitions on dim, turn-off and user activity); only the execution context of the sleeping call changes. The timer field and callback are renamed to match their new type; reset_inactivity_timer() keeps its name because it is invoked from input event paths that read naturally as "reset the inactivity timer". Fixes: 93a0fc489481 ("HID: hid-appletb-kbd: add support for automatic brightness control while using the touchbar") Cc: stable@vger.kernel.org Signed-off-by: Sangyun Kim Signed-off-by: Jiri Kosina --- drivers/hid/hid-appletb-kbd.c | 44 ++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c index 8feac9e3589b..462010a75899 100644 --- a/drivers/hid/hid-appletb-kbd.c +++ b/drivers/hid/hid-appletb-kbd.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "hid-ids.h" @@ -62,7 +62,8 @@ struct appletb_kbd { struct input_handle kbd_handle; struct input_handle tpd_handle; struct backlight_device *backlight_dev; - struct timer_list inactivity_timer; + struct delayed_work inactivity_work; + struct work_struct restore_brightness_work; bool has_dimmed; bool has_turned_off; u8 saved_mode; @@ -164,16 +165,18 @@ static int appletb_tb_key_to_slot(unsigned int code) } } -static void appletb_inactivity_timer(struct timer_list *t) +static void appletb_inactivity_work(struct work_struct *work) { - struct appletb_kbd *kbd = timer_container_of(kbd, t, inactivity_timer); + struct appletb_kbd *kbd = container_of(to_delayed_work(work), + struct appletb_kbd, + inactivity_work); if (kbd->backlight_dev && appletb_tb_autodim) { if (!kbd->has_dimmed) { backlight_device_set_brightness(kbd->backlight_dev, 1); kbd->has_dimmed = true; - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_idle_timeout)); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_idle_timeout)); } else if (!kbd->has_turned_off) { backlight_device_set_brightness(kbd->backlight_dev, 0); kbd->has_turned_off = true; @@ -181,16 +184,25 @@ static void appletb_inactivity_timer(struct timer_list *t) } } +static void appletb_restore_brightness_work(struct work_struct *work) +{ + struct appletb_kbd *kbd = container_of(work, struct appletb_kbd, + restore_brightness_work); + + if (kbd->backlight_dev) + backlight_device_set_brightness(kbd->backlight_dev, 2); +} + static void reset_inactivity_timer(struct appletb_kbd *kbd) { if (kbd->backlight_dev && appletb_tb_autodim) { if (kbd->has_dimmed || kbd->has_turned_off) { - backlight_device_set_brightness(kbd->backlight_dev, 2); kbd->has_dimmed = false; kbd->has_turned_off = false; + schedule_work(&kbd->restore_brightness_work); } - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_dim_timeout)); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_dim_timeout)); } } @@ -408,9 +420,11 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id dev_err_probe(dev, -ENODEV, "Failed to get backlight device\n"); } else { backlight_device_set_brightness(kbd->backlight_dev, 2); - timer_setup(&kbd->inactivity_timer, appletb_inactivity_timer, 0); - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_dim_timeout)); + INIT_DELAYED_WORK(&kbd->inactivity_work, appletb_inactivity_work); + INIT_WORK(&kbd->restore_brightness_work, + appletb_restore_brightness_work); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_dim_timeout)); } kbd->inp_handler.event = appletb_kbd_inp_event; @@ -444,7 +458,8 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id stop_hw: hid_hw_stop(hdev); if (kbd->backlight_dev) { - timer_delete_sync(&kbd->inactivity_timer); + cancel_delayed_work_sync(&kbd->inactivity_work); + cancel_work_sync(&kbd->restore_brightness_work); put_device(&kbd->backlight_dev->dev); } return ret; @@ -461,7 +476,8 @@ static void appletb_kbd_remove(struct hid_device *hdev) hid_hw_stop(hdev); if (kbd->backlight_dev) { - timer_delete_sync(&kbd->inactivity_timer); + cancel_delayed_work_sync(&kbd->inactivity_work); + cancel_work_sync(&kbd->restore_brightness_work); put_device(&kbd->backlight_dev->dev); } } From b08665fe80fab0956e64741c07d9bbcec635c34d Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Fri, 24 Apr 2026 21:50:41 +0900 Subject: [PATCH 258/377] HID: google: hammer: stop hardware on devres action failure hammer_probe() starts the HID hardware before registering the devres action that stops it. If devm_add_action() fails, probe returns an error with the hardware still started because the cleanup action was never registered and the driver's remove callback is not called after a failed probe. Use devm_add_action_or_reset() so the stop action runs immediately on registration failure while preserving the existing devres-managed cleanup path for later probe failures and remove. Signed-off-by: Myeonghun Pak Signed-off-by: Jiri Kosina --- drivers/hid/hid-google-hammer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c index 1af477e58480..c99c3c0d442e 100644 --- a/drivers/hid/hid-google-hammer.c +++ b/drivers/hid/hid-google-hammer.c @@ -496,7 +496,7 @@ static int hammer_probe(struct hid_device *hdev, if (error) return error; - error = devm_add_action(&hdev->dev, hammer_stop, hdev); + error = devm_add_action_or_reset(&hdev->dev, hammer_stop, hdev); if (error) return error; From 2c85c61d1332e1e16f020d76951baf167dcb6f7a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:22 +0200 Subject: [PATCH 259/377] HID: pass the buffer size to hid_report_raw_event commit 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") enforced the provided data to be at least the size of the declared buffer in the report descriptor to prevent a buffer overflow. However, we can try to be smarter by providing both the buffer size and the data size, meaning that hid_report_raw_event() can make better decision whether we should plaining reject the buffer (buffer overflow attempt) or if we can safely memset it to 0 and pass it to the rest of the stack. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Acked-by: Johan Hovold Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- drivers/hid/bpf/hid_bpf_dispatch.c | 6 +++-- drivers/hid/hid-core.c | 42 ++++++++++++++++++++---------- drivers/hid/hid-gfrm.c | 4 +-- drivers/hid/hid-logitech-hidpp.c | 2 +- drivers/hid/hid-multitouch.c | 2 +- drivers/hid/hid-primax.c | 2 +- drivers/hid/hid-vivaldi-common.c | 2 +- drivers/hid/wacom_sys.c | 6 ++--- drivers/staging/greybus/hid.c | 2 +- include/linux/hid.h | 4 +-- include/linux/hid_bpf.h | 14 ++++++---- 11 files changed, 53 insertions(+), 33 deletions(-) diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 50c7b45c59e3..d0130658091b 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -24,7 +24,8 @@ EXPORT_SYMBOL(hid_ops); u8 * dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf) + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf) { struct hid_bpf_ctx_kern ctx_kern = { .ctx = { @@ -74,6 +75,7 @@ dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type *size = ret; } + *buf_size = ctx_kern.ctx.allocated_size; return ctx_kern.data; } EXPORT_SYMBOL_GPL(dispatch_hid_bpf_device_event); @@ -505,7 +507,7 @@ __hid_bpf_input_report(struct hid_bpf_ctx *ctx, enum hid_report_type type, u8 *b if (ret) return ret; - return hid_ops->hid_input_report(ctx->hid, type, buf, size, 0, (u64)(long)ctx, true, + return hid_ops->hid_input_report(ctx->hid, type, buf, size, size, 0, (u64)(long)ctx, true, lock_already_taken); } diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 61afec5915ec..a806820df7e5 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2033,24 +2033,32 @@ int __hid_request(struct hid_device *hid, struct hid_report *report, } EXPORT_SYMBOL_GPL(__hid_request); -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt) +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; + size_t bsize = bufsize; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) - goto out; + return 0; + + if (unlikely(bsize < csize)) { + hid_warn_ratelimited(hid, "Event data for report %d is incorrect (%d vs %ld)\n", + report->id, csize, bsize); + return -EINVAL; + } if (report_enum->numbered) { cdata++; csize--; + bsize--; } rsize = hid_compute_report_size(report); @@ -2063,11 +2071,16 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * else if (rsize > max_buffer_size) rsize = max_buffer_size; + if (bsize < rsize) { + hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %ld)\n", + report->id, rsize, bsize); + return -EINVAL; + } + if (csize < rsize) { - hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %d)\n", - report->id, rsize, csize); - ret = -EINVAL; - goto out; + dbg_hid("report %d is too short, (%d < %d)\n", report->id, + csize, rsize); + memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) @@ -2075,7 +2088,7 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) - goto out; + return ret; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { @@ -2087,15 +2100,15 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); -out: + return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken) + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; @@ -2120,7 +2133,8 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, report_enum = hid->report_enum + type; hdrv = hid->driver; - data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); + data = dispatch_hid_bpf_device_event(hid, type, data, &bufsize, &size, interrupt, + source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; @@ -2149,7 +2163,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, goto unlock; } - ret = hid_report_raw_event(hid, type, data, size, interrupt); + ret = hid_report_raw_event(hid, type, data, bufsize, size, interrupt); unlock: if (!lock_already_taken) @@ -2171,7 +2185,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { - return __hid_input_report(hid, type, data, size, interrupt, 0, + return __hid_input_report(hid, type, data, size, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c index 699186ff2349..d2a56bf92b41 100644 --- a/drivers/hid/hid-gfrm.c +++ b/drivers/hid/hid-gfrm.c @@ -66,7 +66,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, switch (data[1]) { case GFRM100_SEARCH_KEY_DOWN: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_dn, - sizeof(search_key_dn), 1); + sizeof(search_key_dn), sizeof(search_key_dn), 1); break; case GFRM100_SEARCH_KEY_AUDIO_DATA: @@ -74,7 +74,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, case GFRM100_SEARCH_KEY_UP: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_up, - sizeof(search_key_up), 1); + sizeof(search_key_up), sizeof(search_key_up), 1); break; default: diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b1330d23bd2d..b3ff9265377b 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3673,7 +3673,7 @@ static int hidpp10_consumer_keys_raw_event(struct hidpp_device *hidpp, memcpy(&consumer_report[1], &data[3], 4); /* We are called from atomic context */ hid_report_raw_event(hidpp->hid_dev, HID_INPUT_REPORT, - consumer_report, 5, 1); + consumer_report, sizeof(consumer_report), 5, 1); return 1; } diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index e82a3c4e5b44..eeab0b6e32cc 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -533,7 +533,7 @@ static void mt_get_feature(struct hid_device *hdev, struct hid_report *report) } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, buf, - size, 0); + size, size, 0); if (ret) dev_warn(&hdev->dev, "failed to report feature\n"); } diff --git a/drivers/hid/hid-primax.c b/drivers/hid/hid-primax.c index e44d79dff8de..8db054280afb 100644 --- a/drivers/hid/hid-primax.c +++ b/drivers/hid/hid-primax.c @@ -44,7 +44,7 @@ static int px_raw_event(struct hid_device *hid, struct hid_report *report, data[0] |= (1 << (data[idx] - 0xE0)); data[idx] = 0; } - hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, 0); + hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, size, 0); return 1; default: /* unknown report */ diff --git a/drivers/hid/hid-vivaldi-common.c b/drivers/hid/hid-vivaldi-common.c index bf734055d4b6..b12bb5cc091a 100644 --- a/drivers/hid/hid-vivaldi-common.c +++ b/drivers/hid/hid-vivaldi-common.c @@ -85,7 +85,7 @@ void vivaldi_feature_mapping(struct hid_device *hdev, } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, report_data, - report_len, 0); + report_len, report_len, 0); if (ret) { dev_warn(&hdev->dev, "failed to report feature %d\n", field->report->id); diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 0d1c6d90fe21..a32320b351e3 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -90,7 +90,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, kfree(buf); continue; } - err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false); + err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, size, false); if (err) { hid_warn(hdev, "%s: unable to flush event due to error %d\n", __func__, err); @@ -334,7 +334,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n && features->type == HID_GENERIC) { ret = hid_report_raw_event(hdev, - HID_FEATURE_REPORT, data, n, 0); + HID_FEATURE_REPORT, data, n, n, 0); } else if (ret == 2 && features->type != HID_GENERIC) { features->touch_max = data[1]; } else { @@ -395,7 +395,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n) { ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, - data, n, 0); + data, n, n, 0); } else { hid_warn(hdev, "%s: could not retrieve sensor offsets\n", __func__); diff --git a/drivers/staging/greybus/hid.c b/drivers/staging/greybus/hid.c index 1f58c907c036..f1f9f6fbc00e 100644 --- a/drivers/staging/greybus/hid.c +++ b/drivers/staging/greybus/hid.c @@ -201,7 +201,7 @@ static void gb_hid_init_report(struct gb_hid *ghid, struct hid_report *report) * we just need to setup the input fields, so using * hid_report_raw_event is safe. */ - hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, size, 1); + hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, ghid->bufsize, size, 1); } static void gb_hid_init_reports(struct gb_hid *ghid) diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e89..ac432a2ef415 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1298,8 +1298,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c..19fffa4574a4 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, From 206342541fc887ae919774a43942dc883161fece Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:23 +0200 Subject: [PATCH 260/377] HID: core: introduce hid_safe_input_report() hid_input_report() is used in too many places to have a commit that doesn't cross subsystem borders. Instead of changing the API, introduce a new one when things matters in the transport layers: - usbhid - i2chid This effectively revert to the old behavior for those two transport layers. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 25 +++++++++++++++++++++++++ drivers/hid/i2c-hid/i2c-hid-core.c | 7 ++++--- drivers/hid/usbhid/hid-core.c | 11 ++++++----- include/linux/hid.h | 2 ++ 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index a806820df7e5..b3596851c719 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2181,6 +2181,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. + * Legacy, please use hid_safe_input_report() instead. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) @@ -2191,6 +2192,30 @@ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data } EXPORT_SYMBOL_GPL(hid_input_report); +/** + * hid_safe_input_report - report data from lower layer (usb, bt...) + * + * @hid: hid device + * @type: HID report type (HID_*_REPORT) + * @data: report contents + * @bufsize: allocated size of the data buffer + * @size: useful size of data parameter + * @interrupt: distinguish between interrupt and control transfers + * + * This is data entry for lower layers. + * Please use this function instead of the non safe version because we provide + * here the size of the buffer, allowing hid-core to make smarter decisions + * regarding the incoming buffer. + */ +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) +{ + return __hid_input_report(hid, type, data, bufsize, size, interrupt, 0, + false, /* from_bpf */ + false /* lock_already_taken */); +} +EXPORT_SYMBOL_GPL(hid_safe_input_report); + bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 5a183af3d5c6..e0a302544cef 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -574,9 +574,10 @@ static void i2c_hid_get_input(struct i2c_hid *ihid) if (ihid->hid->group != HID_GROUP_RMI) pm_wakeup_event(&ihid->client->dev, 0); - hid_input_report(ihid->hid, HID_INPUT_REPORT, - ihid->inbuf + sizeof(__le16), - ret_size - sizeof(__le16), 1); + hid_safe_input_report(ihid->hid, HID_INPUT_REPORT, + ihid->inbuf + sizeof(__le16), + ihid->bufsize - sizeof(__le16), + ret_size - sizeof(__le16), 1); } return; diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index fbbfc0f60829..5af93b9b1fb5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -283,9 +283,9 @@ static void hid_irq_in(struct urb *urb) break; usbhid_mark_busy(usbhid); if (!test_bit(HID_RESUME_RUNNING, &usbhid->iofl)) { - hid_input_report(urb->context, HID_INPUT_REPORT, - urb->transfer_buffer, - urb->actual_length, 1); + hid_safe_input_report(urb->context, HID_INPUT_REPORT, + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 1); /* * autosuspend refused while keys are pressed * because most keyboards don't wake up when @@ -482,9 +482,10 @@ static void hid_ctrl(struct urb *urb) switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) - hid_input_report(urb->context, + hid_safe_input_report(urb->context, usbhid->ctrl[usbhid->ctrltail].report->type, - urb->transfer_buffer, urb->actual_length, 0); + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 0); break; case -ESHUTDOWN: /* unplug */ unplug = 1; diff --git a/include/linux/hid.h b/include/linux/hid.h index ac432a2ef415..bfb9859f391e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); From a991aa5e89365ba1959fae6847fd288125b209e5 Mon Sep 17 00:00:00 2001 From: Xu Rao Date: Sat, 9 May 2026 16:21:32 +0800 Subject: [PATCH 261/377] HID: i2c-hid: add reset quirk for BLTP7853 touchpad The BLTP7853 I2C HID touchpad may fail to probe after reboot or reprobe because reset completion is not signalled to the host. The driver then waits for the reset-complete interrupt until it times out and the device probe fails: i2c_hid i2c-BLTP7853:00: failed to reset device. i2c_hid i2c-BLTP7853:00: can't add hid device: -61 i2c_hid: probe of i2c-BLTP7853:00 failed with error -61 Add I2C_HID_QUIRK_NO_IRQ_AFTER_RESET for the device so i2c-hid does not wait for a reset interrupt that may never arrive. Signed-off-by: Xu Rao Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/i2c-hid/i2c-hid-core.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 8cfec7dced66..4657d96fb083 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -277,6 +277,9 @@ #define USB_VENDOR_ID_BIGBEN 0x146b #define USB_DEVICE_ID_BIGBEN_PS3OFMINIPAD 0x0902 +#define I2C_VENDOR_ID_BLTP 0x36b6 +#define I2C_PRODUCT_ID_BLTP7853 0xc001 + #define USB_VENDOR_ID_BTC 0x046e #define USB_DEVICE_ID_BTC_EMPREX_REMOTE 0x5578 #define USB_DEVICE_ID_BTC_EMPREX_REMOTE_2 0x5577 diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index e0a302544cef..3adb16366e93 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -149,6 +149,8 @@ static const struct i2c_hid_quirks { I2C_HID_QUIRK_BOGUS_IRQ }, { I2C_VENDOR_ID_GOODIX, I2C_DEVICE_ID_GOODIX_0D42, I2C_HID_QUIRK_DELAY_WAKEUP_AFTER_RESUME }, + { I2C_VENDOR_ID_BLTP, I2C_PRODUCT_ID_BLTP7853, + I2C_HID_QUIRK_NO_IRQ_AFTER_RESET }, { 0, 0 } }; From 48d1677779ad6816978ad4a4f7588aec5ec960fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sun, 10 May 2026 14:23:52 +0200 Subject: [PATCH 262/377] HID: pidff: Fix integer overflow in pidff_rescale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rescaling values close to the max (U16_MAX) temporarily creates values that exceed the s32 range. This caused value overflow in case when, for example, a periodic effect phase was higer than 180 degrees. In turn, rescale function could return values outised of the logical range of the HID field. Fix by using 64 bit signed integer to store the value during calculation but still return only 32 bit integer. Closes: https://github.com/JacKeTUs/universal-pidff/issues/116 Fixes: 224ee88fe395 ("Input: add force feedback driver for PID devices") Cc: stable@vger.kernel.org Signed-off-by: Tomasz Pakuła Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-pidff.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c index aee8a4443305..c45f182d0448 100644 --- a/drivers/hid/usbhid/hid-pidff.c +++ b/drivers/hid/usbhid/hid-pidff.c @@ -11,6 +11,7 @@ #include "hid-pidff.h" #include #include +#include #include #include #include @@ -326,8 +327,10 @@ static s32 pidff_clamp(s32 i, struct hid_field *field) */ static int pidff_rescale(int i, int max, struct hid_field *field) { - return i * (field->logical_maximum - field->logical_minimum) / max + - field->logical_minimum; + /* 64 bits needed for big values during rescale */ + s64 result = field->logical_maximum - field->logical_minimum; + + return div_s64(result * i, max) + field->logical_minimum; } /* From 64ffa2e5e02ff54b23221d0282155f37283fabea Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Tue, 12 May 2026 13:22:44 +0000 Subject: [PATCH 263/377] HID: logitech-hidpp: Add support for newer Bluetooth keyboards Add product IDs (PIDs) for several newer Logitech Bluetooth keyboards to the hidpp_devices matching table, enabling full HID++ support for them. The added keyboards are: - Logitech Signature K650 & B2B - Logitech Pebble Keys 2 K380S - Logitech Casa Pop-Up Desk & B2B - Logitech Wave Keys & B2B - Logitech Signature Slim K950 & B2B - Logitech MX Keys S & B2B - Logitech Keys-To-Go 2 - Logitech Pop Icon Keys - Logitech MX Keys Mini & B2B - Logitech Signature Slim Solar+ K980 B2B - Logitech Bluetooth Keyboard K250/K251 - Logitech Signature Comfort K880 & B2B Signed-off-by: Alain Michaud Reviewed-by: Olivier Gay Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b3ff9265377b..ccbf28869a96 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -4685,6 +4685,44 @@ static const struct hid_device_id hidpp_devices[] = { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb391) }, { /* MX Master 4 mouse over Bluetooth */ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb042) }, + { /* Logitech Signature K650 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb36f) }, + { /* Logitech Signature K650 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb370) }, + { /* Logitech Pebble Keys 2 K380S over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb377) }, + { /* Logitech Casa Pop-Up Desk over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb371) }, + { /* Logitech Casa Pop-Up Desk B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb374) }, + { /* Logitech Wave Keys over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb383) }, + { /* Logitech Wave Keys B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb384) }, + { /* Logitech Signature Slim K950 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb386) }, + { /* Logitech Signature Slim K950 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb388) }, + { /* Logitech MX Keys S over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb378) }, + { /* Logitech MX Keys S B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb380) }, + { /* Logitech Keys-To-Go 2 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb38c) }, + { /* Logitech Pop Icon Keys over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb38f) }, + { /* Logitech MX Keys Mini over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb369) }, + { /* Logitech MX Keys Mini B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb36e) }, + { /* Logitech Signature Slim Solar+ K980 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb394) }, + { /* Logitech Bluetooth Keyboard K250/K251 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb397) }, + { /* Logitech Signature Comfort K880 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb39c) }, + { /* Logitech Signature Comfort K880 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb39d) }, {} }; From aa16b2bc0f02709919e2435f531406531e5bcc69 Mon Sep 17 00:00:00 2001 From: Zack McKevitt Date: Thu, 30 Apr 2026 12:39:01 -0700 Subject: [PATCH 264/377] accel/qaic: Add overflow check to remap_pfn_range during mmap The call to remap_pfn_range in qaic_gem_object_mmap is susceptible to (re)mapping beyond the VMA if the BO is too large. This can cause use after free issues when munmap() unmaps only the VMA region and not the additional mappings. To prevent this, check the remaining size of the VMA before remapping and truncate the remapped length if sg->length is too large. Reported-by: Lukas Maar Fixes: ff13be830333 ("accel/qaic: Add datapath") Reviewed-by: Karol Wachowski Signed-off-by: Zack McKevitt Reviewed-by: Jeff Hugo [jhugo: fix braces from checkpatch --strict] Signed-off-by: Jeff Hugo Link: https://patch.msgid.link/20260430193858.1178641-1-zachary.mckevitt@oss.qualcomm.com --- drivers/accel/qaic/qaic_data.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 95300c2f7d8a..1e4c579d2725 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -606,8 +606,11 @@ static const struct vm_operations_struct drm_vm_ops = { static int qaic_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) { struct qaic_bo *bo = to_qaic_bo(obj); + unsigned long remap_start; unsigned long offset = 0; + unsigned long remap_end; struct scatterlist *sg; + unsigned long length; int ret = 0; if (drm_gem_is_imported(obj)) @@ -615,11 +618,27 @@ static int qaic_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struc for (sg = bo->sgt->sgl; sg; sg = sg_next(sg)) { if (sg_page(sg)) { + /* if sg is too large for the VMA, so truncate it to fit */ + if (check_add_overflow(vma->vm_start, offset, &remap_start)) + return -EINVAL; + if (check_add_overflow(remap_start, sg->length, &remap_end)) + return -EINVAL; + + if (remap_end > vma->vm_end) { + if (check_sub_overflow(vma->vm_end, remap_start, &length)) + return -EINVAL; + } else { + length = sg->length; + } + + if (length == 0) + goto out; + ret = remap_pfn_range(vma, vma->vm_start + offset, page_to_pfn(sg_page(sg)), - sg->length, vma->vm_page_prot); + length, vma->vm_page_prot); if (ret) goto out; - offset += sg->length; + offset += length; } } From b7cdd59de5ae8062d2cb0121c429a271eb70daec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 12 May 2026 18:25:17 +0200 Subject: [PATCH 265/377] ACPI: PAD: xen: Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_COMPANION() check against NULL to the Xen variant of the ACPI processor aggregator device (PAD) driver. Fixes: 112b2f978afe ("ACPI: PAD: xen: Convert to a platform driver") Signed-off-by: Rafael J. Wysocki Acked-by: Juergen Gross Link: https://patch.msgid.link/3427762.aeNJFYEL58@rafael.j.wysocki --- drivers/xen/xen-acpi-pad.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 75a39862c1df..5b98e0e93807 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -110,9 +110,13 @@ static void acpi_pad_notify(acpi_handle handle, u32 event, static int acpi_pad_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; acpi_status status; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); From aed3c3346765e4317bb2ec6ff872e1c952e128ab Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:53 +0200 Subject: [PATCH 266/377] Documentation: security-bugs: do not systematically Cc the security team With the increase of automated reports, the security team is dealing with way more messages than really needed. The reporting process works well with most teams so there is no need to systematically involve the security team in reports. Let's suggest to keep it for small lists of recipients and new reporters only. This should continue to cover the risk of lost messages while reducing the volume from prolific reporters. Cc: Greg KH Cc: Leon Romanovsky Reviewed-by: Leon Romanovsky Reviewed-by: Greg Kroah-Hartman Signed-off-by: Willy Tarreau Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-2-w@1wt.eu> --- Documentation/process/security-bugs.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 27b028e85861..6dc525858125 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -148,7 +148,15 @@ run additional tests. Reports where the reporter does not respond promptly or cannot effectively discuss their findings may be abandoned if the communication does not quickly improve. -The report must be sent to maintainers, with the security team in ``Cc:``. +The report must be sent to maintainers. If there are two or fewer +recipients in your message, you must also always Cc: the Linux kernel +security team who will ensure the message is delivered to the proper +people, and will be able to assist small maintainer teams with processes +they may not be familiar with. For larger teams, Cc: the Linux kernel +security team for your first few reports or when seeking specific help, +such as when resending a message which got no response within a week. +Once you have become comfortable with the process for a few reports, it is +no longer necessary to Cc: the security list when sending to large teams. The Linux kernel security team can be contacted by email at . This is a private list of security officers who will help verify the bug report and assist developers working on a fix. From a03ef333fbd6cd861c8457c3d055ee3643a9baad Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:54 +0200 Subject: [PATCH 267/377] Documentation: security-bugs: explain what is and is not a security bug The use of automated tools to find bugs in random locations of the kernel induces a raise of security reports even if most of them should just be reported as regular bugs. This patch is an attempt at drawing a line between what qualifies as a security bug and what does not, hoping to improve the situation and ease decision on the reporter's side. It defers the enumeration to a new file, threat-model.rst, that tries to enumerate various classes of issues that are and are not security bugs. This should permit to more easily update this file for various subsystem-specific rules without having to revisit the security bug reporting guide. Cc: Greg KH Cc: Leon Romanovsky Suggested-by: Leon Romanovsky Suggested-by: Greg KH Reviewed-by: Leon Romanovsky Reviewed-by: Shuah Khan Signed-off-by: Willy Tarreau Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-3-w@1wt.eu> --- Documentation/process/index.rst | 1 + Documentation/process/security-bugs.rst | 38 +++- Documentation/process/threat-model.rst | 236 ++++++++++++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 Documentation/process/threat-model.rst diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index dbd6ea16aca7..aa7c959a52b8 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -86,6 +86,7 @@ regressions and security problems. debugging/index handling-regressions security-bugs + threat-model cve embargoed-hardware-issues diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 6dc525858125..54260dbfc64d 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -66,6 +66,42 @@ In addition, the following information are highly desirable: the issue appear. It is useful to share them, as they can be helpful to keep end users protected during the time it takes them to apply the fix. +What qualifies as a security bug +-------------------------------- + +It is important that most bugs are handled publicly so as to involve the widest +possible audience and find the best solution. By nature, bugs that are handled +in closed discussions between a small set of participants are less likely to +produce the best possible fix (e.g., risk of missing valid use cases, limited +testing abilities). + +It turns out that the majority of the bugs reported via the security team are +just regular bugs that have been improperly qualified as security bugs due to +a lack of awareness of the Linux kernel's threat model, as described in +Documentation/process/threat-model.rst, and ought to have been sent through +the normal channels described in Documentation/admin-guide/reporting-issues.rst +instead. + +The security list exists for urgent bugs that grant an attacker a capability +they are not supposed to have on a correctly configured production system, and +can be easily exploited, representing an imminent threat to many users. Before +reporting, consider whether the issue actually crosses a trust boundary on such +a system. + +**If you resorted to AI assistance to identify a bug, you must treat it as +public**. While you may have valid reasons to believe it is not, the security +team's experience shows that bugs discovered this way systematically surface +simultaneously across multiple researchers, often on the same day. In this +case, do not publicly share a reproducer, as this could cause unintended harm; +just mention that one is available and maintainers might ask for it privately +if they need it. + +If you are unsure whether an issue qualifies, err on the side of reporting +privately: the security team would rather triage a borderline report than miss +a real vulnerability. Reporting ordinary bugs to the security list, however, +does not make them move faster and consumes triage capacity that other reports +need. + Identifying contacts -------------------- @@ -74,7 +110,7 @@ affected subsystem's maintainers and Cc: the Linux kernel security team. Do not send it to a public list at this stage, unless you have good reasons to consider the issue as being public or trivial to discover (e.g. result of a widely available automated vulnerability scanning tool that can be repeated by -anyone). +anyone, or use of AI-based tools). If you're sending a report for issues affecting multiple parts in the kernel, even if they're fairly similar issues, please send individual messages (think diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst new file mode 100644 index 000000000000..ecb432390e79 --- /dev/null +++ b/Documentation/process/threat-model.rst @@ -0,0 +1,236 @@ +.. _threatmodel: + +The Linux Kernel threat model +============================= + +There are a lot of assumptions regarding what the kernel does and does not +protect against. These assumptions tend to cause confusion for bug reports +(:doc:`security-related ones ` vs :doc:`non-security ones +<../admin-guide/reporting-issues>`), and can complicate security enforcement +when the responsibilities for some boundaries is not clear between the kernel, +distros, administrators and users. + +This document tries to clarify the responsibilities of the kernel in this +domain. + +The kernel's responsibilities +----------------------------- + +The kernel abstracts access to local hardware resources and to remote systems +in a way that allows multiple local users to get a fair share of the available +resources granted to them, and, when the underlying hardware permits, to assign +a level of confidentiality to their communications and to the data they are +processing or storing. + +The kernel assumes that the underlying hardware behaves according to its +specifications. This includes the integrity of the CPU's instruction set, the +transparency of the branch prediction unit and the cache units, the consistency +of the Memory Management Unit (MMU), the isolation of DMA-capable peripherals +(e.g., via IOMMU), state transitions in controllers, ranges of values read from +registers, the respect of documented hardware limitations, etc. + +When hardware fails to maintain its specified isolation (e.g., CPU bugs, +side-channels, hardware response to unexpected inputs), the kernel will usually +attempt to implement reasonable mitigations. These are best-effort measures +intended to reduce the attack surface or elevate the cost of an attack within +the limits of the hardware's facilities; they do not constitute a +kernel-provided safety guarantee. + +Users always perform their activities under the authority of an administrator +who is able to grant or deny various types of permissions that may affect how +users benefit from available resources, or the level of confidentiality of +their activities. Administrators may also delegate all or part of their own +permissions to some users, particularly via capabilities but not only. All this +is performed via configuration (sysctl, file-system permissions etc). + +The Linux Kernel applies a certain collection of default settings that match +its threat model. Distros have their own threat model and will come with their +own configuration presets, that the administrator may have to adjust to better +suit their expectations (relax or restrict). + +By default, the Linux Kernel guarantees the following protections when running +on common processors featuring privilege levels and memory management units: + +* **User-based isolation**: an unprivileged user may restrict access to their + own data from other unprivileged users running on the same system. This + includes: + + * stored data, via file system permissions + * in-memory data (pages are not accessible by default to other users) + * process activity (ptrace is not permitted to other users) + * inter-process communication (other users may not observe data exchanged via + UNIX domain sockets or other IPC mechanisms). + * network communications within the same or with other systems + +* **Capability-based protection**: + + * users not having the ``CAP_SYS_ADMIN`` capability may not alter the + kernel's configuration, memory nor state, change other users' view of the + file system layout, grant any user capabilities they do not have, nor + affect the system's availability (shutdown, reboot, panic, hang, or making + the system unresponsive via unbounded resource exhaustion). + * users not having the ``CAP_NET_ADMIN`` capability may not alter the network + configuration, intercept nor spoof network communications from other users + nor systems. + * users not having ``CAP_SYS_PTRACE`` may not observe other users' processes + activities. + +When ``CONFIG_USER_NS`` is set, the kernel also permits unprivileged users to +create their own user namespace in which they have all capabilities, but with a +number of restrictions (they may not perform actions that have impacts on the +initial user namespace, such as changing time, loading modules or mounting +block devices). Please refer to ``user_namespaces(7)`` for more details, the +possibilities of user namespaces are not covered in this document. + +The kernel also offers a lot of troubleshooting and debugging facilities, which +can constitute attack vectors when placed in wrong hands. While some of them +are designed to be accessible to regular local users with a low risk (e.g. +kernel logs via ``/proc/kmsg``), some would expose enough information to +represent a risk in most places and the decision to expose them is under the +administrator's responsibility (perf events, traces), and others are not +designed to be accessed by non-privileged users (e.g. debugfs). Access to these +facilities by a user who has been explicitly granted permission by an +administrator does not constitute a security breach. + +Bugs that permit to violate the principles above constitute security breaches. +However, bugs that permit one violation only once another one was already +achieved are only weaknesses. The kernel applies a number of self-protection +measures whose purpose is to avoid crossing a security boundary when certain +classes of bugs are found, but a failure of these extra protections do not +constitute a vulnerability alone. + +What does not constitute a security bug +--------------------------------------- + +In the Linux kernel's threat model, the following classes of problems are +**NOT** considered as Linux Kernel security bugs. However, when it is believed +that the kernel could do better, they should be reported, so that they can be +reviewed and fixed where reasonably possible, but they will be handled as any +regular bug: + +* **Configuration**: + + * outdated kernels and particularly end-of-life branches are out of the scope + of the kernel's threat model: administrators are responsible for keeping + their system up to date. For a bug to qualify as a security bug, it must be + demonstrated that it affects actively maintained versions. + + * build-level: changes to the kernel configuration that are explicitly + documented as lowering the security level (e.g. ``CONFIG_NOMMU``), or + targeted at developers only. + + * OS-level: changes to command line parameters, sysctls, filesystem + permissions, user capabilities, exposure of privileged interfaces, that + explicitly increase exposure by either offering non-default access to + unprivileged users, or reduce the kernel's ability to enforce some + protections or mitigations. Example: write access to procfs or debugfs. + + * issues triggered only when using features intended for development or + debugging (e.g., LOCKDEP, KASAN, FAULT_INJECTION): these features are known + to introduce overhead and potential instability and are not intended for + production use. + + * issues affecting drivers exposed under CONFIG_STAGING, as well as features + marked EXPERIMENTAL in the configuration. + + * loading of explicitly insecure/broken/staging modules, and generally any + using any subsystem marked as experimental or not intended for production + use. + + * running out-of-tree modules or unofficial kernel forks; these should be + reported to the relevant vendor. + +* **Excess of initial privileges**: + + * actions performed by a user already possessing the privileges required to + perform that action or modify that state (e.g. ``CAP_SYS_ADMIN``, + ``CAP_NET_ADMIN``, ``CAP_SYS_RAWIO``, ``CAP_SYS_MODULE`` with no further + boundary being crossed). + + * actions performed in user namespace that do not bypass the restrictions + imposed to the initial user (e.g. ptrace usage, signal delivery, resource + usage, access to FS/device/sysctl/memory, network binding, system/network + configuration etc). + + * anything performed by the root user in the initial namespace (e.g. kernel + oops when writing to a privileged device). + +* **Out of production use**: + + This covers theoretical/probabilistic attacks that rely on laboratory + conditions with zero system noise, or those requiring an unrealistic number + of attempts (e.g., billions of trials) that would be detected by standard + system monitoring long before success, such as: + + * prediction of random numbers that only works in a totally silent + environment (such as IP ID, TCP ports or sequence numbers that can only be + guessed in a lab). + + * activity observation and information leaks based on probabilistic + approaches that are prone to measurement noise and not realistically + reproducible on a production system. + + * issues that can only be triggered by heavy attacks (e.g. brute force) whose + impact on the system makes it unlikely or impossible to remain undetected + before they succeed (e.g. consuming all memory before succeeding). + + * problems seen only under development simulators, emulators, or combinations + that do not exist on real systems at the time of reporting (issues + involving tens of millions of threads, tens of thousands of CPUs, + unrealistic CPU frequencies, RAM sizes or disk capacities, network speeds. + + * issues whose reproduction requires hardware modification or emulation, + including fake USB devices that pretend to be another one. + + * as well as issues that can be triggered at a cost that is orders of + magnitude higher than the expected benefits (e.g. fully functional keyboard + emulator only to retrieve 7 uninitialized bytes in a structure, or + brute-force method involving millions of connection attempts to guess a + port number). + +* **Hardening failures**: + + * ability to bypass some of the kernel's hardening measures with no + demonstrable exploit path (e.g. ASLR bypass, events timing or probing with + no demonstrable consequence). These are just weaknesses, not + vulnerabilities. + + * missing argument checks and failure to report certain errors with no + immediate consequence. + +* **Random information leaks**: + + This concerns information leaks of small data parts that happen to be there + and that cannot be chosen by the attacker, or face access restrictions: + + * structure padding reported by syscalls or other interfaces. + + * identifiers, partial data, non-terminated strings reported in error + messages. + + * Leaks of kernel memory addresses/pointers do not constitute an immediately + exploitable vector and are not security bugs, though they must be reported + and fixed. + +* **Crafted file system images**: + + * bugs triggered by mounting a corrupted or maliciously crafted file system + image are generally not security bugs, as the kernel assumes the underlying + storage media is under the administrator's control, unless the filesystem + driver is specifically documented as being hardened against untrusted media. + + * issues that are resolved, mitigated, or detected by running a filesystem + consistency check (fsck) on the image prior to mounting. + +* **Physical access**: + + Issues that require physical access to the machine, hardware modification, or + the use of specialized hardware (e.g., logic analyzers, DMA-attack tools over + PCI-E/Thunderbolt) are out of scope unless the system is explicitly + configured with technologies meant to defend against such attacks + (e.g. IOMMU). + +* **Functional and performance regressions**: + + Any issue that can be mitigated by setting proper permissions and limits + doesn't qualify as a security bug. From 4bf85afb9f3ecd7c3b5d15a85b0902f8e725cd06 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:55 +0200 Subject: [PATCH 268/377] Documentation: security-bugs: clarify requirements for AI-assisted reports AI tools are increasingly used to assist in bug discovery. While these tools can identify valid issues, reports that are submitted without manual verification often lack context, contain speculative impact assessments, or include unnecessary formatting. Such reports increase triage effort, waste maintainers' time and may be ignored. Reports where the reporter has verified the issue and the proposed fix typically meet quality standards. This documentation outlines specific requirements for length, formatting, and impact evaluation to reduce the effort needed to deal with these reports. Cc: Greg KH Acked-by: Greg Kroah-Hartman Reviewed-by: Leon Romanovsky Signed-off-by: Willy Tarreau Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-4-w@1wt.eu> --- Documentation/process/security-bugs.rst | 57 +++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 54260dbfc64d..f85c65f31f12 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -167,6 +167,63 @@ the Linux kernel security team only. Your message will be triaged, and you will receive instructions about whom to contact, if needed. Your message may equally be forwarded as-is to the relevant maintainers. +Responsible use of AI to find bugs +---------------------------------- + +A significant fraction of bug reports submitted to the security team are +actually the result of code reviews assisted by AI tools. While this can be an +efficient means to find bugs in rarely explored areas, it causes an overload on +maintainers, who are sometimes forced to ignore such reports due to their poor +quality or accuracy. As such, reporters must be particularly cautious about a +number of points which tend to make these reports needlessly difficult to +handle: + + * **Length**: AI-generated reports tend to be excessively long, containing + multiple sections and excessive detail. This makes it difficult to spot + important information such as affected files, versions, and impact. Please + ensure that a clear summary of the problem and all critical details are + presented first. Do not require triage engineers to scan multiple pages of + text. Configure your tools to produce concise, human-style reports. + + * **Formatting**: Most AI-generated reports are littered with Markdown tags. + These decorations complicate the search for important information and do + not survive the quoting processes involved in forwarding or replying. + Please **always convert your report to plain text** without any formatting + decorations before sending it. + + * **Impact Evaluation**: Many AI-generated reports lack an understanding of + the kernel's threat model and go to great lengths inventing theoretical + consequences. This adds noise and complicates triage. Please stick to + verifiable facts (e.g., "this bug permits any user to gain CAP_NET_ADMIN") + without enumerating speculative implications. Have your tool read this + documentation as part of the evaluation process. + + * **Reproducer**: AI-based tools are often capable of generating reproducers. + Please always ensure your tool provides one and **test it thoroughly**. If + the reproducer does not work, or if the tool cannot produce one, the + validity of the report should be seriously questioned. Note that since the + report will be posted to a public list, the reproducer should only be + shared upon maintainers' request. + + * **Propose a Fix**: Many AI tools are actually better at writing code than + evaluating it. Please ask your tool to propose a fix and **test it** before + reporting the problem. If the fix cannot be tested because it relies on + rare hardware or almost extinct network protocols, the issue is likely not + a security bug. In any case, if a fix is proposed, it must adhere to + Documentation/process/submitting-patches.rst and include a 'Fixes:' tag + designating the commit that introduced the bug. + +Failure to consider these points exposes your report to the risk of being +ignored. + +Use common sense when evaluating the report. If the affected file has not been +touched for more than one year and is maintained by a single individual, it is +likely that usage has declined and exposed users are virtually non-existent +(e.g., drivers for very old hardware, obsolete filesystems). In such cases, +there is no need to consume a maintainer's time with an unimportant report. If +the issue is clearly trivial and publicly discoverable, you should report it +directly to the public mailing lists. + Sending the report ------------------ From 793d2a057f7e7bc647c5401413f7bf7d4f08b969 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 May 2026 21:54:51 +0200 Subject: [PATCH 269/377] hwmon: (acpi_power_meter) Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_COMPANION() check against NULL to the acpi_power_meter hwmon driver. Fixes: afc6c4aedea5 ("hwmon: (acpi_power_meter) Convert ACPI driver to a platform one") Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/5068745.GXAFRqVoOG@rafael.j.wysocki Signed-off-by: Guenter Roeck --- drivers/hwmon/acpi_power_meter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index be7f702dcde9..0c9b9f4180fb 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -884,10 +884,14 @@ static void acpi_power_meter_notify(acpi_handle handle, u32 event, void *data) static int acpi_power_meter_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); struct acpi_power_meter_resource *resource; + struct acpi_device *device; int res; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + resource = kzalloc_obj(*resource); if (!resource) return -ENOMEM; From f06035ab324e52a845079c2e5f2380fa3cebde9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 May 2026 21:56:14 +0200 Subject: [PATCH 270/377] hwmon: (asus_atk0110) Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_HANDLE() check against NULL to the asus_atk0110 hwmon driver. Fixes: ee1752590733 ("hwmon: (asus_atk0110) Convert ACPI driver to a platform one") Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/2261594.irdbgypaU6@rafael.j.wysocki Signed-off-by: Guenter Roeck --- drivers/hwmon/asus_atk0110.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c index 5688ff5f7c28..109318b0434d 100644 --- a/drivers/hwmon/asus_atk0110.c +++ b/drivers/hwmon/asus_atk0110.c @@ -1273,15 +1273,20 @@ static int atk_probe(struct platform_device *pdev) struct acpi_buffer buf; union acpi_object *obj; struct atk_data *data; + acpi_handle handle; dev_dbg(&pdev->dev, "adding...\n"); + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->dev = &pdev->dev; - data->atk_handle = ACPI_HANDLE(&pdev->dev); + data->atk_handle = handle; INIT_LIST_HEAD(&data->sensor_list); data->disable_ec = false; From d289478cfc0bcf81c7914200d6abdcb78bd04ded Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 12 May 2026 09:29:30 +0200 Subject: [PATCH 271/377] libceph: handle rbtree insertion error in decode_choose_args() A message of type CEPH_MSG_OSD_MAP contains an OSD map that itself contains a CRUSH map. The received CRUSH map may optionally contain choose_args that get decoded in decode_choose_args(). In this function, num_choose_arg_maps is read from the message, and a corresponding number of crush_choose_arg_maps gets decoded afterwards. Each crush_choose_arg_map has a choose_args_index, which serves as the key when inserting it into the choose_args rbtree of the decoded crush_map. If a (potentially corrupted) message contains two crush_choose_arg_maps with the same index, the assertion in insert_choose_arg_map() triggers a kernel BUG when trying to insert the second crush_choose_arg_map. This patch fixes the issue by switching to the non-asserting rbtree insertion function and rejecting the message if the insertion fails. [ idryomov: changelog ] Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2095e73ccf6c..0752a3808b91 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -392,7 +392,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto e_inval; } - insert_choose_arg_map(&c->choose_args, arg_map); + if (!__insert_choose_arg_map(&c->choose_args, arg_map)) { + ret = -EEXIST; + goto fail; + } } return 0; From 28b0a2ab8c82d0bbdeb8013029c67c978ce6e4bf Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 12 May 2026 18:16:40 +0200 Subject: [PATCH 272/377] libceph: Fix potential null-ptr-deref in decode_choose_args() A message of type CEPH_MSG_OSD_MAP contains an OSD map that itself contains a CRUSH map. When decoding this CRUSH map in crush_decode(), an array of max_buckets CRUSH buckets is decoded, where some indices may not refer to actual buckets and are therefore set to NULL. The received CRUSH map may optionally contain choose_args that get decoded in decode_choose_args(). When decoding a crush_choose_arg_map, a series of choose_args for different buckets is decoded, with the bucket_index being read from the incoming message. It is only checked that the bucket index does not exceed max_buckets, but not that it doesn't point to an index with a NULL bucket. If a (potentially corrupted) message contains a crush_choose_arg_map including such a bucket_index, a null pointer dereference may occur in the subsequent processing when attempting to access the bucket with the given index. This patch fixes the issue by extending the affected check. Now, it is only attempted to access the bucket if it is not NULL. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0752a3808b91..8b5b0587a0cf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -388,7 +388,8 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto fail; if (arg->ids_size && - arg->ids_size != c->buckets[bucket_index]->size) + (!c->buckets[bucket_index] || + arg->ids_size != c->buckets[bucket_index]->size)) goto e_inval; } From e4a640475e43f406fdfd56d370b1f34b0cbbc18d Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Tue, 12 May 2026 14:28:33 +0100 Subject: [PATCH 273/377] audit: fix incorrect inheritable capability in CAPSET records __audit_log_capset() records the effective capability set into the inheritable field due to a copy-paste error. Every CAPSET audit record therefore reports cap_pi (process inheritable) with the value of cap_effective instead of cap_inheritable. This silently corrupts audit data used for compliance and forensic analysis: an attacker who modifies inheritable capabilities to prepare for a privilege-escalating exec would have the change masked in the audit trail. The bug has been present since the original introduction of CAPSET audit records in 2008. Cc: stable@vger.kernel.org Fixes: e68b75a027bb ("When the capset syscall is used it is not possible for audit to record the actual capbilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets.") Reviewed-by: Ricardo Robaina Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Sergio Correia Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ab54fccba215..abdf8da3be93 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; - context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.inheritable = new->cap_inheritable; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; From f9e1c1324b4d98d591a6f7568fdebf5cf456dfc2 Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Tue, 12 May 2026 14:28:59 +0100 Subject: [PATCH 274/377] audit: enforce AUDIT_LOCKED for AUDIT_TRIM and AUDIT_MAKE_EQUIV AUDIT_ADD_RULE and AUDIT_DEL_RULE correctly check for AUDIT_LOCKED and return -EPERM, but AUDIT_TRIM and AUDIT_MAKE_EQUIV do not. This allows a process with CAP_AUDIT_CONTROL to modify directory tree watches and equivalence mappings even when the audit configuration has been locked, undermining the purpose of the lock. Add AUDIT_LOCKED checks to both commands. Cc: stable@vger.kernel.org Reviewed-by: Ricardo Robaina Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Sergio Correia Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/audit.c b/kernel/audit.c index e1d489bc2dff..34dc7cb246ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); @@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, size_t msglen = data_len; char *old, *new; + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; From 577a8d3bae0531f0e5ccfac919cd8192f920a804 Mon Sep 17 00:00:00 2001 From: Aaron Sacks Date: Tue, 12 May 2026 02:07:42 -0400 Subject: [PATCH 275/377] KVM: Reject wrapped offset in kvm_reset_dirty_gfn() kvm_reset_dirty_gfn() guards the gfn range with if (!memslot || (offset + __fls(mask)) >= memslot->npages) return; but offset is u64 and the addition is unchecked. The check can be silently bypassed by a u64 wrap. The dirty ring backing those entries is MAP_SHARED at KVM_DIRTY_LOG_PAGE_OFFSET of the vcpu fd, so the VMM can rewrite the slot and offset fields of any entry between when the kernel pushes them and when KVM_RESET_DIRTY_RINGS consumes them. On reset, kvm_dirty_ring_reset() re-reads the values via READ_ONCE() and feeds them straight back into this check; only the flags handshake is treated as the handover, the slot/offset payload is taken on trust. Crafting two entries entry[i].offset = 0xffffffffffffffc1 entry[i+1].offset = 0 makes the coalescing loop in kvm_dirty_ring_reset() compute delta = (s64)(0 - 0xffffffffffffffc1) = 63 which falls in [0, BITS_PER_LONG), so it folds entry[i+1] into the existing mask by setting bit 63. The trailing kvm_reset_dirty_gfn() call then sees offset = 0xffffffffffffffc1 and __fls(mask) = 63; the sum is 0 in u64 and the bounds check passes. That offset propagates into kvm_arch_mmu_enable_log_dirty_pt_masked() unchanged. On the legacy MMU path -- kvm_memslots_have_rmaps() == true, i.e. shadow paging, any VM that has allocated shadow roots, or a write-tracked slot -- it reaches gfn_to_rmap(), which indexes slot->arch.rmap[0][] with a near-U64_MAX gfn. That is an out-of-bounds load of a kvm_rmap_head, followed by a conditional clear of PT_WRITABLE_MASK in whatever the loaded pointer points at. The path is reachable from any process holding /dev/kvm. Range-check offset on its own first, so the addition cannot wrap. memslot->npages is bounded well below U64_MAX, so once offset < npages holds, offset + __fls(mask) (with __fls(mask) < BITS_PER_LONG) stays in range. Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking") Cc: stable@vger.kernel.org Signed-off-by: Aaron Sacks Link: https://patch.msgid.link/20260512060742.1628959-1-contact@xchglabs.com/ Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 02bc6b00d76c..572b854edf74 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -63,7 +63,8 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); - if (!memslot || (offset + __fls(mask)) >= memslot->npages) + if (!memslot || offset >= memslot->npages || + offset + __fls(mask) >= memslot->npages) return; KVM_MMU_LOCK(kvm); From 2b72f1674e427c56e3772c5ccf785fdda2138820 Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Tue, 12 May 2026 09:53:13 +0800 Subject: [PATCH 276/377] KVM: x86: Fix Xen hypercall tracepoint argument assignment TRACE_EVENT(kvm_xen_hypercall) stores a5 in __entry->a4 instead of __entry->a5. That overwrites the recorded a4 argument and leaves a5 unset in the trace entry. Fix the typo so both arguments are captured correctly. Signed-off-by: Qiang Ma Link: https://patch.msgid.link/20260512015313.1685784-1-maqianga@uniontech.com/ Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e7fdbe9efc90..0db25bba17f6 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, __entry->a2 = a2; __entry->a3 = a3; __entry->a4 = a4; - __entry->a4 = a5; + __entry->a5 = a5; ), TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", From 5bd1ddb7911ba7e94b61cf429970963f1b22dd76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 May 2026 14:33:21 -0700 Subject: [PATCH 277/377] KVM: nSVM: Never use L0's PAUSE loop exiting while L2 is running Never use L0's (KVM's) PAUSE loop exiting controls while L2 is running, and instead always configure vmcb02 according to L1's exact capabilities and desires. The purpose of intercepting PAUSE after N attempts is to detect when the vCPU may be stuck waiting on a lock, so that KVM can schedule in a different vCPU that may be holding said lock. Barring a very interesting setup, L1 and L2 do not share locks, and it's extremely unlikely that an L1 vCPU would hold a spinlock while running L2. I.e. having a vCPU executing in L1 yield to a vCPU running in L2 will not allow the L1 vCPU to make forward progress, and vice versa. While teaching KVM's "on spin" logic to only yield to other vCPUs in L2 is doable, in all likelihood it would do more harm than good for most setups. KVM has limited visibility into which L2 "vCPUs" belong to the same VM, and thus share a locking domain. And even if L2 vCPUs are in the same VM, KVM has no visilibity into L2 vCPU's that are scheduled out by the L1 hypervisor. Furthermore, KVM doesn't actually steal PAUSE exits from L1. If L1 is intercepting PAUSE, KVM will route PAUSE exits to L1, not L0, as nested_svm_intercept() gives priority to the vmcb12 intercept. As such, overriding the count/threshold fields in vmcb02 with vmcb01's values is nonsensical, as doing so clobbers all the training/learning that has been done in L1. Even worse, if L1 is not intercepting PAUSE, i.e. KVM is handling PAUSE exits, then KVM will adjust the PLE knobs based on L2 behavior, which could very well be detrimental to L1, e.g. due to essentially poisoning L1 PLE training with bad data. And copying the count from vmcb02 to vmcb01 on a nested VM-Exit makes even less sense, because again, the purpose of PLE is to detect spinning vCPUs. Whether or not a vCPU is spinning in L2 at the time of a nested VM-Exit has no relevance as to the behavior of the vCPU when it executes in L1. The only scenarios where any of this actually works is if at least one of KVM or L1 is NOT intercepting PAUSE for the guest. Per the original changelog, those were the only scenarios considered to be supported. Disabling KVM's use of PLE makes it so the VM is always in a "supported" mode. Last, but certainly not least, using KVM's count/threshold instead of the values provided by L1 is a blatant violation of the SVM architecture. Fixes: 74fd41ed16fd ("KVM: x86: nSVM: support PAUSE filtering when L0 doesn't intercept PAUSE") Cc: Maxim Levitsky Tested-by: David Kaplan Signed-off-by: Sean Christopherson Link: https://patch.msgid.link/20260508213321.373309-1-seanjc@google.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 43 +++++++++++++-------------------------- arch/x86/kvm/svm/svm.c | 15 ++++++++++++-- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 961804df5f45..b340dc9991ad 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -160,6 +160,16 @@ void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI); + /* + * Intercept PAUSE if and only if L1 wants to. KVM intercepts PAUSE so + * that a vCPU that may be spinning waiting for a lock can be scheduled + * out in favor of the vCPU that holds said lock. KVM doesn't support + * yielding across L2 vCPUs, as KVM has limited visilibity into which + * L2 vCPUs are in the same L2 VM, i.e. may be contending for locks. + */ + if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_PAUSE); + if (nested_vmcb_needs_vls_intercept(svm)) { /* * If the virtual VMLOAD/VMSAVE is not enabled for the L2, @@ -819,7 +829,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; - u32 pause_count12, pause_thresh12; nested_svm_transition_tlb_flush(vcpu); @@ -947,31 +956,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) - pause_count12 = vmcb12_ctrl->pause_filter_count; + vmcb02->control.pause_filter_count = vmcb12_ctrl->pause_filter_count; else - pause_count12 = 0; + vmcb02->control.pause_filter_count = 0; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) - pause_thresh12 = vmcb12_ctrl->pause_filter_thresh; + vmcb02->control.pause_filter_thresh = vmcb12_ctrl->pause_filter_thresh; else - pause_thresh12 = 0; - if (kvm_pause_in_guest(svm->vcpu.kvm)) { - /* use guest values since host doesn't intercept PAUSE */ - vmcb02->control.pause_filter_count = pause_count12; - vmcb02->control.pause_filter_thresh = pause_thresh12; - - } else { - /* start from host values otherwise */ - vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; - vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; - - /* ... but ensure filtering is disabled if so requested. */ - if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) { - if (!pause_count12) - vmcb02->control.pause_filter_count = 0; - if (!pause_thresh12) - vmcb02->control.pause_filter_thresh = 0; - } - } + vmcb02->control.pause_filter_thresh = 0; /* * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to @@ -1298,12 +1289,6 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* in case we halted in L2 */ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); - if (!kvm_pause_in_guest(vcpu->kvm)) { - vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; - vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); - - } - /* * Invalidate last_bus_lock_rip unless KVM is still waiting for the * guest to make forward progress before re-enabling bus lock detection. diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e7fdd7a9c280..e02a38da5296 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -913,7 +913,15 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. */ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + /* + * While running L2, KVM should intercept PAUSE if and only if L1 wants + * to intercept PAUSE, and L1's intercept should take priority, i.e. + * KVM should never handle a PAUSE intercept from L2. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return; control->pause_filter_count = __grow_ple_window(old, @@ -934,7 +942,10 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. */ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + if (is_guest_mode(vcpu)) return; control->pause_filter_count = From 80f4a7b8ce7513c203562191426e4d4cc635b095 Mon Sep 17 00:00:00 2001 From: Ninad Naik Date: Mon, 11 May 2026 23:13:02 +0530 Subject: [PATCH 278/377] Documentation: kvm: update links in the references section of AMD Memory Encryption Replace non-working links in the reference section with the working ones. Signed-off-by: Ninad Naik Link: https://patch.msgid.link/20260511174302.811918-1-ninadnaik07@gmail.com/ Reviewed-by: Liam Merwick Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/x86/amd-memory-encryption.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index b2395dd4769d..bd04a908a8db 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -656,8 +656,8 @@ References See [white-paper]_, [api-spec]_, [amd-apm]_, [kvm-forum]_, and [snp-fw-abi]_ for more info. -.. [white-paper] https://developer.amd.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf -.. [api-spec] https://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf -.. [amd-apm] https://support.amd.com/TechDocs/24593.pdf (section 15.34) +.. [white-paper] https://docs.amd.com/v/u/en-US/memory-encryption-white-paper +.. [api-spec] https://docs.amd.com/v/u/en-US/55766_PUB_3.24_SEV_API +.. [amd-apm] https://docs.amd.com/v/u/en-US/24593_3.44_APM_Vol2 (section 15.34) .. [kvm-forum] https://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf -.. [snp-fw-abi] https://www.amd.com/system/files/TechDocs/56860.pdf +.. [snp-fw-abi] https://www.amd.com/content/dam/amd/en/documents/developer/56860.pdf From 87c810160ed738cd983e4a65ebe9709927c702c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 May 2026 08:56:34 -0700 Subject: [PATCH 279/377] KVM: selftests: Ensure gmem file sizes are multiple of host page size When creating a guest_memfd file and associated memslot to validate shared guest memory, size the file+memslot to the maximum of the host or guest page size. Attempting to allocate a single guest page will fail if the host page size is greater than the guest page size, as KVM requires that the size of memslots and guest_memfd files are a multiple of the host page size. For simplicity, verify the entire file can be shared between guest and host, e.g. instead of trying to validate "partial" mappings. Fixes: 42188667be38 ("KVM: selftests: Add guest_memfd testcase to fault-in on !mmap()'d memory") Reported-by: Zenghui Yu Closes: https://lore.kernel.org/all/0064952b-048c-455d-ad89-e27e5cb82591@linux.dev Signed-off-by: Sean Christopherson Message-ID: <20260512155634.772602-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/guest_memfd_test.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index d6528c6f5e03..253e748c1d4a 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -510,7 +510,12 @@ static void test_guest_memfd_guest(void) "Default VM type should support INIT_SHARED, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); - size = vm->page_size; + /* + * Use the max of the host or guest page size for all operations, as + * KVM requires guest_memfd files and memslots to be sized to multiples + * of the host page size. + */ + size = max_t(size_t, vm->page_size, page_size); fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); @@ -519,7 +524,7 @@ static void test_guest_memfd_guest(void) memset(mem, 0xaa, size); kvm_munmap(mem, size); - virt_pg_map(vm, gpa, gpa); + virt_map(vm, gpa, gpa, size / vm->page_size); vcpu_args_set(vcpu, 2, gpa, size); vcpu_run(vcpu); From 6b72d0578ca6f77b835d773d7c77c2f872d3e924 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 3 May 2026 23:09:17 +0200 Subject: [PATCH 280/377] KVM: x86: use again the flush argument of __link_shadow_page() Except in the case of parentless nested-TDP pages, mmu_page_zap_pte() clears the SPTE but leaves the invalid_list empty. In this case, using kvm_flush_remote_tlbs() as kvm_mmu_remote_flush_or_zap() does is overkill. Avoid flushing the entirety of the remote TLBs unless the invalid_list was populated: instead, use a more efficient gfn-targeting flush (if available) and skip it altogether if the caller guarantees that a TLB flush is not necessary. Based-on: <20260503201029.106481-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini Message-ID: <20260503210917.121840-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 892246204435..f0144ae8d891 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2526,6 +2526,23 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } +/* + * Note: while normally KVM uses a "bool flush" return value to let + * the caller batch flushes, __link_shadow_page() flushes immediately + * before populating the parent PTE with the new shadow page. The + * typical callers, direct_map() and FNAME(fetch)(), are not going + * to zap more than one huge SPTE anyway. + * + * The only exception, where @flush can be false, is when a huge SPTE + * is replaced with a shadow page SPTE with a fully populated page table, + * which can happen from shadow_mmu_split_huge_page(). In this case, + * no memory is unmapped across the change to the page tables and no + * immediate flush is needed for correctness. + * + * Even in that case, calls to kvm_mmu_commit_zap_page() are not + * batched. Doing so would require adding an invalid_list argument + * all the way down to __walk_slot_rmaps(). + */ static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) @@ -2541,8 +2558,10 @@ static void __link_shadow_page(struct kvm *kvm, parent_sp = sptep_to_sp(sptep); WARN_ON_ONCE(parent_sp->role.level == PG_LEVEL_4K); - mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list); - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, true); + if (mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list)) + kvm_mmu_commit_zap_page(kvm, &invalid_list); + else if (flush) + kvm_flush_remote_tlbs_sptep(kvm, sptep); } spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); From 3098c076c83ea2913245cb915cdcba98eb24214c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 May 2026 14:35:14 -0700 Subject: [PATCH 281/377] KVM: x86: Swap the dst and src operand for MOVNTDQA Swap the MOVNTDQA operands, as MOVNTDQA does NOT in fact have "the same characteristics as 0F E7 (MOVNTDQ)"; MOVNTDQA loads from memory and stores to registers, while MOVNTDQ loads from registers and stores to memory. Per the SDM: MOVNTDQ - Move packed integer values in xmm1 to m128 using non-temporal hint. MOVNTDQA - Move double quadword from m128 to xmm1 using non-temporal hint if WC memory type. Reported-by: Josh Eads Fixes: c57d9bafbd0b ("KVM: x86: Add support for emulating MOVNTDQA") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20260506213514.2781948-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c8c6cc0406d6..8013dccb3110 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4481,7 +4481,7 @@ static const struct opcode opcode_map_0f_38[256] = { X16(N), X16(N), /* 0x20 - 0x2f */ X8(N), - X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, + X2(N), GP(SrcMem | DstReg | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, /* 0x30 - 0x7f */ X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ From 39e25a2100604320e8d9df54c6c31258f7a3df29 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 May 2026 10:30:00 -1000 Subject: [PATCH 282/377] sched_ext: Drop NONE early return in scx_disable_and_exit_task() d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths") skipped the trailing scx_set_task_sched(p, NULL) on NONE tasks. After scx_fail_parent() parks a task at NONE/sched=parent and the parent is later freed via queue_rcu_work() during root_disable, the preserved p->scx.sched dangles - print_scx_info() from sched_show_task() reads sch->ops.name from freed memory. Drop the early return. __scx_disable_and_exit_task() already short- circuits on NONE and the SUB_INIT block was cleared by scx_fail_parent()'s earlier call, so clearing p->scx.sched is the only work left - and the one thing the path actually needs. v2: Extend the SUB_INIT block comment to note that the flag is only set on the sub-enable path, so it's always clear on the NONE re-entry (Andrea). Fixes: d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9354da79e162..68120f679178 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3703,22 +3703,14 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct * static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { - /* - * %NONE means @p is already detached at the SCX level (e.g. handed - * back to the parent by scx_fail_parent() with no init to undo). - * Skip to avoid clobbering scx_task_sched() and writing %NONE again - * on a state that's already %NONE. - */ - if (scx_get_task_state(p) == SCX_TASK_NONE) - return; - __scx_disable_and_exit_task(sch, p); /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and * its parent. Exit for the child too - scx_enable_task() never ran for - * it, so undo only init_task. + * it, so undo only init_task. The flag is only set on the sub-enable + * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. */ if (p->scx.flags & SCX_TASK_SUB_INIT) { if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) From b273b75b8d677aea06dd06d80b61b3bb06e94680 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 13:18:19 -1000 Subject: [PATCH 283/377] sched_ext: INIT_LIST_HEAD() &sch->all in scx_alloc_and_add_sched() On scx_link_sched() error paths (parent disabled, hash insert failure), &sch->all is never added to scx_sched_all. The cleanup path runs scx_unlink_sched() unconditionally, which calls list_del_rcu(&sch->all) on a list_head that was never initialized triggering a corruption warning. Initialize &sch->all. Fixes: 54be8de4236a ("sched_ext: Factor out scx_link_sched() and scx_unlink_sched()") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68120f679178..6d69ba29cfd7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6635,6 +6635,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; + INIT_LIST_HEAD(&sch->all); #ifdef CONFIG_EXT_SUB_SCHED char *buf = kzalloc(PATH_MAX, GFP_KERNEL); From cceb874eee46fe4b3d3c6c496f19125d9a3a9a8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 13:18:23 -1000 Subject: [PATCH 284/377] sched_ext: Defer sub_kset base put to scx_sched_free_rcu_work scx_sub_enable_workfn() pins parent->kobj before dropping scx_sched_lock, but that does not pin parent->sub_kset. Concurrent disable can kset_unregister and free sub_kset before scx_alloc_and_add_sched() dereferences it. Split sub_kset teardown: kobject_del() at disable keeps sysfs removal; defer kobject_put() to scx_sched_free_rcu_work so the memory survives. A racing child sees state_in_sysfs=0 with valid memory, sysfs_create_dir() fails, and the existing exit_kind gate in scx_link_sched() turns it away with -ENOENT. Fixes: 411d3ef1a705 ("sched_ext: Unregister sub_kset on scheduler disable") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6d69ba29cfd7..23f7b3f63b09 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4821,6 +4821,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) kfree(sch->cgrp_path); if (sch_cgroup(sch)) cgroup_put(sch_cgroup(sch)); + if (sch->sub_kset) + kobject_put(&sch->sub_kset->kobj); #endif /* CONFIG_EXT_SUB_SCHED */ for_each_possible_cpu(cpu) { @@ -5861,7 +5863,7 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5995,7 +5997,7 @@ static void scx_root_disable(struct scx_sched *sch) */ #ifdef CONFIG_EXT_SUB_SCHED if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); #endif kobject_del(&sch->kobj); From 2c308cf34284420963607d677d576a2b4124d8bd Mon Sep 17 00:00:00 2001 From: Zoran Ilievski Date: Mon, 11 May 2026 08:40:02 +0200 Subject: [PATCH 285/377] net: atlantic: preserve PCI wake-from-D3 on shutdown when WOL enabled The shutdown handler aq_pci_shutdown() unconditionally calls pci_wake_from_d3(pdev, false), clearing the PCI PME_En bit even when wake-on-LAN has been configured. While aq_nic_shutdown() correctly programs the NIC firmware via aq_nic_set_power() to listen for magic packets, the PCI subsystem will not propagate the resulting PME wake event from D3, so the system never wakes after poweroff. WOL from suspend (S3) is unaffected because aq_suspend_common() does not touch pci_wake_from_d3() and relies on the PM core's wake configuration via device_may_wakeup(). This affects all atlantic-supported NICs (AQC107/108/111/112/113); users have reported that WOL works if the atlantic driver is never loaded, but breaks once it has run its shutdown path. Pass the configured WOL state to pci_wake_from_d3() instead of a literal false, so the PCI PME_En bit is preserved when the user has armed WOL via ethtool. Fixes: 90869ddfefeb ("net: aquantia: Implement pci shutdown callback") Cc: stable@vger.kernel.org Signed-off-by: Zoran Ilievski Reviewed-by: Sukhdeep Singh Link: https://patch.msgid.link/20260511064002.1857-1-goodboy@rexbytes.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index e9e38af680c3..39e1b606a75a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -371,7 +371,7 @@ static void aq_pci_shutdown(struct pci_dev *pdev) pci_disable_device(pdev); if (system_state == SYSTEM_POWER_OFF) { - pci_wake_from_d3(pdev, false); + pci_wake_from_d3(pdev, self->aq_hw->aq_nic_cfg->wol); pci_set_power_state(pdev, PCI_D3hot); } } From e3adf69f8eb121a9128c2b0029efd050d3649153 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sat, 9 May 2026 22:50:46 +0100 Subject: [PATCH 286/377] net: ethtool: phy: avoid NULL deref when PHY driver is unbound phydev->drv can become NULL while the phy_device is still attached to its net_device, namely after the PHY driver is unbound via sysfs: echo > /sys/bus/mdio_bus/drivers//unbind phy_remove() clears phydev->drv but doesn't call phy_detach(), so the phy_device stays in the link topology xarray and ethnl_req_get_phydev() still hands it back. ETHTOOL_MSG_PHY_GET then oopses on: rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); drvname is already treated as optional by phy_reply_size(), phy_fill_reply() and phy_cleanup_data(), so just skip the allocation when there is no driver bound. Fixes: 9dd2ad5e92b9 ("net: ethtool: phy: Convert the PHY_GET command to generic phy dump") Cc: stable@vger.kernel.org # 6.13.x Signed-off-by: David Carlier Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/20260509215046.107157-1-devnexen@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/phy.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index f76d94d848d6..ddc6eab701ed 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -94,10 +94,12 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, if (!rep_data->name) return -ENOMEM; - rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); - if (!rep_data->drvname) { - ret = -ENOMEM; - goto err_free_name; + if (phydev->drv) { + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } } rep_data->upstream_type = pdn->upstream_type; From f9e2342046ef1560d35bcd4a4b1197648ffd151d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 9 May 2026 20:23:58 +0800 Subject: [PATCH 287/377] net: atm: fix skb leak in sigd_send() default branch The default branch in sigd_send() calls sock_put() and returns -EINVAL without freeing the skb, while all other exit paths do so. Add the missing dev_kfree_skb() before sock_put() to fix the leak. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Wei Yang Link: https://patch.msgid.link/20260509122358.1102997-1-albin_yang@163.com Signed-off-by: Jakub Kicinski --- net/atm/signaling.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 358fbe5e4d1d..b991d937205a 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -179,6 +179,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) break; default: pr_alert("bad message type %d\n", (int)msg->type); + dev_kfree_skb(skb); /* Paired with find_get_vcc(msg->vcc) above */ sock_put(sk); return -EINVAL; From a3fdd924d88c30b9f488636ce0e4696012cf5511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Coccia?= Date: Sun, 10 May 2026 12:34:13 -0400 Subject: [PATCH 288/377] net/smc: fix sleep-inside-lock in __smc_setsockopt() causing local DoS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A logic flaw in __smc_setsockopt() allows a local unprivileged user to cause a Denial of Service (DoS) by holding the socket lock indefinitely. The function __smc_setsockopt() calls copy_from_sockptr() while holding lock_sock(sk). By passing a userfaultfd-monitored memory page (or FUSE-backed memory on systems where unprivileged userfaultfd is disabled) as the optval, an attacker can halt execution during the copy operation, keeping the lock held. Combined with asynchronous tear-down operations like shutdown(), this exhausts the kernel wq (kworkers) and triggers the hung task watchdog. [ 240.123456] INFO: task kworker/u8:2 blocked for more than 120 seconds. [ 240.123489] Call Trace: [ 240.123501] smc_shutdown+... [ 240.123512] lock_sock_nested+... This patch moves the user-space copy outside the lock_sock() critical section to prevent the issue. Fixes: a6a6fe27bab4 ("net/smc: Dynamic control handshake limitation by socket options") Signed-off-by: Nicolò Coccia Reviewed-by: Dust Li Tested-by: Dust Li Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 185dbed7de5d..da28652f6810 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3054,18 +3054,17 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sk); + /* pre-fetch user data outside the lock */ + if (optname == SMC_LIMIT_HS) { + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + } + lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) { - rc = -EINVAL; - break; - } - if (copy_from_sockptr(&val, optval, sizeof(int))) { - rc = -EFAULT; - break; - } - smc->limit_smc_hs = !!val; rc = 0; break; From 7bf563badd37cb796df5477d2b78bb64148a1268 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Sun, 10 May 2026 15:26:40 -0700 Subject: [PATCH 289/377] net/smc: avoid NULL deref of conn->lnk in smc_msg_event tracepoint The smc_msg_event tracepoint class, shared by smc_tx_sendmsg and smc_rx_recvmsg, unconditionally dereferences smc->conn.lnk: __string(name, smc->conn.lnk->ibname) conn->lnk is only set for SMC-R; for SMC-D it is NULL. Other code on these paths already handles this (e.g. !conn->lnk in SMC_STAT_RMB_TX_SIZE_SMALL()). With the tracepoint enabled, the first sendmsg()/recvmsg() on an SMC-D socket crashes: Oops: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [...] RIP: 0010:strlen+0x1e/0xa0 Call Trace: trace_event_raw_event_smc_msg_event (net/smc/smc_tracepoint.h:44) smc_rx_recvmsg (net/smc/smc_rx.c:515) smc_recvmsg (net/smc/af_smc.c:2859) __sys_recvfrom (net/socket.c:2315) __x64_sys_recvfrom (net/socket.c:2326) do_syscall_64 The faulting address 0x3e0 is offsetof(struct smc_link, ibname), confirming the NULL ->lnk deref. Enabling the tracepoint requires root, but the trigger itself is unprivileged: socket(AF_SMC, ...) has no capability check, and SMC-D negotiation needs no admin step on s390 or on x86 with the loopback ISM device loaded. Log an empty device name for SMC-D instead of dereferencing NULL. Fixes: aff3083f10bf ("net/smc: Introduce tracepoints for tx and rx msg") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Reviewed-by: Dust Li Reviewed-by: Sidraya Jayagond Signed-off-by: Jakub Kicinski --- net/smc/smc_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index a9a6e3c1113a..53da84f57fd6 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) - __string(name, smc->conn.lnk->ibname) + __string(name, smc->conn.lnk ? smc->conn.lnk->ibname : "") ), TP_fast_assign( From 3d042592ebd4c7e44974d556de0b727cb7db4dab Mon Sep 17 00:00:00 2001 From: Chenguang Zhao Date: Mon, 11 May 2026 09:43:43 +0800 Subject: [PATCH 290/377] ethtool: fix ethnl_bitmap32_not_zero() bit interval semantics ethnl_bitmap32_not_zero() should return true if some bit in [start, end) is set: - Fix inverted memchr_inv() sense: return true when the scan finds a non-zero byte, not when the middle words are all zero. - Return false for an empty interval (end <= start). - When end is 32-bit aligned, indices in [start, end) do not include any bits from map[end_word]; return false after earlier checks found no non-zero data. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Signed-off-by: Chenguang Zhao Signed-off-by: Jakub Kicinski --- net/ethtool/bitset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8bb98d3ea3db..a3a2cc6480a0 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -92,7 +92,7 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, u32 mask; if (end <= start) - return true; + return false; if (start % 32) { mask = ethnl_upper_bits(start); @@ -105,11 +105,11 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, start_word++; } - if (!memchr_inv(map + start_word, '\0', - (end_word - start_word) * sizeof(u32))) + if (memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) - return true; + return false; return map[end_word] & ethnl_lower_bits(end); } From f5b2772d14884f4be9e718644f1203d4d0e6f0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Sun, 10 May 2026 12:30:17 +0200 Subject: [PATCH 291/377] net: ethernet: ravb: Do not check URAM suspension when WoL is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When updating the driver to match latest datasheet to suspend access to URAM when suspending DMA transfers a corner-case was missed, URAM access will not be suspended if WoL is enabled. This lead to the error message (correctly) being triggered as URAM access is not suspended even tho it's requested as part of stopping DMA. Avoid checking if URAM access is suspended and printing the error message if WoL is enabled when we suspend the system, as we know it will not be. Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/all/CAMuHMdWnjV%3DHGE1o08zLhUfTgOSene5fYx1J5GG10mB%2BToq8qg@mail.gmail.com/ Fixes: 353d8e7989b6 ("net: ethernet: ravb: Suspend and resume the transmission flow") Signed-off-by: Niklas Söderlund Reviewed-by: Sai Krishna Tested-by: Geert Uytterhoeven Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/ravb_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 1dbfadb2a881..5f88733094d0 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1108,9 +1108,12 @@ static int ravb_stop_dma(struct net_device *ndev) /* Request for transmission suspension */ ravb_modify(ndev, CCC, CCC_DTSR, CCC_DTSR); - error = ravb_wait(ndev, CSR, CSR_DTS, CSR_DTS); - if (error) - netdev_err(ndev, "failed to stop AXI BUS\n"); + /* Access to URAM will not be suspended if WoL is enabled. */ + if (!priv->wol_enabled) { + error = ravb_wait(ndev, CSR, CSR_DTS, CSR_DTS); + if (error) + netdev_err(ndev, "failed to stop AXI BUS\n"); + } /* Stop AVB-DMAC process */ return ravb_set_opmode(ndev, CCC_OPC_CONFIG); From 5f7c7c63ffb1a187eb90c80864469db45f3bd2a8 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Wed, 13 May 2026 17:43:03 +0800 Subject: [PATCH 292/377] io_uring/rw: drop unused attr_type_mask from io_prep_rw_pi() io_prep_rw_pi() never used the attr_type_mask argument. Callers already validate sqe->attr_type_mask before invoking the helper (only IORING_RW_ATTR_FLAG_PI is supported today). Remove the dead parameter to avoid implying further interpretation happens here. Signed-off-by: Yang Xiuwei Link: https://patch.msgid.link/20260513094303.866533-1-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- io_uring/rw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index e729e0e7657e..0c4834645279 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) } static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, - u64 attr_ptr, u64 attr_type_mask) + u64 attr_ptr) { struct io_uring_attr_pi pi_attr; struct io_async_rw *io; @@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr); } return 0; } From 2d5d3fc593c9b7e41bee86175d7b9e11f470072e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 May 2026 16:58:48 +0200 Subject: [PATCH 293/377] KVM: VMX: introduce module parameter to disable CET There have been reports of host hangs caused by CET virtualization. Until these are analyzed further, introduce a module parameter that makes it possible to easily disable it. Link: https://lore.kernel.org/all/85548beb-1486-40f9-beb4-632c78e3360b@proxmox.com/ Cc: David Riley Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 1 + arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 56cacc06225e..31568274d8bb 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -14,6 +14,7 @@ extern bool __read_mostly flexpriority_enabled; extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; +extern bool __read_mostly enable_cet; extern bool __read_mostly enable_pml; extern int __read_mostly pt_mode; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c2c33a5f7dc..49feecb286b2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -108,6 +108,9 @@ module_param_named(unrestricted_guest, bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); +bool __read_mostly enable_cet = 1; +module_param_named(cet, enable_cet, bool, 0444); + static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); @@ -4476,7 +4479,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter * 3 and 4 for details. */ - if (cpu_has_load_cet_ctrl()) { + if (enable_cet) { vmcs_writel(HOST_S_CET, kvm_host.s_cet); vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); @@ -4532,6 +4535,10 @@ static u32 vmx_get_initial_vmentry_ctrl(void) if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + + if (!enable_cet) + vmentry_ctrl &= ~VM_ENTRY_LOAD_CET_STATE; + /* * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. */ @@ -4546,6 +4553,9 @@ static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + if (!enable_cet) + vmexit_ctrl &= ~VM_EXIT_LOAD_CET_STATE; + /* * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for * nested virtualization and thus allowed to be set in vmcs12. @@ -8155,7 +8165,7 @@ static __init void vmx_set_cpu_caps(void) * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code * fails, so disable CET in this case too. */ - if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + if (!enable_cet || !enable_unrestricted_guest || !cpu_has_vmx_basic_no_hw_errcode_cc()) { kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); @@ -8630,6 +8640,9 @@ __init int vmx_hardware_setup(void) !cpu_has_vmx_invept_global()) enable_ept = 0; + if (!cpu_has_load_cet_ctrl()) + enable_cet = 0; + /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { pr_err_ratelimited("NX (Execute Disable) not supported\n"); From 836efd35c472d89c838d7b17ef339ddb3286ffc5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 13 May 2026 20:11:29 +0900 Subject: [PATCH 294/377] block: fix handling of dead zone write plugs Shin'ichiro reported hard to reproduce unaligned write errors with zoned block devices. Under normal operation conditions (e.g. running XFS on an SMR disk), these errors are nearly impossible to trigger. But using a "slow" kernel with many debug options enables and some specific use cases (e.g. fio zbd test case 46), the errors can be reproduced fairly easily. The unaligned write errors come from mishandling a valid reference counting pattern of zone write plugs. Such pattern triggers for instance if a process A writes a zone (not necessarilly to the full state), another process B immediately resets the zone and immediately following the completion of the zone reset, starts issuing writes to the zone. With such pattern, in some cases, the zone write plugs worker thread of the device may still be holding a reference to the zone write plug of the zone taken when process A was writing to the zone. The following zone reset from process B marks the zone as dead but does not remove the zone write plug from the device hash table as a reference to the plug still exist. Once process B starts issuing new writes, the zone write plug is seen as dead and the writes from process B are immediately failed, despite this write pattern being perfectly legal. Fix this by allowing restoring a dead zone write plug to a live state if a write is issued to the zone when the zone is: marked as dead, empty and the write sector corresponds to the first sector of the zone (that is, the write is aligned to the zone write pointer). This is done with the new helper function disk_check_zone_wplug_dead(), which restores a dead zone write plug to a live state by clearing the BLK_ZONE_WPLUG_DEAD flag and restoring the initial reference to the zone write plug taken when the plug was added to the device hash table. Reported-by: Shin'ichiro Kawasaki Fixes: b7d4ffb51037 ("block: fix zone write plug removal") Signed-off-by: Damien Le Moal Tested-by: Shin'ichiro Kawasaki Link: https://patch.msgid.link/20260513111129.108809-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 30cad2bb9291..42ef830054dc 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -623,6 +623,28 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) } } +static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug) +{ + if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) + return false; + + /* + * If a new write is received right after a zone reset completes and + * while the disk_zone_wplugs_worker() thread has not yet released the + * reference on the zone write plug after processing the last write to + * the zone, then the new write BIO will see the zone write plug marked + * as dead. This case is however a false positive and a perfectly valid + * pattern. In such case, restore the zone write plug to a live one. + */ + if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) { + zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD; + refcount_inc(&zwplug->ref); + return false; + } + + return true; +} + static bool disk_zone_wplug_submit_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug); @@ -1444,12 +1466,12 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) spin_lock_irqsave(&zwplug->lock, flags); /* - * If we got a zone write plug marked as dead, then the user is issuing - * writes to a full zone, or without synchronizing with zone reset or - * zone finish operations. In such case, fail the BIO to signal this - * invalid usage. + * Check if we got a zone write plug marked as dead. If yes, then the + * user is likely issuing writes to a full zone, or without + * synchronizing with zone reset or zone finish operations. In such + * case, fail the BIO to signal this invalid usage. */ - if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { + if (disk_check_zone_wplug_dead(zwplug)) { spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); bio_io_error(bio); From 87d0740b7c4cc847be1b6f307ab6d8547cb1a726 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 May 2026 18:19:40 +0800 Subject: [PATCH 295/377] selftests: ublk: cap nthreads to kernel's actual nr_hw_queues dev->nthreads is derived from the user-requested queue count before the ADD command, but the kernel may reduce nr_hw_queues (capped to nr_cpu_ids). When the VM has fewer CPUs than requested queues, the daemon creates more handler threads than there are kernel queues. In non-batch mode, the extra threads access uninitialized queues (q_depth=0), submit zero io_uring SQEs, and block forever in io_cqring_wait. In batch mode, the extra threads cause similar hangs during device removal. In both cases, the stuck threads prevent the daemon from closing the char device, holding the last ublk_device reference and causing ublk_ctrl_del_dev() to hang in wait_event_interruptible(). Fix by capping dev->nthreads to the kernel-returned nr_hw_queues after the ADD command completes. per_io_tasks mode is excluded because threads interleave across all queues, so nthreads > nr_hw_queues is valid. Fixes: abe54c160346 ("selftests: ublk: kublk: decouple ublk_queues from ublk server threads") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260513101941.1373998-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index fbd9b1e7342a..0b23c09daea5 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1735,6 +1735,17 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } + /* + * The kernel may reduce nr_hw_queues (e.g. capped to nr_cpu_ids). + * Cap nthreads to the actual queue count to avoid creating extra + * handler threads that will hang during device removal. + * + * per_io_tasks mode is excluded: threads interleave across all + * queues so nthreads > nr_hw_queues is valid and intentional. + */ + if (!ctx->per_io_tasks && dev->nthreads > info->nr_hw_queues) + dev->nthreads = info->nr_hw_queues; + ret = ublk_start_daemon(ctx, dev); ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) From 8fb70afe671cc8c1f6237a39aabd50714fcd1189 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Sun, 10 May 2026 22:56:05 +0200 Subject: [PATCH 296/377] drm/xe: Drop unused ggtt_balloon field During recent GGTT refactoring we missed to drop now unused field from the xe_tile. Drop it now. Fixes: e904c56ba6e0 ("drm/xe: Rewrite GGTT VF initialization") Signed-off-by: Michal Wajdeczko Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260510205605.642-1-michal.wajdeczko@intel.com (cherry picked from commit 21d5a871f57909dc4d8e4f5d3bf92f9ccf2597b2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_tile_types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_tile_types.h b/drivers/gpu/drm/xe/xe_tile_types.h index 33932fd547d7..0048100ccb72 100644 --- a/drivers/gpu/drm/xe/xe_tile_types.h +++ b/drivers/gpu/drm/xe/xe_tile_types.h @@ -106,8 +106,6 @@ struct xe_tile { struct xe_lmtt lmtt; } pf; struct { - /** @sriov.vf.ggtt_balloon: GGTT regions excluded from use. */ - struct xe_ggtt_node *ggtt_balloon[2]; /** @sriov.vf.self_config: VF configuration data */ struct xe_tile_sriov_vf_selfconfig self_config; } vf; From ea324444ece9f301b5c4ff71b258cc68990c4d61 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 16 Mar 2026 16:12:00 +0100 Subject: [PATCH 297/377] x86/mce: Restore MCA polling interval halving RongQing reported that the MCA polling interval doesn't halve when an error gets logged. It was traced down to the commit in Fixes:, because: mce_timer_fn() |-> mce_poll_banks() |-> machine_check_poll() |-> mce_log() which will queue the work and return. Now, back in mce_timer_fn(): /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. */ if (mce_notify_irq()) <--- here we haven't ran the notifier chain yet so mce_need_notify is not set yet so this won't hit and we won't halve the interval iv. Now the notifier chain runs. mce_early_notifier() sets the bit, does mce_notify_irq(), that clears the bit and then the notifier chain a little later logs the error. So this is a silly timing issue. But, that's all unnecessary. All it needs to happen here is, the "should we notify of a logged MCE" mce_notify_irq() asks, should be simply a question to the mce gen pool: "Are you empty?" And that then turns into a simple yes or no answer and it all JustWorks(tm). So do that and also distribute the functionality where it belongs: - Print that MCE events have been logged in mce_log() - Trigger the mcelog tool specific work in the first notifier As a result, mce_notify_irq() can go now. Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector") Reported-by: Li RongQing Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Tested-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20260112082747.2842-1-lirongqing@baidu.com --- arch/x86/kernel/cpu/mce/core.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8dd424ac5de8..f3a793e3a6c8 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -90,7 +90,6 @@ struct mca_config mca_cfg __read_mostly = { }; static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); -static unsigned long mce_need_notify; /* * MCA banks polled by the period polling timer for corrected events. @@ -152,8 +151,10 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) { + pr_info(HW_ERR "Machine check events logged\n"); irq_work_queue(&mce_irq_work); + } } EXPORT_SYMBOL_GPL(mce_log); @@ -585,28 +586,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -static bool mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return true; - } - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -618,9 +597,7 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, /* Emit the trace record: */ trace_mce_record(err); - set_bit(0, &mce_need_notify); - - mce_notify_irq(); + mce_work_trigger(); return NOTIFY_DONE; } @@ -1804,7 +1781,7 @@ static void mce_timer_fn(struct timer_list *t) * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. */ - if (mce_notify_irq()) + if (!mce_gen_pool_empty()) iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); From 950953f774b3f69da6f413e045ef075e1f3da2df Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:44 +0200 Subject: [PATCH 298/377] drm/gma500/oaktrail_hdmi: fix i2c adapter leak on setup Make sure to drop the reference taken to the I2C adapter (and its module) when setting up HDMI to allow the adapter to be deregistered. Fixes: 1b082ccf5901 ("gma500: Add Oaktrail support") Cc: stable@vger.kernel.org # 3.3 Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-2-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_hdmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi.c b/drivers/gpu/drm/gma500/oaktrail_hdmi.c index 58d7e191fd56..403d21cbb3a2 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi.c @@ -580,6 +580,7 @@ static int oaktrail_hdmi_get_modes(struct drm_connector *connector) } else { edid = (struct edid *)raw_edid; /* FIXME ? edid = drm_get_edid(connector, i2c_adap); */ + i2c_put_adapter(i2c_adap); } if (edid) { From 657a091ab6d01d0091b77660c75cfed573c9a53e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:45 +0200 Subject: [PATCH 299/377] drm/gma500/oaktrail_lvds: fix hang on init failure The LVDS init code looks up an I2C adapter using i2c_get_adapter() and tries to read the EDID before falling back to allocating and registering its own adapter. The error handling does not separate these cases so on a late init failure it will try to deregister and free also an adapter that had previously been registered. Since i2c_get_adapter() takes another reference to the adapter, deregistration hangs indefinitely while waiting for the reference to be released. Fix this by only destroying adapters allocated during LVDS init on errors. Fixes: a57ebfc0b4da ("drm/gma500: Make oaktrail lvds use ddc adapter from drm_connector") Cc: stable@vger.kernel.org # 6.0 Cc: Patrik Jakobsson Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-3-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_lvds.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/gma500/oaktrail_lvds.c b/drivers/gpu/drm/gma500/oaktrail_lvds.c index 884d324f0044..983cc60a1e69 100644 --- a/drivers/gpu/drm/gma500/oaktrail_lvds.c +++ b/drivers/gpu/drm/gma500/oaktrail_lvds.c @@ -293,7 +293,7 @@ void oaktrail_lvds_init(struct drm_device *dev, { struct gma_encoder *gma_encoder; struct gma_connector *gma_connector; - struct gma_i2c_chan *ddc_bus; + struct gma_i2c_chan *ddc_bus = NULL; struct drm_connector *connector; struct drm_encoder *encoder; struct drm_psb_private *dev_priv = to_drm_psb_private(dev); @@ -421,7 +421,8 @@ void oaktrail_lvds_init(struct drm_device *dev, err_unlock: mutex_unlock(&dev->mode_config.mutex); - gma_i2c_destroy(to_gma_i2c_chan(connector->ddc)); + if (!IS_ERR_OR_NULL(ddc_bus)) + gma_i2c_destroy(ddc_bus); drm_encoder_cleanup(encoder); err_connector_cleanup: drm_connector_cleanup(connector); From 84d1c9b416d54afe760ca4c378bd95c89261254c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:46 +0200 Subject: [PATCH 300/377] drm/gma500/oaktrail_lvds: fix i2c adapter leaks on init The LVDS init code looks up an I2C adapter using i2c_get_adapter() and tries to read the EDID before falling back to allocating and registering its own adapter. Make sure to drop the references taken by i2c_get_adapter() when falling back to allocating an adapter as well as on late errors to allow the looked up adapter to be deregistered. Fixes: 1b082ccf5901 ("gma500: Add Oaktrail support") Cc: stable@vger.kernel.org # 3.3 Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-4-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_lvds.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/gma500/oaktrail_lvds.c b/drivers/gpu/drm/gma500/oaktrail_lvds.c index 983cc60a1e69..e194d0cce067 100644 --- a/drivers/gpu/drm/gma500/oaktrail_lvds.c +++ b/drivers/gpu/drm/gma500/oaktrail_lvds.c @@ -367,6 +367,8 @@ void oaktrail_lvds_init(struct drm_device *dev, if (edid == NULL && dev_priv->lpc_gpio_base) { ddc_bus = oaktrail_lvds_i2c_init(dev); if (!IS_ERR(ddc_bus)) { + if (i2c_adap) + i2c_put_adapter(i2c_adap); i2c_adap = &ddc_bus->base; edid = drm_get_edid(connector, i2c_adap); } @@ -423,6 +425,8 @@ void oaktrail_lvds_init(struct drm_device *dev, mutex_unlock(&dev->mode_config.mutex); if (!IS_ERR_OR_NULL(ddc_bus)) gma_i2c_destroy(ddc_bus); + else if (i2c_adap) + i2c_put_adapter(i2c_adap); drm_encoder_cleanup(encoder); err_connector_cleanup: drm_connector_cleanup(connector); From 7d8f3158a51cb40fc710d2a781549141a139b796 Mon Sep 17 00:00:00 2001 From: Yu Miao Date: Wed, 13 May 2026 10:39:07 +0800 Subject: [PATCH 301/377] selftests/cgroup: Fix error path leaks in test_percpu_basic When cg_name_indexed() returns NULL partway through the child creation loop, the code returned -1 without running cleanup_children and cleanup. That left the `parent` pathname allocation unreleased and did not remove child cgroup directories already created under the parent. Fix by jumping to cleanup_children instead of returning. When cg_create() fails, `child` (the pathname from cg_name_indexed()) was not freed before cleanup_children. Fix by freeing `child` before branching to cleanup_children. Fixes: 90631e1dea55 ("kselftests: cgroup: add perpcu memory accounting test") Signed-off-by: Yu Miao Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index eeabd34bf083..12f59925500b 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -368,11 +368,15 @@ static int test_percpu_basic(const char *root) for (i = 0; i < 1000; i++) { child = cg_name_indexed(parent, "child", i); - if (!child) - return -1; - - if (cg_create(child)) + if (!child) { + ret = -1; goto cleanup_children; + } + + if (cg_create(child)) { + free(child); + goto cleanup_children; + } free(child); } From 345f40166694e60db6d5cf02233814bb27ac5dec Mon Sep 17 00:00:00 2001 From: sunshaojie Date: Wed, 13 May 2026 18:37:38 +0800 Subject: [PATCH 302/377] cgroup/cpuset: Return only actually allocated CPUs during partition invalidation In update_parent_effective_cpumask() with partcmd_invalidate, the CPUs to return to the parent are computed as: adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); where xcpus = user_xcpus(cs) which returns cs->exclusive_cpus (if set) or cs->cpus_allowed. When exclusive_cpus is not set, user_xcpus(cs) can contain CPUs that were never actually granted to the partition due to sibling exclusion in compute_excpus(). Consequently, the invalidation may return CPUs to the parent that remain in use by sibling partitions, causing overlapping effective_cpus and triggering the WARN_ON_ONCE(1) in generate_sched_domains(). Use cs->effective_xcpus instead, which reflects the CPUs actually granted to this partition. Reproducer (on a 4-CPU machine): cd /sys/fs/cgroup mkdir a1 b1 # a1 becomes partition root with CPUs 0-1 echo "0-1" > a1/cpuset.cpus echo "root" > a1/cpuset.cpus.partition # b1 becomes partition root with CPUs 1-2, but sibling exclusion # reduces its effective_xcpus to CPU 2 only echo "1-2" > b1/cpuset.cpus echo "root" > b1/cpuset.cpus.partition # b1 changes cpus_allowed to 0-1 -> partition invalidation echo "0-1" > b1/cpuset.cpus # Expected: CPUs 2-3 (only CPU 2 returned from b1) # Actual: CPUs 1-3 (CPU 0-1 returned, overlapping with a1) cat cpuset.cpus.effective dmesg will also show a WARNING from generate_sched_domains() reporting overlapping partition root effective_cpus. Fixes: 2a3602030d80 ("cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict") Cc: stable@vger.kernel.org # v7.0+ Signed-off-by: sunshaojie Tested-by: Chen Ridong Reviewed-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e84e801e22cf..5c33ab20cc20 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; From d6a2d7b04b5a093021a7a0e2e69e9d5237dfa8cc Mon Sep 17 00:00:00 2001 From: Nicholas Carlini Date: Mon, 11 May 2026 18:02:16 +0000 Subject: [PATCH 303/377] io-wq: check that the predecessor is hashed in io_wq_remove_pending() io_wq_remove_pending() needs to fix up wq->hash_tail[] if the cancelled work was the tail of its hash bucket. When doing this, it checks whether the preceding entry in acct->work_list has the same hash value, but never checks that the predecessor is hashed at all. io_get_work_hash() is simply atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT, and the hash bits are never set for non-hashed work, so it returns 0. Thus, when a hashed bucket-0 work is cancelled while a non-hashed work is its list predecessor, the check spuriously passes and a pointer to the non-hashed io_kiocb is stored in wq->hash_tail[0]. Because non-hashed work is dequeued via the fast path in io_get_next_work(), which never touches hash_tail[], the stale pointer is never cleared. Therefore, after the non-hashed io_kiocb completes and is freed back to req_cachep, wq->hash_tail[0] is a dangling pointer. The io_wq is per-task (tctx->io_wq) and survives ring open/close, so the dangling pointer persists for the lifetime of the task; the next hashed bucket-0 enqueue dereferences it in io_wq_insert_work() and wq_list_add_after() writes through freed memory. Add the missing io_wq_is_hashed() check so a non-hashed predecessor never inherits a hash_tail[] slot. Cc: stable@vger.kernel.org Fixes: 204361a77f40 ("io-wq: fix hang after cancelling pending hashed work") Signed-off-by: Nicholas Carlini Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7a9f94a0ce6f..8cc7b47d3089 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq, if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) + if (prev_work && io_wq_is_hashed(prev_work) && + io_get_work_hash(prev_work) == hash) wq->hash_tail[hash] = prev_work; else wq->hash_tail[hash] = NULL; From 32d5019ed3b6ff4439cb075fb275f655c8a2059c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:47 +0200 Subject: [PATCH 304/377] block: pass a minsize argument to bio_iov_iter_bounce When bouncing for block size > PAGE_SIZE file systems that require file system block size alignment (e.g. zoned XFS), the bio needs to be big enough to fit an entire block. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 23 +++++++++++++---------- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 3 ++- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/block/bio.c b/block/bio.c index b8972dba68a0..f3e5d8bea08c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1279,11 +1279,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); } -static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) +static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size, + size_t minsize) { struct folio *folio; - while (*size > PAGE_SIZE) { + while (*size > minsize) { folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); if (folio) return folio; @@ -1307,7 +1308,7 @@ static void bio_free_folios(struct bio *bio) } static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t total_len = min(maxlen, iov_iter_count(iter)); @@ -1322,13 +1323,13 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, size_t this_len = min(total_len, SZ_1M); struct folio *folio; - if (this_len > PAGE_SIZE * 2) + if (this_len > minsize * 2) this_len = rounddown_pow_of_two(this_len); if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) break; - folio = folio_alloc_greedy(GFP_KERNEL, &this_len); + folio = folio_alloc_greedy(GFP_KERNEL, &this_len, minsize); if (!folio) break; bio_add_folio_nofail(bio, folio, this_len, 0); @@ -1348,12 +1349,12 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; - folio = folio_alloc_greedy(GFP_KERNEL, &len); + folio = folio_alloc_greedy(GFP_KERNEL, &len, minsize); if (!folio) return -ENOMEM; @@ -1390,6 +1391,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * @bio: bio to send * @iter: iter to read from / write into * @maxlen: maximum size to bounce + * @minsize: minimum folio allocation size * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require @@ -1397,11 +1399,12 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize) { if (op_is_write(bio_op(bio))) - return bio_iov_iter_bounce_write(bio, iter, maxlen); - return bio_iov_iter_bounce_read(bio, iter, maxlen); + return bio_iov_iter_bounce_write(bio, iter, maxlen, minsize); + return bio_iov_iter_bounce_read(bio, iter, maxlen, minsize); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b0a6549b3848..b36ee619cdcd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_BOUNCE) ret = bio_iov_iter_bounce(bio, dio->submit.iter, - iomap_max_bio_size(&iter->iomap)); + iomap_max_bio_size(&iter->iomap), alignment); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35..dc17780d6c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, From e7b8b3c5b2a65595d506ffedafac66f0a11fbdc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:48 +0200 Subject: [PATCH 305/377] block: align down bounces bios Just like for the extract user pages path, we need to align down the size to the supported boundary. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index f3e5d8bea08c..5f10900b3f42 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1345,7 +1345,7 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, if (!bio->bi_iter.bi_size) return -ENOMEM; - return 0; + return bio_iov_iter_align_down(bio, iter, minsize - 1); } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, @@ -1383,7 +1383,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0); if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); - return 0; + return bio_iov_iter_align_down(bio, iter, minsize - 1); } /** From c64a647c84f30be368404f50f9052ed6c75c0f17 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 7 May 2026 08:35:46 -0600 Subject: [PATCH 306/377] vfio/pci: fix dma-buf kref underflow after revoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_pci_dma_buf_move(revoked=true) and vfio_pci_dma_buf_cleanup() ran the same drain sequence: set priv->revoked, invalidate mappings, wait for fences, drop the registered kref, wait for completion. When the VFIO device fd was closed after PCI_COMMAND_MEMORY had been cleared, both ran in turn -- the second kref_put underflowed and the subsequent wait_for_completion() blocked on a completion that the first run had already consumed: refcount_t: underflow; use-after-free. WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x59/0x90 Call Trace: vfio_pci_dma_buf_cleanup+0x163/0x168 [vfio_pci_core] vfio_pci_core_close_device+0x67/0xe0 [vfio_pci_core] vfio_df_close+0x4c/0x80 [vfio] vfio_df_group_close+0x36/0x80 [vfio] vfio_device_fops_release+0x21/0x40 [vfio] __fput+0xe6/0x2b0 __x64_sys_close+0x3d/0x80 Collapse the duplication: vfio_pci_dma_buf_cleanup() now delegates the drain to vfio_pci_dma_buf_move(true), which is idempotent for already-revoked dma-bufs. cleanup retains only list removal and the device registration drop; the dma_resv_lock that bracketed those is dropped along with the in-line drain that required it, memory_lock continues to protect them. Re-arm the kref and the completion at the end of move()'s revoke branch so post-revoke state matches post-creation (kref == 1, completion ready). This keeps cleanup's call into move() a no-op when revoke already ran, and replaces the explicit kref_init() that the un-revoke branch used to perform for the un-revoke -> remap path. Fixes: 1a8a5227f229 ("vfio: Wait for dma-buf invalidation to complete") Reported-by: Joonas Kylmälä Closes: https://lore.kernel.org/all/GVXPR02MB12019AA6014F27EF5D773E89BFB372@GVXPR02MB12019.eurprd02.prod.outlook.com/ Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-7 Reviewed-by: Leon Romanovsky Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260507143548.1018405-1-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_dmabuf.c | 36 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index f87fd32e4a01..fdc22e8b4656 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -354,19 +354,18 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (revoked) { kref_put(&priv->kref, vfio_pci_dma_buf_done); wait_for_completion(&priv->comp); - } else { /* - * Kref is initialize again, because when revoke - * was performed the reference counter was decreased - * to zero to trigger completion. + * Re-arm the registered kref reference and the + * completion so the post-revoke state matches the + * post-creation state. An un-revoke followed by a + * new mapping needs the kref to be non-zero before + * kref_get(), and vfio_pci_dma_buf_cleanup() + * delegates its drain back through this revoke + * path on a possibly-already-revoked dma-buf. */ kref_init(&priv->kref); - /* - * There is no need to wait as no mapping was - * performed when the previous status was - * priv->revoked == true. - */ reinit_completion(&priv->comp); + } else { dma_resv_lock(priv->dmabuf->resv, NULL); priv->revoked = false; dma_resv_unlock(priv->dmabuf->resv); @@ -382,21 +381,22 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) struct vfio_pci_dma_buf *tmp; down_write(&vdev->memory_lock); + + /* + * Drain any active mappings via the revoke path. The move is + * idempotent for dma-bufs already in the revoked state and + * leaves every priv with the kref re-armed and the completion + * ready, so cleanup itself does not need to participate in kref + * bookkeeping. + */ + vfio_pci_dma_buf_move(vdev, true); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) continue; - dma_resv_lock(priv->dmabuf->resv, NULL); list_del_init(&priv->dmabufs_elm); priv->vdev = NULL; - priv->revoked = true; - dma_buf_invalidate_mappings(priv->dmabuf); - dma_resv_wait_timeout(priv->dmabuf->resv, - DMA_RESV_USAGE_BOOKKEEP, false, - MAX_SCHEDULE_TIMEOUT); - dma_resv_unlock(priv->dmabuf->resv); - kref_put(&priv->kref, vfio_pci_dma_buf_done); - wait_for_completion(&priv->comp); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); } From 6ae315d37924435516d697ea7dde0b799a5928e0 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 13 May 2026 13:24:38 +0200 Subject: [PATCH 307/377] sched_ext: Use HK_TYPE_DOMAIN_BOOT to detect isolcpus= domain isolation scx_enable() refuses to attach a BPF scheduler when isolcpus=domain is in effect by comparing housekeeping_cpumask(HK_TYPE_DOMAIN) against cpu_possible_mask. Since commit 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers"), HK_TYPE_DOMAIN's cpumask is RCU protected and dereferencing it requires either RCU read lock, the cpu_hotplug write lock, or the cpuset lock; scx_enable() holds none of these, so booting with isolcpus=domain and attaching any BPF scheduler triggers the following lockdep splat: ============================= WARNING: suspicious RCU usage ----------------------------- kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage! 1 lock held by scx_flash/281: #0: ffffffff8379fce0 (update_mutex){+.+.}-{4:4}, at: bpf_struct_ops_link_create+0x134/0x1c0 Call Trace: dump_stack_lvl+0x6f/0xb0 lockdep_rcu_suspicious.cold+0x37/0x70 housekeeping_cpumask+0xcd/0xe0 scx_enable.isra.0+0x17/0x120 bpf_scx_reg+0x5e/0x80 bpf_struct_ops_link_create+0x151/0x1c0 __sys_bpf+0x1e4b/0x33c0 __x64_sys_bpf+0x21/0x30 do_syscall_64+0x117/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f In addition, commit 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") made HK_TYPE_DOMAIN include cpuset isolated partitions as well, which means the current check also rejects BPF schedulers when a cpuset partition is active. That contradicts the original intent of commit 9f391f94a173 ("sched_ext: Disallow loading BPF scheduler if isolcpus= domain isolation is in effect"), which explicitly noted that cpuset partitions are honored through per-task cpumasks and should not be rejected. Switch to housekeeping_enabled(HK_TYPE_DOMAIN_BOOT), which reads only the housekeeping flag bit (no RCU dereference) and reflects exactly the boot-time isolcpus= configuration that the error message refers to. Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers") Cc: stable@vger.kernel.org # v7.0+ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo Acked-by: Frederic Weisbecker --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 23f7b3f63b09..a6d0a93d8174 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7415,8 +7415,7 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) static DEFINE_MUTEX(helper_mutex); struct scx_enable_cmd cmd; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } From df733ddc263dbe5f471e7c80c8b669532f56bf76 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:46:42 -0700 Subject: [PATCH 308/377] vfio/pci: Make VFIO_PCI_OFFSET_TO_INDEX() return unsigned VFIO_PCI_OFFSET_TO_INDEX() is used in several places with a signed parameter (e.g. loff_t). Because it makes no sense for a BAR/resource index to be negative, enforce this in the macro. This fixes at least one current issue, where vfio_pci_ioeventfd() uses this macro with an unvalidated signed loff_t returned into a signed type, leading to a possible negative array access. This instance does test against an out-of-bounds positive value, so treating the index as unsigned fixes this issue. Fixes: 89e1f7d4c66d8 ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511144642.2926799-1-mattev@meta.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..89165b769e5c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) From 4694efc4164123580f19467141cdcfb73f7a740a Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 9 May 2026 22:04:50 +0100 Subject: [PATCH 309/377] FDDI: defza: Sanitise the reset safety timer The reset actions of the DEFZA adapters are exceedingly slow, taking up to 30 seconds to complete by the device spec and typically in the range of 10 seconds in reality, as required for the device RTOS to boot, still quite a lot. Therefore a state machine is used that's interrupt driven, however a safety mechanism is required in case of adapter malfunction, so that if no state change interrupt has arrived in time, then the situation is taken care of. The safety mechanism depends on the origin of the reset. For regular adapter initialisation at the device probe time a sleep is requested. However a reset is also required by the device spec when the adapter has transitioned into the halted state, such as in response to a PC Trace event in the course of ring fault recovery, possibly a common network event. In that case no sleep is possible as a device halt is reported at the hardirq level. A timer is therefore set up to ensure progress in case no adapter state change interrupt has arrived in time, but as from commit 168f6b6ffbee ("timers: Use del_timer_sync() even on UP") a warning is issued as the timer is deleted in the hardirq handler upon an expected state change: defza: v.1.1.4 Oct 6 2018 Maciej W. Rozycki tc2: DEC FDDIcontroller 700 or 700-C at 0x18000000, irq 4 tc2: resetting the board... ------------[ cut here ]------------ WARNING: kernel/time/timer.c:1611 at __timer_delete_sync+0x104/0x120, CPU#0: swapper/0/0 Modules linked in: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 7.0.0-dirty #2 VOLUNTARY Stack : 9800000002027d08 00000000140120e0 0000000000000000 ffffffff8089d468 0000000000000000 0000000000000000 ffffffff807ed6b8 ffffffff80897458 ffffffff80897400 9800000002027b88 0000000000000000 7070617773203a6d 0000000000000000 9800000002027ba4 0000000000001000 6465746e69617420 0000000000000000 ffffffff807ed6b8 00000000140120e0 0000000000000009 000000000000064b ffffffff800dd14c 0000000000000036 9800000002184000 0000000000000000 0000000000000020 0000000000000000 ffffffff80910000 ffffffff8085c000 9800000002027c70 0000000000000001 ffffffff80045fa0 0000000000000000 0000000000000000 0000000000000000 0000000000000009 000000000000064b ffffffff800502b8 ffffffff807ed6b8 ffffffff80045fa0 ... Call Trace: [] show_stack+0x28/0xf0 [] dump_stack_lvl+0x48/0x7c [] __warn+0xa0/0x128 [] warn_slowpath_fmt+0x64/0xa4 [] __timer_delete_sync+0x104/0x120 [] fza_interrupt+0xc74/0xeb8 [] __handle_irq_event_percpu+0x70/0x228 [] handle_irq_event_percpu+0x18/0x78 [] handle_percpu_irq+0x50/0x80 [] generic_handle_irq+0x90/0xd0 [] do_IRQ+0x1c/0x30 [] handle_int+0x148/0x154 [] do_idle+0x40/0x108 [] cpu_startup_entry+0x2c/0x38 [] kernel_init+0x0/0x108 ---[ end trace 0000000000000000 ]--- tc2: OK tc2: model 700 (DEFZA-AA), MMF PMD, address 08-00-2b-xx-xx-xx tc2: ROM rev. 1.0, firmware rev. 1.2, RMC rev. A, SMT ver. 1 tc2: link unavailable ------------[ cut here ]------------ WARNING: kernel/time/timer.c:1611 at __timer_delete_sync+0x104/0x120, CPU#0: swapper/0/0 Modules linked in: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G W 7.0.0-dirty #2 VOLUNTARY Tainted: [W]=WARN Stack : 9800000002027d08 00000000140120e0 0000000000000000 ffffffff8089d468 0000000000000000 0000000000000000 ffffffff807ed6b8 ffffffff80897458 ffffffff80897400 9800000002027b88 0000000000000000 0000000000000000 0000000000000000 9800000002027ba4 0000000000001000 0000000000000000 0000000000000000 ffffffff807ed6b8 00000000140120e0 0000000000000009 000000000000064b ffffffff800dd14c 0000000000000036 9800000002184000 0000000000000000 0000000000000020 0000000000000000 ffffffff80910000 ffffffff8085c000 9800000002027c70 0000000000000001 ffffffff80045fa0 0000000000000000 0000000000000000 0000000000000000 0000000000000009 000000000000064b ffffffff800502b8 ffffffff807ed6b8 ffffffff80045fa0 ... Call Trace: [] show_stack+0x28/0xf0 [] dump_stack_lvl+0x48/0x7c [] __warn+0xa0/0x128 [] warn_slowpath_fmt+0x64/0xa4 [] __timer_delete_sync+0x104/0x120 [] fza_interrupt+0xc74/0xeb8 [] __handle_irq_event_percpu+0x70/0x228 [] handle_irq_event_percpu+0x18/0x78 [] handle_percpu_irq+0x50/0x80 [] generic_handle_irq+0x90/0xd0 [] do_IRQ+0x1c/0x30 [] handle_int+0x148/0x154 [] arch_local_irq_disable+0x4/0x28 [] do_idle+0x50/0x108 [] cpu_startup_entry+0x2c/0x38 [] kernel_init+0x0/0x108 ---[ end trace 0000000000000000 ]--- tc2: registered as fddi0 The immediate origin of the new warning is the switch away from aliasing del_timer_sync() to del_timer() (timer_delete_sync() to timer_delete() in terms of current function names) for UP configurations, which however is the only choice for this driver anyway as no SMP hardware supports the TURBOchannel bus this device interfaces to. Therefore there is a very remote issue only this is a sign of. Specifically if an adapter reset issued upon a transition to the halted state times out and first triggers fza_reset_timer() for another reset assertion, which then schedules fza_reset_timer() for reset deassertion and then that second call is pre-empted after poking at the hardware, but before the timer has been rearmed and owing to high system load causing exceedingly high scheduling latency control is not handed back before a transition to the uninitialised state has caused the timer to be deleted even before it has been started, then fza_reset_timer() will be called yet again and issue another reset even though by then the adapter has already recovered. Prevent this situation from happening by switching to timer_delete() for the transition to the halted state and protect the code region affected with a spinlock, also to make sure add_timer() has not been called twice in a row due to an execution race between the interrupt handler and the timer handler (though it could only happen on SMP, but let's keep the driver clean). It's a very unlikely sequence of events to happen and therefore there's no point in trying to be overly clever about it, such as by placing printk() calls outside the protection. For the transition to the uninitialised state switch to timer_delete_sync_try() instead, so that a timer isn't deleted that's just been rearmed by the timer handler and needs to watch for the device to come out of reset again (again, an SMP scenario only). Retain timer_delete_sync() invocations outside the hardirq context for a stray timer not to fire once device structures have been released. Fixes: 61414f5ec9834 ("FDDI: defza: Add support for DEC FDDIcontroller 700 TURBOchannel adapter") Signed-off-by: Maciej W. Rozycki Signed-off-by: Jakub Kicinski --- drivers/net/fddi/defza.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c index 064fa484f797..9bfecc87d6b2 100644 --- a/drivers/net/fddi/defza.c +++ b/drivers/net/fddi/defza.c @@ -984,7 +984,7 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) case FZA_STATE_UNINITIALIZED: netif_carrier_off(dev); - timer_delete_sync(&fp->reset_timer); + timer_delete_sync_try(&fp->reset_timer); fp->ring_cmd_index = 0; fp->ring_uns_index = 0; fp->ring_rmc_tx_index = 0; @@ -1018,7 +1018,9 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) fp->queue_active = 0; netif_stop_queue(dev); pr_debug("%s: queue stopped\n", fp->name); - timer_delete_sync(&fp->reset_timer); + + spin_lock(&fp->lock); + timer_delete(&fp->reset_timer); pr_warn("%s: halted, reason: %x\n", fp->name, FZA_STATUS_GET_HALT(status)); fza_regs_dump(fp); @@ -1027,6 +1029,8 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) fp->timer_state = 0; fp->reset_timer.expires = jiffies + 45 * HZ; add_timer(&fp->reset_timer); + spin_unlock(&fp->lock); + break; default: @@ -1046,7 +1050,9 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) static void fza_reset_timer(struct timer_list *t) { struct fza_private *fp = timer_container_of(fp, t, reset_timer); + unsigned long flags; + spin_lock_irqsave(&fp->lock, flags); if (!fp->timer_state) { pr_err("%s: RESET timed out!\n", fp->name); pr_info("%s: trying harder...\n", fp->name); @@ -1069,6 +1075,7 @@ static void fza_reset_timer(struct timer_list *t) fp->reset_timer.expires = jiffies + 45 * HZ; } add_timer(&fp->reset_timer); + spin_unlock_irqrestore(&fp->lock, flags); } static int fza_set_mac_address(struct net_device *dev, void *addr) From 320fb29ea23cfa1aeef32563da8748247db896ea Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 11 May 2026 14:30:57 -0400 Subject: [PATCH 310/377] net/sched: sch_cbs: Call qdisc_reset for child qdisc During a reset, CBS is not calling reset on its child qdisc, which might cause qlen/backlog accounting issues. For example, if we have CBS with a QFQ parent and a netem child with delay, we can create a scenario where the parent's qlen underflows. QFQ, specifically, uses qlen to check whether it should deference a pointer, so this scenario may cause a null-ptr deref in QFQ: [ 43.875639][ T319] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000009: 0000 [#1] SMP KASAN NOPTI [ 43.876124][ T319] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 43.876417][ T319] CPU: 10 UID: 0 PID: 319 Comm: ping Not tainted 7.0.0-13039-ge728258debd5 #773 PREEMPT(full) [ 43.876751][ T319] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 43.876949][ T319] RIP: 0010:qfq_dequeue+0x35c/0x1650 [ 43.877123][ T319] Code: 00 fc ff df 80 3c 02 00 0f 85 17 0e 00 00 4c 8d 73 48 48 89 9d b8 02 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 76 0c 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b [ 43.877648][ T319] RSP: 0018:ffff8881017ef4f0 EFLAGS: 00010216 [ 43.877845][ T319] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: dffffc0000000000 [ 43.878073][ T319] RDX: 0000000000000009 RSI: 0000000c40000000 RDI: ffff88810eef02b0 [ 43.878306][ T319] RBP: ffff88810eef0000 R08: ffff88810eef0280 R09: 1ffff1102120fd63 [ 43.878523][ T319] R10: 1ffff1102120fd66 R11: 1ffff1102120fd67 R12: 0000000c40000000 [ 43.878742][ T319] R13: ffff88810eef02b8 R14: 0000000000000048 R15: 0000000020000000 [ 43.878959][ T319] FS: 00007f9c51c47c40(0000) GS:ffff88817a0be000(0000) knlGS:0000000000000000 [ 43.879214][ T319] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 43.879403][ T319] CR2: 000055e69a2230a8 CR3: 000000010c07a000 CR4: 0000000000750ef0 [ 43.879621][ T319] PKRU: 55555554 [ 43.879735][ T319] Call Trace: [ 43.879844][ T319] [ 43.879924][ T319] __qdisc_run+0x169/0x1900 [ 43.880075][ T319] ? dev_qdisc_enqueue+0x8b/0x210 [ 43.880222][ T319] __dev_queue_xmit+0x2346/0x37a0 [ 43.880376][ T319] ? register_lock_class+0x3f/0x800 [ 43.880531][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.880684][ T319] ? __pfx___dev_queue_xmit+0x10/0x10 [ 43.880834][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.880977][ T319] ? __lock_acquire+0x819/0x1df0 [ 43.881124][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881275][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881418][ T319] ? __asan_memcpy+0x3c/0x60 [ 43.881563][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881708][ T319] ? eth_header+0x165/0x1a0 [ 43.881853][ T319] ? lockdep_hardirqs_on_prepare+0xdb/0x1a0 [ 43.882031][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.882174][ T319] ? neigh_resolve_output+0x3cc/0x7e0 [ 43.882325][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.882471][ T319] ip_finish_output2+0x6b6/0x1e10 Fix this by calling qdisc_reset for CBS' child qdisc. Sashiko caught an issue which could result in a null ptr deref if qdisc_create_dflt() is invoked on an unitialised cbs qdisc which is exposed by this patch. We add an early return if the qdisc is null to address this. This is a similar approach used by two other fixes[1][2]. The proper fix for this specific issue elucidated by sashiko is to remove the call to qdisc_reset when qdisc_create_dflt fails. Since the dflt qdisc isn't attached anywhere yet at that point, calling the reset callback doesn't make much sense (and as stated has been a source of two other bugs). We plan on submitting this fix in a later patch. [1] https://lore.kernel.org/netdev/20221018063201.306474-2-shaozhengchao@huawei.com/ [2] https://lore.kernel.org/netdev/20221018063201.306474-4-shaozhengchao@huawei.com/ Fixes: 585d763af09c ("net/sched: Introduce Credit Based Shaper (CBS) qdisc") Reported-by: Junyoung Jang Tested-by: Junyoung Jang Tested-by: Victor Nogueira Acked-by: Vinicius Costa Gomes Signed-off-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/sch_cbs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 8c9a0400c862..0f953bd46b58 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -243,6 +243,20 @@ static struct sk_buff *cbs_dequeue(struct Qdisc *sch) return q->dequeue(sch); } +static void cbs_reset(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; + + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); + q->credits = 0; + q->last = 0; +} + static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; @@ -540,7 +554,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dequeue = cbs_dequeue, .peek = qdisc_peek_dequeued, .init = cbs_init, - .reset = qdisc_reset_queue, + .reset = cbs_reset, .destroy = cbs_destroy, .change = cbs_change, .dump = cbs_dump, From 59afae20080a9681014bdc87897cbfd30bedd261 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 11 May 2026 14:30:58 -0400 Subject: [PATCH 311/377] selftests/tc-testing: Add QFQ/CBS qlen underflow test Since CBS was not calling reset for its child qdisc, there are scenarios where it could cause an underflow on its parent's qlen/backlog. When the parent is QFQ, a null-ptr deref could occur. Add a test case that reproduces the underflow followed by a null-ptr deref scenario. Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index b1f856cf62c1..848696c373fc 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1284,5 +1284,46 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "3a62", + "name": "Try to create a qlen underflow with QFQ/CBS", + "category": [ + "qdisc", + "qfq", + "cbs" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY classid 1:1 parent 1: qfq", + "$TC class add dev $DUMMY classid 1:2 parent 1: qfq", + "$TC qdisc add dev $DUMMY handle 2: parent 1:1 cbs", + "$TC qdisc add dev $DUMMY handle 3: parent 2: netem delay 5000000000", + "$TC filter add dev $DUMMY parent 1: prio 1 u32 match ip dst 10.10.10.1 classid 1:1 action ok", + "$TC filter add dev $DUMMY parent 1: prio 2 u32 match ip dst 10.10.10.2 classid 1:2 action ok", + "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "$IP l set $DUMMY down", + "$IP l set $DUMMY up", + "$TC qdisc replace dev $DUMMY handle 4: parent 2: pfifo" + ], + "cmdUnderTest": "ping -c 1 10.10.10.2 -W0.01 -I$DUMMY", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1", + "matchJSON": [ + { + "kind": "cbs", + "handle": "2:", + "bytes": 0, + "packets": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] } ] From 6c7674b5b7ae513cecae22aa9dcdcf533862cf5c Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 27 Apr 2026 19:41:05 -0700 Subject: [PATCH 312/377] riscv: cfi: reduce shadow stack size limit from 4GB to 2GB Follow the ARM64 GCS (Guarded Control Stack) implementation approach by reducing the shadow stack size allocation from min(RLIMIT_STACK, 4GB) to min(RLIMIT_STACK/2, 2GB). See commit 506496bcbb42 ("arm64/gcs: Ensure that new threads have a GCS") Rationale: 1. Shadow stacks only store return addresses (8 bytes per entry), not local variables, function parameters, or saved registers. A 2GB shadow stack is far more than sufficient for any practical application, even with extremely deep recursion. Using half the size maintains adequate margin while being more resource-efficient. 2. On memory-constrained systems (e.g., platforms with only 4GB of physical memory, which is a common configuration), allocating 4GB of virtual address space for shadow stack per process/thread can lead to virtual memory allocation failures when the overcommit mode is set to OVERCOMMIT_GUESS or OVERCOMMIT_NEVER: Error: "__vm_enough_memory: not enough memory for the allocation" This reduces virtual address space consumption by 50% while maintaining more than adequate space for return address storage. Signed-off-by: Zong Li Link: https://patch.msgid.link/20260428024105.645162-1-zong.li@sifive.com [pjw@kernel.org: clean up patch description] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/usercfi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 6eaa0d94fdfe..cbfb4e495e9f 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -109,15 +109,16 @@ void set_indir_lp_lock(struct task_struct *task, bool lock) task->thread_info.user_cfi_state.ufcfi_locked = lock; } /* - * If size is 0, then to be compatible with regular stack we want it to be as big as - * regular stack. Else PAGE_ALIGN it and return back + * The shadow stack only stores the return address and not any variables + * this should be more than sufficient for most applications. + * Else PAGE_ALIGN it and return back */ static unsigned long calc_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); - return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); + return PAGE_ALIGN(min(rlimit(RLIMIT_STACK) / 2, SZ_2G)); } /* From 9a390d34d55cb4ecbca4981c660dd95440827c70 Mon Sep 17 00:00:00 2001 From: Sukhdeep Singh Date: Tue, 12 May 2026 18:27:11 +0530 Subject: [PATCH 313/377] MAINTAINERS: update atlantic driver maintainer Igor Russkikh and Egor Pomozov have left Marvell. Take over maintenance of the atlantic driver and its PTP subsystem. Signed-off-by: Sukhdeep Singh Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8a134cf6e9bb..7a4f2aab78ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2021,7 +2021,7 @@ F: Documentation/hwmon/aquacomputer_d5next.rst F: drivers/hwmon/aquacomputer_d5next.c AQUANTIA ETHERNET DRIVER (atlantic) -M: Igor Russkikh +M: Sukhdeep Singh L: netdev@vger.kernel.org S: Maintained W: https://www.marvell.com/ @@ -2030,7 +2030,7 @@ F: Documentation/networking/device_drivers/ethernet/aquantia/atlantic.rst F: drivers/net/ethernet/aquantia/atlantic/ AQUANTIA ETHERNET DRIVER PTP SUBSYSTEM -M: Egor Pomozov +M: Sukhdeep Singh L: netdev@vger.kernel.org S: Maintained W: http://www.aquantia.com From b84c5632c7b31f8910167075a8128cfb9e50fcfe Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Mon, 11 May 2026 22:05:51 +0800 Subject: [PATCH 314/377] net: net_failover: Fix the deadlock in slave register There is netdev_lock_ops() before the NETDEV_REGISTER notifier in register_netdevice(), so use the non-locking functions in net_failover_slave_register(). failover_slave_register() in failover_existing_slave_register() adds lock and unlock ops too. Call Trace: __schedule+0x30d/0x7a0 schedule+0x27/0x90 schedule_preempt_disabled+0x15/0x30 __mutex_lock.constprop.0+0x538/0x9e0 __mutex_lock_slowpath+0x13/0x20 mutex_lock+0x3b/0x50 dev_set_mtu+0x40/0xe0 net_failover_slave_register+0x24/0x280 failover_slave_register+0x103/0x1b0 failover_event+0x15e/0x210 ? dropmon_net_event+0xac/0xe0 notifier_call_chain+0x5e/0xe0 raw_notifier_call_chain+0x16/0x30 call_netdevice_notifiers_info+0x52/0xa0 register_netdevice+0x5f4/0x7c0 register_netdev+0x1e/0x40 _mlx5e_probe+0xe2/0x370 [mlx5_core] mlx5e_probe+0x59/0x70 [mlx5_core] ? __pfx_mlx5e_probe+0x10/0x10 [mlx5_core] Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") Signed-off-by: Faicker Mo Signed-off-by: Jakub Kicinski --- drivers/net/net_failover.c | 12 ++++++------ net/core/failover.c | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index d0361aaf25ef..3f7d31033bae 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -502,7 +502,7 @@ static int net_failover_slave_register(struct net_device *slave_dev, /* Align MTU of slave with failover dev */ orig_mtu = slave_dev->mtu; - err = dev_set_mtu(slave_dev, failover_dev->mtu); + err = netif_set_mtu(slave_dev, failover_dev->mtu); if (err) { netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n", slave_dev->name, failover_dev->mtu); @@ -512,11 +512,11 @@ static int net_failover_slave_register(struct net_device *slave_dev, dev_hold(slave_dev); if (netif_running(failover_dev)) { - err = dev_open(slave_dev, NULL); + err = netif_open(slave_dev, NULL); if (err && (err != -EBUSY)) { netdev_err(failover_dev, "Opening slave %s failed err:%d\n", slave_dev->name, err); - goto err_dev_open; + goto err_netif_open; } } @@ -562,10 +562,10 @@ static int net_failover_slave_register(struct net_device *slave_dev, err_vlan_add: dev_uc_unsync(slave_dev, failover_dev); dev_mc_unsync(slave_dev, failover_dev); - dev_close(slave_dev); -err_dev_open: + netif_close(slave_dev); +err_netif_open: dev_put(slave_dev); - dev_set_mtu(slave_dev, orig_mtu); + netif_set_mtu(slave_dev, orig_mtu); done: return err; } diff --git a/net/core/failover.c b/net/core/failover.c index 11bb183c7a1b..e43c59cd6868 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static LIST_HEAD(failover_list); @@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev) for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { + netdev_lock_ops(dev); failover_slave_register(dev); + netdev_unlock_ops(dev); + } } rtnl_unlock(); } From c6690a9030d784d3f099850800b6d5323771ca37 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:30:58 +0800 Subject: [PATCH 315/377] macsec: introduce dedicated workqueue for SA crypto cleanup Introduce a dedicated ordered workqueue, macsec_wq, which will be used by subsequent patches to defer SA crypto cleanup (crypto_free_aead and related teardown) out of softirq context. Using a dedicated workqueue instead of system_wq allows macsec_exit() to drain exactly the work items belonging to this module via destroy_workqueue(), without interfering with unrelated work items on system_wq or causing unexpected delays elsewhere. rcu_barrier() in macsec_exit() ensures all in-flight rcu_work callbacks have enqueued their work items before destroy_workqueue() drains and destroys the queue, making the two-step teardown correct and complete. The same sequence is kept in the error path of macsec_init() as a precaution, to mirror macsec_exit() and stay safe if work ever becomes queueable before this point in the future. While at it, rename the error labels in macsec_init() from the resource-named style (rtnl:, notifier:, wq:) to the err_xxx: style (err_rtnl:, err_notifier:, err_destroy_wq:) to align with the broader kernel convention. Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-2-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 6147ee8b1d78..ef5ac634f916 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -26,6 +26,8 @@ #include +static struct workqueue_struct *macsec_wq; + /* SecTAG length = macsec_eth_header without the optional SCI */ #define MACSEC_TAG_LEN 6 @@ -4505,25 +4507,35 @@ static int __init macsec_init(void) { int err; + macsec_wq = alloc_workqueue("macsec", WQ_UNBOUND, 0); + if (!macsec_wq) + return -ENOMEM; + pr_info("MACsec IEEE 802.1AE\n"); err = register_netdevice_notifier(&macsec_notifier); if (err) - return err; + goto err_destroy_wq; err = rtnl_link_register(&macsec_link_ops); if (err) - goto notifier; + goto err_notifier; err = genl_register_family(&macsec_fam); if (err) - goto rtnl; + goto err_rtnl; return 0; -rtnl: +err_rtnl: rtnl_link_unregister(&macsec_link_ops); -notifier: +err_notifier: unregister_netdevice_notifier(&macsec_notifier); +err_destroy_wq: + /* Precautionary, mirrors macsec_exit() to stay safe if work + * ever becomes queueable before this point in the future. + */ + rcu_barrier(); + destroy_workqueue(macsec_wq); return err; } @@ -4533,6 +4545,7 @@ static void __exit macsec_exit(void) rtnl_link_unregister(&macsec_link_ops); unregister_netdevice_notifier(&macsec_notifier); rcu_barrier(); + destroy_workqueue(macsec_wq); } module_init(macsec_init); From 6624bba469a325ecd699feae400b77cd11c76b98 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:30:59 +0800 Subject: [PATCH 316/377] macsec: use rcu_work to defer RX SA crypto cleanup out of softirq crypto_free_aead() can internally invoke vunmap() (e.g. via dma_free_attrs() in hardware crypto drivers such as hisi_sec2). vunmap() must not be called from softirq context, but free_rxsa() is an RCU callback that runs in softirq, leading to a kernel crash: vunmap+0x4c/0x70 __iommu_dma_free+0xd0/0x138 dma_free_attrs+0xf4/0x100 sec_aead_exit+0x64/0xb8 [hisi_sec2] crypto_destroy_tfm+0x98/0x110 free_rxsa+0x28/0x50 [macsec] rcu_do_batch+0x184/0x460 rcu_core+0xf4/0x1f8 handle_softirqs+0x118/0x330 Use rcu_work to defer the cleanup to a workqueue. rcu_work dispatches the worker asynchronously after the RCU grace period, so no thread blocks waiting, and concurrent releases of multiple SAs naturally share the same grace period. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-3-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index ef5ac634f916..e7ad24f1ea5b 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -176,9 +176,10 @@ static void macsec_rxsc_put(struct macsec_rx_sc *sc) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } -static void free_rxsa(struct rcu_head *head) +static void free_rxsa_work(struct work_struct *work) { - struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); + struct macsec_rx_sa *sa = + container_of(to_rcu_work(work), struct macsec_rx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -188,7 +189,7 @@ static void free_rxsa(struct rcu_head *head) static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_rxsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) @@ -1409,6 +1410,7 @@ static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); + INIT_RCU_WORK(&rx_sa->destroy_work, free_rxsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index bc7de5b53e54..0980ef36fbf0 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -123,6 +124,7 @@ struct macsec_dev_stats { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_rx_sa { struct macsec_key key; @@ -136,7 +138,7 @@ struct macsec_rx_sa { bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; - struct rcu_head rcu; + struct rcu_work destroy_work; }; struct pcpu_rx_sc_stats { From 552cc2306c3d87632f44a655737d1d367c2a3295 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:31:00 +0800 Subject: [PATCH 317/377] macsec: use rcu_work to defer TX SA crypto cleanup out of softirq free_txsa() is an RCU callback running in softirq context, but calls crypto_free_aead() which can invoke vunmap() internally on hardware crypto drivers (e.g. hisi_sec2), triggering a kernel crash. Use rcu_work to defer the cleanup to a workqueue, for the same reasons as the analogous fix to free_rxsa() in the previous patch. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-4-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index e7ad24f1ea5b..f904f4d16b45 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -205,9 +205,10 @@ static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) return sa; } -static void free_txsa(struct rcu_head *head) +static void free_txsa_work(struct work_struct *work) { - struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); + struct macsec_tx_sa *sa = + container_of(to_rcu_work(work), struct macsec_tx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -217,7 +218,7 @@ static void free_txsa(struct rcu_head *head) static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_txsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) @@ -1510,6 +1511,7 @@ static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); + INIT_RCU_WORK(&tx_sa->destroy_work, free_txsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index 0980ef36fbf0..d962093ee923 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -176,6 +176,7 @@ struct macsec_rx_sc { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_tx_sa { struct macsec_key key; @@ -188,7 +189,7 @@ struct macsec_tx_sa { refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; - struct rcu_head rcu; + struct rcu_work destroy_work; }; /** From f44d38a31f1802b7222adaea9ee69f9d280f698a Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 14 May 2026 10:18:47 +0800 Subject: [PATCH 318/377] io_uring: validate user-controlled cq.head in io_cqe_cache_refill() A fuzzing run reproduced an unkillable io_uring task stuck at ~100% CPU: [root@fedora io_uring_stress]# ps -ef | grep io_uring root 1240 1 99 13:36 ? 00:01:35 [io_uring_stress] The task loops inside io_cqring_wait() and never returns to userspace, and SIGKILL has no effect. This is caused by the CQ ring exposing rings->cq.head to userspace as writable, while the authoritative tail lives in kernel-private ctx->cached_cq_tail. io_cqe_cache_refill() computes free space as an unsigned subtraction: free = ctx->cq_entries - min(tail - head, ctx->cq_entries); If userspace keeps head within [0, tail], the subtraction is well defined and min() just acts as a defensive clamp. But if userspace advances head past tail, (tail - head) wraps to a huge value, free becomes 0, and io_cqe_cache_refill() fails. The CQE is pushed onto the overflow list and IO_CHECK_CQ_OVERFLOW_BIT is set. The wait loop in io_cqring_wait() relies on an invariant: refill() only fails when the CQ is *physically* full, in which case rings->cq.tail has been advanced to iowq->cq_tail and io_should_wake() returns true. The tampered head breaks this: refill() fails while the ring is not full, no OCQE is copied in, rings->cq.tail never catches up, io_should_wake() stays false, and io_cqring_wait_schedule() keeps returning early because IO_CHECK_CQ_OVERFLOW_BIT is still set. The result is a tight retry loop that never returns to userspace. Introduce io_cqring_queued() as the single point that converts the (tail, head) pair into a trustworthy queued count. Since the real head/tail distance is bounded by cq_entries (far below 2^31), a signed comparison reliably detects userspace moving head past tail; in that case treat the queue as empty so callers see the full cache as free and forward progress is preserved. Suggested-by: Jens Axboe Signed-off-by: Zizhi Wo Link: https://patch.msgid.link/20260514021847.4062782-1-wozizhi@huaweicloud.com [axboe: fixup commit message, kill 'queued' var, and keep it all in io_uring.c] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ebb0ba37c4f..036145ee466c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -686,13 +686,27 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, return ocqe; } +/* + * Compute queued CQEs for free-space calculation, clamped to cq_entries. + */ +static unsigned int io_cqring_queued(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = io_get_rings(ctx); + int diff; + + diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head)); + if (diff >= 0) + return min((unsigned int)diff, ctx->cq_entries); + return 0; +} + /* * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE * because the ring is a single 16b entry away from wrapping. */ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) { - if (__io_cqring_events(ctx) < ctx->cq_entries) { + if (io_cqring_queued(ctx) < ctx->cq_entries) { struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; cqe->user_data = 0; @@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int free, queued, len; + unsigned int free, len; /* * Posting into the CQ when there are pending overflowed CQEs may break @@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) off = 0; } - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; + free = ctx->cq_entries - io_cqring_queued(ctx); /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (len < (cqe32 + 1)) From 50da1c9ccb70fc5250c37ac474b54ee072732ea3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Apr 2026 16:23:04 -0700 Subject: [PATCH 319/377] riscv: Docs: fix unmatched quote warning 'make htmldocs' complains about ``prctrl` -- so add a second '`' to avoid the warning. Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils] Fixes: 08ee1559052b ("prctl: cfi: change the branch landing pad prctl()s to be more descriptive") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260406232304.1892528-1-rdunlap@infradead.org Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/zicfilp.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/riscv/zicfilp.rst b/Documentation/arch/riscv/zicfilp.rst index ab7d8e62ddaf..12b35969d17a 100644 --- a/Documentation/arch/riscv/zicfilp.rst +++ b/Documentation/arch/riscv/zicfilp.rst @@ -78,7 +78,7 @@ the program. Per-task indirect branch tracking state can be monitored and controlled via the :c:macro:`PR_GET_CFI` and :c:macro:`PR_SET_CFI` -``prctl()` arguments (respectively), by supplying +``prctl()`` arguments (respectively), by supplying :c:macro:`PR_CFI_BRANCH_LANDING_PADS` as the second argument. These are architecture-agnostic, and will return -EINVAL if the underlying functionality is not supported. From b69bcb13ed7024a84d6cd8ad330f1e32782fcf28 Mon Sep 17 00:00:00 2001 From: Vivian Wang Date: Wed, 1 Apr 2026 09:53:17 +0800 Subject: [PATCH 320/377] riscv: misaligned: Make enabling delegation depend on NONPORTABLE The unaligned access emulation code in Linux has various deficiencies. For example, it doesn't emulate vector instructions [1] [2], and doesn't emulate KVM guest accesses. Therefore, requesting misaligned exception delegation with SBI FWFT actually regresses vector instructions' and KVM guests' behavior. Until Linux can handle it properly, guard these sbi_fwft_set() calls behind RISCV_SBI_FWFT_DELEGATE_MISALIGNED, which in turn depends on NONPORTABLE. Those who are sure that this wouldn't be a problem can enable this option, perhaps getting better performance. The rest of the existing code proceeds as before, except as if SBI_FWFT_MISALIGNED_EXC_DELEG is not available, to handle any remaining address misaligned exceptions on a best-effort basis. The KVM SBI FWFT implementation is also not touched, but it is disabled if the firmware emulates unaligned accesses. Cc: stable@vger.kernel.org Fixes: cf5a8abc6560 ("riscv: misaligned: request misaligned exception from SBI") Reported-by: Songsong Zhang # KVM Link: https://lore.kernel.org/linux-riscv/38ce44c1-08cf-4e3f-8ade-20da224f529c@iscas.ac.cn/ [1] Link: https://lore.kernel.org/linux-riscv/b3cfcdac-0337-4db0-a611-258f2868855f@iscas.ac.cn/ [2] Signed-off-by: Vivian Wang Acked-by: Conor Dooley Link: https://patch.msgid.link/20260401-riscv-misaligned-dont-delegate-v2-1-5014a288c097@iscas.ac.cn Signed-off-by: Paul Walmsley --- arch/riscv/Kconfig | 22 ++++++++++++++++++++++ arch/riscv/kernel/traps_misaligned.c | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d235396c4514..c5754942cf85 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -937,6 +937,28 @@ config RISCV_VECTOR_MISALIGNED help Enable detecting support for vector misaligned loads and stores. +config RISCV_SBI_FWFT_DELEGATE_MISALIGNED + bool "Request firmware delegation of unaligned access exceptions" + depends on RISCV_SBI + depends on NONPORTABLE + help + Use SBI FWFT to request delegation of load address misaligned and + store address misaligned exceptions, if possible, and prefer Linux + kernel emulation of these accesses to firmware emulation. + + Unfortunately, Linux's emulation is still incomplete. Namely, it + currently does not handle vector instructions and KVM guest accesses. + On platforms where these accesses would have been handled by firmware, + enabling this causes unexpected kernel oopses, userspaces crashes and + KVM guest crashes. If you are sure that these are not a problem for + your platform, you can say Y here, which may improve performance. + + Saying N here will not worsen emulation support for unaligned accesses + even in the case where the firmware also has incomplete support. It + simply keeps the firmware's emulation enabled. + + If you don't know what to do here, say N. + choice prompt "Unaligned Accesses Support" default RISCV_PROBE_UNALIGNED_ACCESS diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 2a27d3ff4ac6..81b7682e6c6d 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -584,7 +584,7 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) static bool misaligned_traps_delegated; -#ifdef CONFIG_RISCV_SBI +#if defined(CONFIG_RISCV_SBI_FWFT_DELEGATE_MISALIGNED) static int cpu_online_sbi_unaligned_setup(unsigned int cpu) { From 31467b23823ffec1f6fff407f8e3ca9af8b7491a Mon Sep 17 00:00:00 2001 From: Sayali Patil Date: Wed, 13 May 2026 13:44:13 +0530 Subject: [PATCH 321/377] powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise() A kernel panic is observed when handling machine check exceptions from real mode. BUG: Unable to handle kernel data access on read at 0xc00000006be21300 Oops: Kernel access of bad area, sig: 11 [#1] MSR: 8000000000001003 CR: 88222248 XER: 00000005 CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0 NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70 LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150 Call Trace: [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150 [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0 The crash occurs because arch_irq_work_raise() calls preempt_disable() from machine check exception (MCE) handlers running in real mode. In this context, accessing the preempt_count can fault, leading to the panic. The preempt_disable()/preempt_enable() pair in arch_irq_work_raise() was originally added by commit 0fe1ac48bef0 ("powerpc/perf_event: Fix oops due to perf_event_do_pending call") to avoid races while raising irq work from exception context. Later, commit 471ba0e686cb ("irq_work: Do not raise an IPI when queueing work on the local CPU") added preemption protection in irq_work_queue() path, while commit 20b876918c06 ("irq_work: Use per cpu atomics instead of regular atomics") added equivalent protection in irq_work_queue_on() before reaching arch_irq_work_raise(): irq_work_queue() / irq_work_queue_on() -> preempt_disable() -> __irq_work_queue_local() -> irq_work_raise() -> arch_irq_work_raise() As a result, callers other than mce_irq_work_raise() already execute with preemption disabled, making the additional preempt_disable()/preempt_enable() pair in arch_irq_work_raise() redundant. The arch_irq_work_raise() function executes in NMI context when called from MCE handler. Hence we will not be preempted or scheduled out since we are in NMI context with MSR[EE]=0. Therefore, it is safe to remove the preempt_disable()/preempt_enable() calls from here. Remove it to avoid accessing preempt_count from real mode context. Fixes: cc15ff327569 ("powerpc/mce: Avoid using irq_work_queue() in realmode") Suggested-by: Mahesh Salgaonkar Acked-by: Shrikanth Hegde Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Sayali Patil [Maddy: Fixed the commit title] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260513081413.222490-1-sayalip@linux.ibm.com --- arch/powerpc/kernel/time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4bbeb8644d3d..b4472288e0d4 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -458,6 +458,10 @@ DEFINE_PER_CPU(u8, irq_work_pending); #endif /* 32 vs 64 bit */ +/* + * Must be called with preemption disabled since it updates + * per-CPU irq_work state and programs the local CPU decrementer. + */ void arch_irq_work_raise(void) { /* @@ -471,10 +475,8 @@ void arch_irq_work_raise(void) * which could get tangled up if we're messing with the same state * here. */ - preempt_disable(); set_irq_work_pending_flag(); set_dec(1); - preempt_enable(); } static void set_dec_or_work(u64 val) From 1ef2a89584b7b788b2603590d886db076b2f24cc Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:12 +0100 Subject: [PATCH 322/377] arm_mpam: Fix monitor instance selection when checking for hardware NRDY In _mpam_ris_hw_probe_hw_nrdy() a new register value to select the first monitor and relevant RIS is prepared in mon_sel. However, it is written to the monitor value register, e.g. MSMON_CSU, rather than MSMON_CFG_MON_SEL. As MSMON_CFG_MON_SEL is a 32 bit register update the type of mon_sel to u32. Write mon_sel to the intended register, MSMON_CFG_MON_SEL. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 41b14344b16f..817cb10a8e79 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -731,7 +731,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) { u32 now; - u64 mon_sel; + u32 mon_sel; bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; @@ -740,7 +740,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); - _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); now = _mpam_read_monsel_reg(msc, mon_reg); From 4387970bbd84fd14e0c49c3089c5061ccd86b98a Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:13 +0100 Subject: [PATCH 323/377] arm_mpam: Pretend that NRDY is always hardware managed Rule ZTXDS of the MPAM specification, IHI009 version B.b, states: "If a monitor does not support automatic updates of NRDY, software can use that bit for any purpose." As software is not reliably informed whether or not the monitor supports automatic updates of NRDY always assume that hardware may manage NRDY but don't rely on it. When NRDY is truly untouched by hardware then, as it is written to 0 on configuration, it will always read 0. At probe it's checked if MSMON_CSU.NRDY and MSMON_MBWU.NRDY are hardware managed but not MSMON_MBWU_L.NDRY. Specialize the checking for hardware managed NRDY to CSU counters as this is the only case where hardware management makes sense. Continue to inform the user if MSMON_CSU.NRDY appears to be hardware managed but the firmware doesn't provide the associated time limit for the automatic clearing of NRDY. Remove the NRDY feature flags as they are now unused. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 53 +++++++++++---------------------- drivers/resctrl/mpam_internal.h | 2 -- 2 files changed, 17 insertions(+), 38 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 817cb10a8e79..58e0c8970e8c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -728,7 +728,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) * Try and see what values stick in this bit. If we can write either value, * its probably not implemented by hardware. */ -static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) { u32 now; u32 mon_sel; @@ -742,21 +742,18 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); - _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); - now = _mpam_read_monsel_reg(msc, mon_reg); + _mpam_write_monsel_reg(msc, MSMON_CSU, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_set = now & MSMON___NRDY; - _mpam_write_monsel_reg(msc, mon_reg, 0); - now = _mpam_read_monsel_reg(msc, mon_reg); + _mpam_write_monsel_reg(msc, MSMON_CSU, 0); + now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_clear = !(now & MSMON___NRDY); mpam_mon_sel_unlock(msc); return (!can_set || !can_clear); } -#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ - _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) - static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) { int err; @@ -873,20 +870,18 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu_xcl, props); /* Is NRDY hardware managed? */ - hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); - if (hw_managed) - mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); - } + hw_managed = mpam_ris_hw_probe_csu_nrdy(ris); - /* - * Accept the missing firmware property if NRDY appears - * un-implemented. - */ - if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) - dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. + */ + if (err && hw_managed) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } } if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { - bool has_long, hw_managed; + bool has_long; u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); @@ -905,16 +900,6 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) } else { mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); } - - /* Is NRDY hardware managed? */ - hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); - if (hw_managed) - mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); - - /* - * Don't warn about any missing firmware property for - * MBWU NRDY - it doesn't make any sense! - */ } } } @@ -1197,7 +1182,6 @@ static void __ris_msmon_read(void *arg) bool reset_on_next_read = false; struct mpam_msc_ris *ris = m->ris; struct msmon_mbwu_state *mbwu_state; - struct mpam_props *rprops = &ris->props; struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; @@ -1253,8 +1237,7 @@ static void __ris_msmon_read(void *arg) switch (m->type) { case mpam_feat_msmon_csu: now = mpam_read_monsel_reg(msc, CSU); - if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; + nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) @@ -1266,8 +1249,7 @@ static void __ris_msmon_read(void *arg) case mpam_feat_msmon_mbwu_63counter: if (m->type != mpam_feat_msmon_mbwu_31counter) { now = mpam_msc_read_mbwu_l(msc); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___L_NRDY; + nrdy = now & MSMON___L_NRDY; if (m->type == mpam_feat_msmon_mbwu_63counter) now = FIELD_GET(MSMON___LWD_VALUE, now); @@ -1275,8 +1257,7 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___L_VALUE, now); } else { now = mpam_read_monsel_reg(msc, MBWU); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; + nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1914aefdcba9..04d1a59f02af 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -181,14 +181,12 @@ enum mpam_device_features { mpam_feat_msmon_csu, mpam_feat_msmon_csu_capture, mpam_feat_msmon_csu_xcl, - mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, mpam_feat_msmon_mbwu_31counter, mpam_feat_msmon_mbwu_44counter, mpam_feat_msmon_mbwu_63counter, mpam_feat_msmon_mbwu_capture, mpam_feat_msmon_mbwu_rwbw, - mpam_feat_msmon_mbwu_hw_nrdy, mpam_feat_partid_nrw, MPAM_FEATURE_LAST }; From ccad6001be5c38426ccf45790c411467ad3c03c6 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:14 +0100 Subject: [PATCH 324/377] arm_mpam: Improve check for whether or not NRDY is hardware managed mpam_ris_hw_probe_csu_nrdy() sets and clears MSMON_CSU.NRDY and checks whether it's configuration sticks. However, hardware isn't given a chance to disagree. Based on rule LRTGP, in MPAM specification IHI0099 version B.b, the hardware will set NRDY if it needs time to establish a count after a configuration change. Enable the monitor so that NRDY becomes relevant and change the configuration after clearing NRDY to try and coax the hardware into setting it. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 58e0c8970e8c..e145828f3f73 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -730,8 +730,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) */ static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) { - u32 now; - u32 mon_sel; + u32 now, mon_sel, ctl_val; bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; @@ -742,11 +741,21 @@ static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + /* Hardware might ignore nrdy if it's not enabled */ + ctl_val = MSMON_CFG_CSU_CTL_TYPE_CSU; + ctl_val |= MSMON_CFG_x_CTL_MATCH_PARTID; + ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + ctl_val |= MSMON_CFG_x_CTL_EN; + mpam_write_monsel_reg(msc, CFG_CSU_FLT, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + _mpam_write_monsel_reg(msc, MSMON_CSU, MSMON___NRDY); now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_set = now & MSMON___NRDY; _mpam_write_monsel_reg(msc, MSMON_CSU, 0); + /* Configuration change to try and coax hardware into setting nrdy */ + mpam_write_monsel_reg(msc, CFG_CSU_FLT, 0x1); now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_clear = !(now & MSMON___NRDY); mpam_mon_sel_unlock(msc); From f1caff3335ea6eab88cdc84ec8f2e3c45ca05486 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:38 +0100 Subject: [PATCH 325/377] arm_mpam: Fix false positive assert failure during mpam_disable() mpam_assert_partid_sizes_fixed() is used to document that the caller doesn't expect the discovered PARTID size to change while it is walking a list sized by PARTID. Typically the MSC state is not written to until all the MSC have been discovered and this value is set. However, if discovering the MSC fails and schedules mpam_disable(), then the MSC state is written to reset it. In this case the discovered PARTID size may be become smaller - but only PARTID 0 will be used once resctrl_exit() has been called. Skip the WARN_ON_ONCE() if mpam_disable_reason has been set. Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e145828f3f73..5c48812d8573 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -164,11 +164,17 @@ static void mpam_free_garbage(void) /* * Once mpam is enabled, new requestors cannot further reduce the available * partid. Assert that the size is fixed, and new requestors will be turned - * away. + * away. This is needed when walking over structures sized by PARTID. + * + * During mpam_disable() these structures are not fixed, but the MSC state + * is still reset using whatever sizes have been discovered so far. As only + * PARTID 0 will be used after mpam_disable(), any race would be benign. + * Skip the check if a mpam_disable_reason has been set. */ static void mpam_assert_partid_sizes_fixed(void) { - WARN_ON_ONCE(!partid_max_published); + if (!mpam_disable_reason) + WARN_ON_ONCE(!partid_max_published); } static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) From 6ccbb613b42a1f1ba7bfd547a148f644a902a25c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:39 +0100 Subject: [PATCH 326/377] arm_mpam: Check whether the config array is allocated before destroying it __destroy_component_cfg() is called to free the configuration array. It uses the embedded 'garbage' structure, which means the array has to be allocated. If __destroy_component_cfg() is called from mpam_disable() before the configuration was ever allocated, then a NULL pointer is dereferenced. Check for this case and return early if the configuration is not allocated. __destroy_component_cfg() also frees the mbwu_state as this is allocated by __allocate_component_cfg(). As the mbwu_state is allocated after comp->cfg is set, and is also under mpam_list_lock, only the first pointer needs checking. Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 5c48812d8573..988fc291241d 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2581,6 +2581,9 @@ static void __destroy_component_cfg(struct mpam_component *comp) lockdep_assert_held(&mpam_list_lock); + if (!comp->cfg) + return; + add_to_garbage(comp->cfg); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; From 277740023def559a4a2ddc3e8e784ee37a0f16a9 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Sun, 10 May 2026 23:21:38 -0700 Subject: [PATCH 327/377] net/smc: reject CHID-0 ACCEPT that matches an empty ism_dev slot On the SMC-D client, slot 0 of ini->ism_dev[]/ini->ism_chid[] is reserved for an SMC-Dv1 device. smc_find_ism_v2_device_clnt() populates V2 entries starting at index 1, so when no V1 device is selected slot 0 is left in its kzalloc()'ed state with ism_dev[0] == NULL and ism_chid[0] == 0. smc_v2_determine_accepted_chid() then matches the peer's CHID against the array starting from index 0 using the CHID alone. A malicious peer replying to a SMC-Dv2-only proposal with d1.chid == 0 matches the empty slot, ini->ism_selected becomes 0, and the subsequent ism_dev[0]->lgr_lock dereference in smc_conn_create() faults at offsetof(struct smcd_dev, lgr_lock) == 0x68: BUG: KASAN: null-ptr-deref in _raw_spin_lock_bh+0x79/0xe0 Write of size 4 at addr 0000000000000068 by task exploit/144 Call Trace: _raw_spin_lock_bh smc_conn_create (net/smc/smc_core.c:1997) __smc_connect (net/smc/af_smc.c:1447) smc_connect (net/smc/af_smc.c:1720) __sys_connect __x64_sys_connect do_syscall_64 Require ism_dev[i] to be non-NULL before accepting a CHID match. Fixes: a7c9c5f4af7f ("net/smc: CLC accept / confirm V2") Reported-by: Weiming Shi Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Xiang Mei Link: https://patch.msgid.link/20260511062138.2839584-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da28652f6810..dffbd529762d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1400,7 +1400,8 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + if (ini->ism_dev[i] && + ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } From 602d60ebae0f10bfbc7ba90eee026fdbd0203df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Apr 2026 11:42:32 +0200 Subject: [PATCH 328/377] vdso/gettimeofday: Reload sequence counter after switch to time page in do_aux() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After switching to the real data pages, the sequence counter needs to be reloaded from there. The code using vdso_read_begin_timens() assumed this worked by 'continue' jumping to the *beginning* of the do-while retry loop. However the 'continue' jumps to the *end* of said loop, evaluating the exit condition. If the data page has a sequence counter of '1' it will match the one from the time namespace page and prematurely exit the retry loop. This would result in garbage returned to the caller. Reload the sequence counter after switching the pages by using an inner while loop again, which will loop at most once. The loop generates slightly better code than an explicit reload through 'seq = vdso_read_begin()'. Fixes: ed78b7b2c5ae ("vdso/gettimeofday: Add a helper to read the sequence lock of a time namespace aware clock") Reported-by: Ricardo Ribalda Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Ricardo Ribalda Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260422-vdso-aux-timens-loop-v1-1-e2dd8c7164cc@linutronix.de Closes: https://lore.kernel.org/lkml/CANiDSCsOy0P1if-gJZqOM5pTJ0RDcwVfru1B7KFbTOEMqjPKJw@mail.gmail.com/ --- lib/vdso/gettimeofday.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index a5798bd26d20..da224011fafd 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -248,11 +248,10 @@ bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_ti vc = &vd->aux_clock_data[idx]; do { - if (vdso_read_begin_timens(vc, &seq)) { + while (vdso_read_begin_timens(vc, &seq)) { + /* Re-read from the real time data page, reload seq by looping */ vd = __arch_get_vdso_u_timens_data(vd); vc = &vd->aux_clock_data[idx]; - /* Re-read from the real time data page */ - continue; } /* Auxclock disabled? */ From 591711b32681a04b57d00c2a404658f8419a081c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 8 May 2026 18:09:20 +0200 Subject: [PATCH 329/377] drm/ttm: Convert -EAGAIN from dmem_cgroup_try_charge to -ENOSPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dmem_cgroup_try_charge() returns -EAGAIN when the cgroup limit is hit and the charge fails. TTM has no concept of -EAGAIN from resource allocation; -ENOSPC is the canonical error meaning "no space, try eviction". Convert at the source in ttm_resource_alloc() so no caller needs to handle an unexpected error code, and clean up the now-redundant -EAGAIN check in ttm_bo_alloc_resource(). Without this, -EAGAIN escaping ttm_resource_alloc() during an eviction walk causes the walk to terminate early instead of continuing to the next candidate. Cc: Friedrich Vock Cc: Maarten Lankhorst Cc: Tejun Heo Cc: Maxime Ripard Cc: Christian Koenig Cc: dri-devel@lists.freedesktop.org Cc: # v6.14+ Fixes: 2b624a2c1865 ("drm/ttm: Handle cgroup based eviction in TTM") Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260508160920.230339-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_resource.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 293401705542..bcd76f6bb7f0 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -739,7 +739,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM); ret = ttm_resource_alloc(bo, place, res, force_space ? &limit_pool : NULL); if (ret) { - if (ret != -ENOSPC && ret != -EAGAIN) { + if (ret != -ENOSPC) { dmem_cgroup_pool_state_put(limit_pool); return ret; } diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 0e5f1582f13d..154d6739256f 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -398,8 +398,11 @@ int ttm_resource_alloc(struct ttm_buffer_object *bo, if (man->cg) { ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool, ret_limit_pool); - if (ret) + if (ret) { + if (ret == -EAGAIN) + ret = -ENOSPC; return ret; + } } ret = man->func->alloc(man, bo, place, res_ptr); From 285943c6e7ca309bbea84b253745154241d9788a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 May 2026 10:49:17 -0700 Subject: [PATCH 330/377] net: tls: fix off-by-one in sg_chain entry count for wrapped sk_msg ring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an sk_msg scatterlist ring wraps (sg.end < sg.start), tls_push_record() chains the tail portion of the ring to the head using sg_chain(). An extra entry in the sg array is reserved for this: struct sk_msg_sg { [...] /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; The current code uses MAX_SKB_FRAGS + 1 as the ring size: sg_chain(&msg_pl->sg.data[msg_pl->sg.start], MAX_SKB_FRAGS - msg_pl->sg.start + 1, msg_pl->sg.data); This places the chain pointer at sg_chain(data[start], (MAX_SKB_FRAGS - msg_start + 1) .. = &data[start] + (MAX_SKB_FRAGS - msg_start + 1) - 1 = data[start + (MAX_SKB_FRAGS - start + 1) - 1] = data[MAX_SKB_FRAGS] instead of the true last entry. This is likely due to a "race" of the commit under Fixes landing close to commit 031097d9e079 ("bpf: sk_msg, zap ingress queue on psock down") Convert to ARRAY_SIZE and drop the data[start] / - start (as suggested by Sabrina). Reported-by: 钱一铭 Fixes: 9aaaa56845a0 ("bpf: Sockmap/tls, skmsg can have wrapped skmsg that needs extra chaining") Signed-off-by: Jakub Kicinski Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511174920.433155-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2590e855f6a5..2608b0c01849 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -800,11 +800,9 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) { - sg_chain(&msg_pl->sg.data[msg_pl->sg.start], - MAX_SKB_FRAGS - msg_pl->sg.start + 1, + if (msg_pl->sg.end < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), msg_pl->sg.data); - } i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); From ff26a0e8377dec07e4a7230db7675bed1b9a6d03 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 May 2026 10:49:18 -0700 Subject: [PATCH 331/377] net: tls: prevent chain-after-chain in plain text SG Sashiko points out that if end = 0 (start != 0) the current code will create a chain link to content type right after the wrap link: This would create a chain where the wrap link points directly to another chain link. The scatterlist API sg_next iterator does not recursively resolve consecutive chain links. meaning this is illegal input to crypto. The wrapping link is unnecessary if end = 0. end is the entry after the last one used so end = 0 means there's nothing pushed after the wrap: end start i v v v [ ]...[ ][ d ][ d ][ d ][ d ][rsv for wrap] Skip the wrapping in this case. TLS 1.3 can use the "wrapping slot" for it's chaining if end = 0. This avoids the chain-after-chain. Move the wrap chaining before marking END and chaining off content type, that feels like more logical ordering to me, but should not matter from functional perspective. Reported-by: Sashiko Fixes: 9aaaa56845a0 ("bpf: Sockmap/tls, skmsg can have wrapped skmsg that needs extra chaining") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260511174920.433155-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2608b0c01849..3bfdaf5e64f5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -789,21 +789,33 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_pl->sg.end; sk_msg_iter_var_prev(i); + /* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap + * link (frags won't use it). 'i' is now the last filled entry: + * + * i end start + * v v v [ rsv ] + * [ d ][ d ][ ][ ]...[ ][ d ][ d ][ d ][chain] + * ^ END v + * `-----------------------------------------' + * + * Note that SGL does not allow chain-after-chain, so for TLS 1.3, + * we must make sure we don't create the wrap entry and then chain + * link to content_type immediately at index 0. + */ + if (i < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), + msg_pl->sg.data); + rec->content_type = record_type; if (prot->version == TLS_1_3_VERSION) { /* Add content type to end of message. No padding added */ sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); sg_mark_end(&rec->sg_content_type); - sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, - &rec->sg_content_type); + sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type); } else { sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) - sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), - msg_pl->sg.data); - i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); From 561458db0d6b08b4e4956c6e4456d7781b18676f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 13 May 2026 14:51:29 -0600 Subject: [PATCH 332/377] docs: security-bugs: add a link to the threat-model documentation Rather than make readers search for this document, just a link to it where it is referenced. (While I was at it, I removed the unused and unneeded _threatmodel label from the top of threat-model.rst). Acked-by: Willy Tarreau Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet --- Documentation/process/security-bugs.rst | 13 +++++++------ Documentation/process/threat-model.rst | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index f85c65f31f12..3c51ddde31dd 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -191,12 +191,13 @@ handle: Please **always convert your report to plain text** without any formatting decorations before sending it. - * **Impact Evaluation**: Many AI-generated reports lack an understanding of - the kernel's threat model and go to great lengths inventing theoretical - consequences. This adds noise and complicates triage. Please stick to - verifiable facts (e.g., "this bug permits any user to gain CAP_NET_ADMIN") - without enumerating speculative implications. Have your tool read this - documentation as part of the evaluation process. + * **Impact Evaluation**: Many AI-generated reports lack an understanding + of the kernel's threat model (see Documentation/process/threat-model.rst) + and go to great lengths inventing theoretical consequences. This adds + noise and complicates triage. Please stick to verifiable facts (e.g., + "this bug permits any user to gain CAP_NET_ADMIN") without enumerating + speculative implications. Have your tool read this documentation as + part of the evaluation process. * **Reproducer**: AI-based tools are often capable of generating reproducers. Please always ensure your tool provides one and **test it thoroughly**. If diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst index ecb432390e79..91da52f7114f 100644 --- a/Documentation/process/threat-model.rst +++ b/Documentation/process/threat-model.rst @@ -1,5 +1,3 @@ -.. _threatmodel: - The Linux Kernel threat model ============================= From f2e65e4e5b4b4b9ecf43f03c3fdbe8c9a8a43a9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 13 May 2026 14:58:53 -0600 Subject: [PATCH 333/377] docs: threat-model: don't limit root capabilities to CAP_SYS_ADMIN The threat-model document says that only users with CAP_SYS_ADMIN can carry out a number of admin-level tasks, but there are numerous capabilities that can confer that sort of power. Generalize the text slightly to make it clear that CAP_SYS_ADMIN is not the only all-powerful capability. Acked-by: Willy Tarreau Signed-off-by: Jonathan Corbet --- Documentation/process/threat-model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst index 91da52f7114f..f177b8d3c1ca 100644 --- a/Documentation/process/threat-model.rst +++ b/Documentation/process/threat-model.rst @@ -62,7 +62,8 @@ on common processors featuring privilege levels and memory management units: * **Capability-based protection**: - * users not having the ``CAP_SYS_ADMIN`` capability may not alter the + * users not having elevated capabilities (including but not limited to + CAP_SYS_ADMIN) may not alter the kernel's configuration, memory nor state, change other users' view of the file system layout, grant any user capabilities they do not have, nor affect the system's availability (shutdown, reboot, panic, hang, or making From c78bdba7b9666020c0832150a4fc4c0aebc7c6ac Mon Sep 17 00:00:00 2001 From: Sven Schuchmann Date: Tue, 12 May 2026 09:19:47 +0200 Subject: [PATCH 334/377] net: phy: DP83TC811: add reading of abilities At this time the driver is not listing any speeds it supports. This should be ETHTOOL_LINK_MODE_100baseT1_Full_BIT for DP83TC811. Add the missing call for phylib to read the abilities. Fixes: b753a9faaf9a ("net: phy: DP83TC811: Introduce support for the DP83TC811 phy") Suggested-by: Andrew Lunn Signed-off-by: Sven Schuchmann Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260512071949.6218-1-schuchmann@schleissheimer.de [pabeni@redhat.com: dropped revision history] Signed-off-by: Paolo Abeni --- drivers/net/phy/dp83tc811.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c index e480c2a07450..252fb12b3e68 100644 --- a/drivers/net/phy/dp83tc811.c +++ b/drivers/net/phy/dp83tc811.c @@ -393,6 +393,7 @@ static struct phy_driver dp83811_driver[] = { .config_init = dp83811_config_init, .config_aneg = dp83811_config_aneg, .soft_reset = dp83811_phy_reset, + .get_features = genphy_c45_pma_read_ext_abilities, .get_wol = dp83811_get_wol, .set_wol = dp83811_set_wol, .config_intr = dp83811_config_intr, From 1d59f36e95f7f7134db0e313c9d787cb0adb2153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Mon, 11 May 2026 18:24:43 +0200 Subject: [PATCH 335/377] drm/ttm: Fix ttm_bo_shrink() infinite LRU walk on backup failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the same fix as b2ed01e7ad ("drm/ttm: Fix ttm_bo_swapout() infinite LRU walk on swapout failure") to the ttm_bo_shrink() path. Move del_bulk_move from before the backup to after success only, using ttm_resource_del_bulk_move_unevictable() since the resource is now unevictable once fully backed up. Fixes: 70d645deac98 ("drm/ttm: Add helpers for shrinking") Cc: Christian König Cc: Huang Rui Cc: Matthew Auld Cc: Matthew Brost Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v6.15+ Assisted-by: GitHub_Copilot:claude-opus-4.6 Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260511162443.24352-1-thomas.hellstrom@linux.intel.com Signed-off-by: Thomas Hellström --- drivers/gpu/drm/ttm/ttm_bo_util.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index f83b7d5ec6c6..3e3c201a0222 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -1112,19 +1112,14 @@ long ttm_bo_shrink(struct ttm_operation_ctx *ctx, struct ttm_buffer_object *bo, if (lret < 0) return lret; - if (bo->bulk_move) { - spin_lock(&bdev->lru_lock); - ttm_resource_del_bulk_move(bo->resource, bo); - spin_unlock(&bdev->lru_lock); - } - lret = ttm_tt_backup(bdev, bo->ttm, (struct ttm_backup_flags) {.purge = flags.purge, .writeback = flags.writeback}); - if (lret <= 0 && bo->bulk_move) { + if (lret > 0) { spin_lock(&bdev->lru_lock); - ttm_resource_add_bulk_move(bo->resource, bo); + ttm_resource_del_bulk_move_unevictable(bo->resource, bo); + ttm_resource_move_to_lru_tail(bo->resource); spin_unlock(&bdev->lru_lock); } From 7d9a7f1f96cd617ee9e75bb22217c709038e26b8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 14 May 2026 21:14:18 +0800 Subject: [PATCH 336/377] smb/client: fix possible infinite loop and oob read in symlink_data() On 32-bit architectures, the infinite loop is as follows: len = p->ErrorDataLength == 0xfffffff8 u8 *next = p->ErrorContextData + len next == p On 32-bit architectures, the out-of-bounds read is as follows: len = p->ErrorDataLength == 0xfffffff0 u8 *next = p->ErrorContextData + len next == (u8 *)p - 8 Reported-by: ChenXiaoSong Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Cc: stable@vger.kernel.org Signed-off-by: Ye Bin Reviewed-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/client/smb2file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index b292aa94a593..6860eff31693 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -49,6 +49,9 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) __func__, le32_to_cpu(p->ErrorId)); len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + if (len > end - ((u8 *)p + sizeof(*p))) + return ERR_PTR(-EINVAL); + p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len); } } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && From a6ab75639e23169a741b0b2e12191fd8acb32c73 Mon Sep 17 00:00:00 2001 From: Nick Chan Date: Thu, 14 May 2026 21:16:01 +0800 Subject: [PATCH 337/377] nvme-apple: Reset q->sq_tail during queue init Fixes a "duplicate tag error for tag 0" firmware crash during controller reset while setting up a queue on Apple A11 / T8015 caused by stale entries in the submission queue due to an invalid sq_tail offset after reset. Fixes: 04d8ecf37b5e ("nvme: apple: Add Apple A11 support") Cc: stable@vger.kernel.org Suggested-by: Yuriy Havrylyuk Reviewed-by: Sven Peter Signed-off-by: Nick Chan Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 423c9c628e7b..c692fc73babf 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1009,6 +1009,7 @@ static void apple_nvme_init_queue(struct apple_nvme_queue *q) unsigned int depth = apple_nvme_queue_depth(q); struct apple_nvme *anv = queue_to_apple_nvme(q); + q->sq_tail = 0; q->cq_head = 0; q->cq_phase = 1; if (anv->hw->has_lsq_nvmmu) From ab26dfeba278b0efbcea012f1698cf524d9b5695 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Wed, 13 May 2026 22:26:22 +0900 Subject: [PATCH 338/377] cifs: client: stage smb3_reconfigure() updates and restore ctx on failure smb3_reconfigure() moves strings out of cifs_sb->ctx before the multichannel update, so a later failure can leave the live context with NULL strings or options that do not match the session. Stage the new ctx separately, commit it only on success, and restore the snapshot on failure. Also make smb3_sync_session_ctx_passwords() all-or-nothing. Commit session passwords before channel updates so newly added channels authenticate with the staged credentials. Fixes: ef529f655a2c ("cifs: client: allow changing multichannel mount options on remount") Reported-by: RAJASI MANDAL Closes: https://lore.kernel.org/lkml/CAEY6_V1+dzW3OD5zqXhsWyXwrDTrg5tAMGZ1AJ7_GAuRE+aevA@mail.gmail.com/ Link: https://lore.kernel.org/lkml/xkr2dlvgibq5j6gkcxd3yhhnj4atgxw2uy4eug2pxm7wy7nbms@iq6cf5taa65v/ Reviewed-by: Henrique Carvalho Signed-off-by: DaeMyung Kang Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 163 +++++++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 54 deletions(-) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b63ec7ab6e51..4c8b1b9ade8b 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -736,7 +736,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data); static int smb3_get_tree(struct fs_context *fc); -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels); +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels); static int smb3_reconfigure(struct fs_context *fc); static const struct fs_context_operations smb3_fs_context_ops = { @@ -1010,25 +1010,34 @@ do { \ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { + char *password = NULL, *password2 = NULL; + if (ses->password && cifs_sb->ctx->password && strcmp(ses->password, cifs_sb->ctx->password)) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL); - if (!cifs_sb->ctx->password) + password = kstrdup(ses->password, GFP_KERNEL); + if (!password) return -ENOMEM; } if (ses->password2 && cifs_sb->ctx->password2 && strcmp(ses->password2, cifs_sb->ctx->password2)) { - kfree_sensitive(cifs_sb->ctx->password2); - cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL); - if (!cifs_sb->ctx->password2) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = NULL; + password2 = kstrdup(ses->password2, GFP_KERNEL); + if (!password2) { + kfree_sensitive(password); return -ENOMEM; } } + + if (password) { + kfree_sensitive(cifs_sb->ctx->password); + cifs_sb->ctx->password = password; + } + if (password2) { + kfree_sensitive(cifs_sb->ctx->password2); + cifs_sb->ctx->password2 = password2; + } + return 0; } @@ -1041,7 +1050,7 @@ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_se * with the session's channel lock. This should be called whenever the maximum * allowed channels for a session changes (e.g., after a remount or reconfigure). */ -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels) +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels) { spin_lock(&ses->chan_lock); ses->chan_max = max_channels; @@ -1051,12 +1060,15 @@ static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channe static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); + struct smb3_fs_context *new_ctx = NULL; + struct smb3_fs_context *old_ctx = NULL; struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; unsigned int rsize = ctx->rsize, wsize = ctx->wsize; char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; + bool need_mchan_update; int rc; if (ses->expired_pwd) @@ -1066,6 +1078,16 @@ static int smb3_reconfigure(struct fs_context *fc) if (rc) return rc; + old_ctx = kzalloc_obj(*old_ctx); + if (!old_ctx) + return -ENOMEM; + + rc = smb3_fs_context_dup(old_ctx, cifs_sb->ctx); + if (rc) { + kfree(old_ctx); + return rc; + } + /* * We can not change UNC/username/password/domainname/ * workstation_name/nodename/iocharset @@ -1075,16 +1097,22 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); + STEAL_STRING(cifs_sb, ctx, domainname); + STEAL_STRING(cifs_sb, ctx, nodename); + STEAL_STRING(cifs_sb, ctx, iocharset); - if (need_recon == false) + if (!need_recon) { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); - else { + } else { if (ctx->password) { new_password = kstrdup(ctx->password, GFP_KERNEL); - if (!new_password) - return -ENOMEM; - } else + if (!new_password) { + rc = -ENOMEM; + goto restore_ctx; + } + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + } } /* @@ -1094,11 +1122,29 @@ static int smb3_reconfigure(struct fs_context *fc) if (ctx->password2) { new_password2 = kstrdup(ctx->password2, GFP_KERNEL); if (!new_password2) { - kfree_sensitive(new_password); - return -ENOMEM; + rc = -ENOMEM; + goto restore_ctx; } - } else + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2); + } + + /* if rsize or wsize not passed in on remount, use previous values */ + ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; + ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; + + new_ctx = kzalloc_obj(*new_ctx); + if (!new_ctx) { + rc = -ENOMEM; + goto restore_ctx; + } + + rc = smb3_fs_context_dup(new_ctx, ctx); + if (rc) + goto restore_ctx; + + need_mchan_update = ctx->multichannel != cifs_sb->ctx->multichannel || + ctx->max_channels != cifs_sb->ctx->max_channels; /* * we may update the passwords in the ses struct below. Make sure we do @@ -1109,54 +1155,55 @@ static int smb3_reconfigure(struct fs_context *fc) /* * smb2_reconnect may swap password and password2 in case session setup * failed. First get ctx passwords in sync with ses passwords. It should - * be okay to do this even if this function were to return an error at a - * later stage + * be done before committing new passwords. */ rc = smb3_sync_session_ctx_passwords(cifs_sb, ses); if (rc) { mutex_unlock(&ses->session_mutex); - kfree_sensitive(new_password); - kfree_sensitive(new_password2); - return rc; - } - - /* - * now that allocations for passwords are done, commit them - */ - if (new_password) { - kfree_sensitive(ses->password); - ses->password = new_password; - } - if (new_password2) { - kfree_sensitive(ses->password2); - ses->password2 = new_password2; + goto cleanup_new_ctx; } /* * If multichannel or max_channels has changed, update the session's channels accordingly. * This may add or remove channels to match the new configuration. */ - if ((ctx->multichannel != cifs_sb->ctx->multichannel) || - (ctx->max_channels != cifs_sb->ctx->max_channels)) { - - /* Synchronize ses->chan_max with the new mount context */ - smb3_sync_ses_chan_max(ses, ctx->max_channels); - /* Now update the session's channels to match the new configuration */ + if (need_mchan_update) { /* Prevent concurrent scaling operations */ spin_lock(&ses->ses_lock); if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { spin_unlock(&ses->ses_lock); mutex_unlock(&ses->session_mutex); - return -EINVAL; + rc = -EINVAL; + goto cleanup_new_ctx; } ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; spin_unlock(&ses->ses_lock); + } + + /* + * Commit session passwords before any channel work so newly added + * channels authenticate with the new credentials. + */ + if (new_password) { + kfree_sensitive(ses->password); + ses->password = new_password; + new_password = NULL; + } + if (new_password2) { + kfree_sensitive(ses->password2); + ses->password2 = new_password2; + new_password2 = NULL; + } + + if (need_mchan_update) { + /* Synchronize ses->chan_max with the new mount context */ + smb3_sync_ses_chan_max(ses, ctx->max_channels); mutex_unlock(&ses->session_mutex); - rc = smb3_update_ses_channels(ses, ses->server, - false /* from_reconnect */, - false /* disable_mchan */); + smb3_update_ses_channels(ses, ses->server, + false /* from_reconnect */, + false /* disable_mchan */); /* Clear scaling flag after operation */ spin_lock(&ses->ses_lock); @@ -1166,22 +1213,30 @@ static int smb3_reconfigure(struct fs_context *fc) mutex_unlock(&ses->session_mutex); } - STEAL_STRING(cifs_sb, ctx, domainname); - STEAL_STRING(cifs_sb, ctx, nodename); - STEAL_STRING(cifs_sb, ctx, iocharset); - - /* if rsize or wsize not passed in on remount, use previous values */ - ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; - ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; - smb3_cleanup_fs_context_contents(cifs_sb->ctx); - rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); + memcpy(cifs_sb->ctx, new_ctx, sizeof(*new_ctx)); + kfree(new_ctx); + new_ctx = NULL; + smb3_cleanup_fs_context(old_ctx); + old_ctx = NULL; smb3_update_mnt_flags(cifs_sb); #ifdef CONFIG_CIFS_DFS_UPCALL if (!rc) rc = dfs_cache_remount_fs(cifs_sb); #endif + return rc; + +cleanup_new_ctx: + smb3_cleanup_fs_context_contents(new_ctx); +restore_ctx: + kfree(new_ctx); + kfree_sensitive(new_password); + kfree_sensitive(new_password2); + smb3_cleanup_fs_context_contents(cifs_sb->ctx); + memcpy(cifs_sb->ctx, old_ctx, sizeof(*old_ctx)); + kfree(old_ctx); + return rc; } From 31e62c2ebbfdc3fe3dbdf5e02c92a9dc67087a3a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2026 11:37:18 -0700 Subject: [PATCH 339/377] ptrace: slightly saner 'get_dumpable()' logic The 'dumpability' of a task is fundamentally about the memory image of the task - the concept comes from whether it can core dump or not - and makes no sense when you don't have an associated mm. And almost all users do in fact use it only for the case where the task has a mm pointer. But we have one odd special case: ptrace_may_access() uses 'dumpable' to check various other things entirely independently of the MM (typically explicitly using flags like PTRACE_MODE_READ_FSCREDS). Including for threads that no longer have a VM (and maybe never did, like most kernel threads). It's not what this flag was designed for, but it is what it is. The ptrace code does check that the uid/gid matches, so you do have to be uid-0 to see kernel thread details, but this means that the traditional "drop capabilities" model doesn't make any difference for this all. Make it all make a *bit* more sense by saying that if you don't have a MM pointer, we'll use a cached "last dumpability" flag if the thread ever had a MM (it will be zero for kernel threads since it is never set), and require a proper CAP_SYS_PTRACE capability to override. Reported-by: Qualys Security Advisory Cc: Oleg Nesterov Cc: Kees Cook Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ kernel/exit.c | 1 + kernel/ptrace.c | 22 ++++++++++++++++------ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..ee06cba5c6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/kernel/exit.c b/kernel/exit.c index 9a909993ab1d..f50d73c272d6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,7 @@ static void exit_mm(void) */ smp_mb__after_spinlock(); local_irq_disable(); + current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 68c17daef8d4..130043bfc209 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -272,11 +272,24 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return ns_capable(ns, CAP_SYS_PTRACE); } +static bool task_still_dumpable(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm = task->mm; + if (mm) { + if (get_dumpable(mm) == SUID_DUMP_USER) + return true; + return ptrace_has_cap(mm->user_ns, mode); + } + + if (task->user_dumpable) + return true; + return ptrace_has_cap(&init_user_ns, mode); +} + /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -337,11 +350,8 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * Pairs with a write barrier in commit_creds(). */ smp_rmb(); - mm = task->mm; - if (mm && - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; + if (!task_still_dumpable(task, mode)) + return -EPERM; return security_ptrace_access_check(task, mode); } From 81a874233c305d29e37fdb70b691ff4254294c0b Mon Sep 17 00:00:00 2001 From: Jeremy Erazo Date: Thu, 14 May 2026 12:03:34 +0000 Subject: [PATCH 340/377] smb: client: avoid integer overflow in SMB2 READ length check SMB2 READ response validation in cifs_readv_receive() and handle_read_data() checks data_offset + data_len against the received buffer length. Both values are attacker-controlled fields from the server response and are stored as unsigned int, so the addition can wrap before the bounds check: fs/smb/client/transport.c:1259 if (!use_rdma_mr && (data_offset + data_len > buflen)) fs/smb/client/smb2ops.c:4839 else if (buf_len >= data_offset + data_len) A malicious SMB server can use this to bypass validation. In the non-encrypted receive path the client attempts an oversized socket read and stalls for the SMB response timeout (180 seconds) before reconnecting. In the SMB3 encrypted path, runtime testing shows the malformed length can reach copy_to_iter() in handle_read_data() with attacker-controlled size, where usercopy hardening stops the oversized copy before bytes reach userspace. Guard both call sites with check_add_overflow(), which is already used elsewhere in this subsystem (smb2pdu.c). On overflow, treat the response as malformed and reject with -EIO. Signed-off-by: Jeremy Erazo Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 4 +++- fs/smb/client/transport.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6cb9b144530..3738204984f5 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4721,6 +4721,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, { unsigned int data_offset; unsigned int data_len; + unsigned int end_off; unsigned int cur_off; unsigned int cur_page_idx; unsigned int pad_len; @@ -4836,7 +4837,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } rdata->got_bytes = buffer_len; - } else if (buf_len >= data_offset + data_len) { + } else if (!check_add_overflow(data_offset, data_len, &end_off) && + buf_len >= end_off) { /* read response payload is in buf */ WARN_ONCE(buffer, "read data can be either in buf or in buffer"); copied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 05f8099047e1..fdf4e50c27ce 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1158,7 +1158,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length, len; - unsigned int data_offset, data_len; + unsigned int data_offset, data_len, end_off; struct cifs_io_subrequest *rdata = mid->callback_data; char *buf = server->smallbuf; unsigned int buflen = server->pdu_size; @@ -1256,11 +1256,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) use_rdma_mr = rdata->mr; #endif data_len = server->ops->read_data_length(buf, use_rdma_mr); - if (!use_rdma_mr && (data_offset + data_len > buflen)) { - /* data_len is corrupt -- discard frame */ - rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, - data_offset + data_len, buflen); - return cifs_readv_discard(server, mid); + if (!use_rdma_mr) { + if (check_add_overflow(data_offset, data_len, &end_off) || + end_off > buflen) { + /* data_len is corrupt -- discard frame */ + rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, + end_off, buflen); + return cifs_readv_discard(server, mid); + } } #ifdef CONFIG_CIFS_SMB_DIRECT From 905c559e51497b8bfdbb68df8be56d2f70f0de8e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 14 Mar 2026 14:24:56 +0100 Subject: [PATCH 341/377] gcc-plugins: Always define CONST_CAST_GIMPLE and CONST_CAST_TREE For gcc-16, the CONST_CAST macro family was removed. Add back what we were using in gcc-common.h, as they are simple wrappers. See GCC commits: c3d96ff9e916c02584aa081f03ab999292efbb50 458c7926d48959abcb2c1adaa22458e27459a551 Suggested-by: Ingo Saitz Link: https://lore.kernel.org/lkml/ab6OKoay0OWkywjK@spatz.zoo Fixes: 6b90bd4ba40b ("GCC plugin infrastructure") Tested-by: Ivan Bulatovic Tested-by: Christopher Cradock Signed-off-by: Kees Cook --- scripts/gcc-plugins/gcc-common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 8f1b3500f8e2..abb1964c44d4 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -309,7 +309,9 @@ typedef const gimple *const_gimple_ptr; #define gimple gimple_ptr #define const_gimple const_gimple_ptr #undef CONST_CAST_GIMPLE -#define CONST_CAST_GIMPLE(X) CONST_CAST(gimple, (X)) +#define CONST_CAST_GIMPLE(X) const_cast((X)) +#undef CONST_CAST_TREE +#define CONST_CAST_TREE(X) const_cast((X)) /* gimple related */ static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree lhs, tree op1, tree op2 MEM_STAT_DECL) From 28e03f78e69cf6628b81f24777799778528a84c1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 5 May 2026 12:24:17 +0200 Subject: [PATCH 342/377] x86/xen: Fix xen_e820_swap_entry_with_ram() When swapping a not page-aligned E820 map entry with RAM, the start address of the modified entry is calculated wrong (the offset into the page is subtracted instead of being added to the page address). Fixes: be35d91c8880 ("xen: tolerate ACPI NVS memory overlapping with Xen allocated memory") Reported-by: Jan Beulich Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross Message-ID: <20260505102417.208138-1-jgross@suse.com> --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index bb95a05259b8..41251d4cf953 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -655,7 +655,7 @@ static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) /* Fill new entry (keep size and page offset). */ entry->type = swap_entry->type; entry->addr = entry_end - swap_size + - swap_addr - swap_entry->addr; + swap_entry->addr - swap_addr; entry->size = swap_entry->size; /* Convert old entry to RAM, align to pages. */ From 4594437880ce347ac8438758fd91543f70da1aa9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 8 May 2026 16:39:33 +0200 Subject: [PATCH 343/377] x86/xen: Tolerate nested XEN_LAZY_MMU entering/leaving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the support of nested lazy mmu sections it can happen that arch_enter_lazy_mmu_mode() is being called twice without a call of arch_leave_lazy_mmu_mode() in between, as the lazy_mmu_*() helpers are not disabling preemption when checking for nested lazy mmu sections. This is a problem when running as a Xen PV guest, as xen_enter_lazy_mmu() and xen_leave_lazy_mmu() don't tolerate this case. Fix that in xen_enter_lazy_mmu() and xen_leave_lazy_mmu() in order not to hurt all other lazy mmu mode users. Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when context-switching") Tested-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Message-ID: <20260508143933.493013-1-jgross@suse.com> --- arch/x86/xen/mmu_pv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index c80d0058efd1..3eee5f84f8a7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2145,7 +2145,10 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void xen_enter_lazy_mmu(void) { - enter_lazy(XEN_LAZY_MMU); + preempt_disable(); + if (xen_get_lazy_mode() != XEN_LAZY_MMU) + enter_lazy(XEN_LAZY_MMU); + preempt_enable(); } static void xen_flush_lazy_mmu(void) @@ -2182,7 +2185,8 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - leave_lazy(XEN_LAZY_MMU); + if (xen_get_lazy_mode() != XEN_LAZY_NONE) + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } From 05f2a68b407a6817fe141dd64972c6ab8725312d Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:58:23 -0700 Subject: [PATCH 344/377] vfio/pci: Set up BAR resources and maps in vfio_pci_core_enable() Previously BAR resource requests and the corresponding pci_iomap() were performed on-demand and without synchronisation, which was racy. Rather than add synchronisation, it's simplest to address this by doing both activities from vfio_pci_core_enable(). The resource allocation and/or pci_iomap() can still fail; their status is tracked and existing calls to vfio_pci_core_setup_barmap() will fail in a similar way to before. This keeps the point of failure as observed by userspace the same, i.e. failures to request/map unused BARs are benign. Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511145829.2993601-2-mattev@meta.com [ERR_PTR -> IOMEM_ERR_PTR per lkp report] Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 37 +++++++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_rdwr.c | 26 ++++++---------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 3f8d093aacf8..050e7542952e 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -482,6 +482,40 @@ static int vfio_pci_core_runtime_resume(struct device *dev) } #endif /* CONFIG_PM */ +/* + * Eager-request BAR resources, and iomap them. Soft failures are + * allowed, and consumers must check the barmap before use in order to + * give compatible user-visible behaviour with the previous on-demand + * allocation method. + */ +static void vfio_pci_core_map_bars(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + int bar = i + PCI_STD_RESOURCES; + + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENODEV); + + if (!pci_resource_len(pdev, i)) + continue; + + if (pci_request_selected_regions(pdev, 1 << bar, "vfio")) { + pci_dbg(pdev, "Failed to reserve region %d\n", bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-EBUSY); + continue; + } + + vdev->barmap[bar] = pci_iomap(pdev, bar, 0); + if (!vdev->barmap[bar]) { + pci_dbg(pdev, "Failed to iomap region %d\n", bar); + pci_release_selected_regions(pdev, 1 << bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENOMEM); + } + } +} + /* * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power @@ -568,6 +602,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; + vfio_pci_core_map_bars(vdev); return 0; @@ -648,7 +683,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar = i + PCI_STD_RESOURCES; - if (!vdev->barmap[bar]) + if (IS_ERR_OR_NULL(vdev->barmap[bar])) continue; pci_iounmap(pdev, vdev->barmap[bar]); pci_release_selected_regions(pdev, 1 << bar); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 4251ee03e146..3bfbb879a005 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -198,27 +198,15 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, } EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); +/* + * The barmap is set up in vfio_pci_core_enable(). Callers use this + * function to check that the BAR resources are requested or that the + * pci_iomap() was done. + */ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { - struct pci_dev *pdev = vdev->pdev; - int ret; - void __iomem *io; - - if (vdev->barmap[bar]) - return 0; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); - if (ret) - return ret; - - io = pci_iomap(pdev, bar, 0); - if (!io) { - pci_release_selected_regions(pdev, 1 << bar); - return -ENOMEM; - } - - vdev->barmap[bar] = io; - + if (IS_ERR(vdev->barmap[bar])) + return PTR_ERR(vdev->barmap[bar]); return 0; } EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap); From 702809dabdecca807bdd50cfdcc1c980feb2ba62 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:58:24 -0700 Subject: [PATCH 345/377] vfio/pci: Check BAR resources before exporting a DMABUF A DMABUF exports access to BAR resources and, although they are requested at startup time, we need to ensure they really were reserved before exporting. Otherwise, it's possible to access unreserved resources through the export. Add a check to the DMABUF-creation path. Fixes: 5d74781ebc86c ("vfio/pci: Add dma-buf export support for MMIO regions") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511145829.2993601-3-mattev@meta.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_dmabuf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index fdc22e8b4656..1a177ce7de54 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -244,9 +244,11 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, return -EINVAL; /* - * For PCI the region_index is the BAR number like everything else. + * For PCI the region_index is the BAR number like everything + * else. Check that PCI resources have been claimed for it. */ - if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX || + vfio_pci_core_setup_barmap(vdev, get_dma_buf.region_index)) return -ENODEV; dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, From c207f1d785044667f87cc8c72355e33f3981f2d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 13 May 2026 19:50:02 +0100 Subject: [PATCH 346/377] smbdirect: Fix error cleanup in smbdirect_map_sges_from_iter() Fix smbdirect_map_sges_from_iter() to use pre-decrement, not post-decrement so that it cleans up the correct slots. Fixes: e5fbdde43017 ("cifs: Add a function to build an RDMA SGE list from an iterator") Closes: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Reviewed-by: Stefan Metzmacher cc: Paulo Alcantara cc: Tom Talpey cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/smbdirect/connection.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c index fe9912e53da6..8adf58097534 100644 --- a/fs/smb/smbdirect/connection.c +++ b/fs/smb/smbdirect/connection.c @@ -2168,7 +2168,7 @@ static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len, if (ret < 0) { while (state->num_sge > before) { - struct ib_sge *sge = &state->sge[state->num_sge--]; + struct ib_sge *sge = &state->sge[--state->num_sge]; ib_dma_unmap_page(state->device, sge->addr, From 51f57607e30bee282a1d40845f89a311cbb26481 Mon Sep 17 00:00:00 2001 From: Chen-Shi-Hong Date: Thu, 14 May 2026 23:39:13 +0800 Subject: [PATCH 347/377] docs: hwmon: sy7636a: fix temperature sysfs attribute name The hwmon sysfs naming convention uses temp[1-*]_input for temperature channels. Documentation/hwmon/sy7636a-hwmon.rst currently documents temp0_input, while the driver uses the standard hwmon temperature channel interface. Update the documentation to use temp1_input. Signed-off-by: Chen-Shi-Hong Link: https://lore.kernel.org/r/20260514154108.1937-1-eric039eric@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/sy7636a-hwmon.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/sy7636a-hwmon.rst b/Documentation/hwmon/sy7636a-hwmon.rst index 0143ce0e5db7..03d866aba6e8 100644 --- a/Documentation/hwmon/sy7636a-hwmon.rst +++ b/Documentation/hwmon/sy7636a-hwmon.rst @@ -22,5 +22,5 @@ The following sensors are supported sysfs-Interface --------------- -temp0_input +temp1_input - Temperature of external NTC (milli-degree C) From 6fc7e8a3b8115294f60f5c89de27330bf1b9c98e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:13 -0300 Subject: [PATCH 348/377] iommu: Fix loss of errno on map failure for classic ops A typo, likely from a rebase, inverted the condition and caused errors to be lost. Fix it to be "if (ret)". This was breaking iommu_create_device_direct_mappings() on drivers that don't use iommupt and don't fully set up their domain in alloc_pages() (i.e., SMMUv2). In this case the first call of iommu_create_device_direct_mappings() should fail due to the incompletely initialized domain. Since it wrongly returns success, the second call to iommu_create_device_direct_mappings() doesn't happen and IOMMU_RESV_DIRECT is never set up. Cc: stable@vger.kernel.org Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Reported-by: Josua Mayer Closes: https://lore.kernel.org/all/321c2e57-6a17-4aef-ba42-d2ebd577e472@solid-run.com/ Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e7bd28cc77ee..05f78cfd1f1d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2709,7 +2709,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, return 0; } ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (!ret) + if (ret) return ret; trace_map(iova, paddr, size); From b948a87228482235afbaf5f4d8037860b5c470fd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:14 -0300 Subject: [PATCH 349/377] iommu: Fix up map/unmap debugging for iommupt domains Sashiko noticed a few issues in this path, and a few more were found on review. Tidy them up further. These are intertwined because the debug code depends on some of the WARN_ONs to function right: Lift into iommu_map_nosync(): - The might_sleep_if() - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - WARN_ON for illegal gfp flags Then remove the return 0 since it is now safe to call iommu_debug_map(). Lift into __iommu_unmap(): - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - iommu_debug_unmap_begin() This now pairs with the unconditional iommu_debug_map() on the mapping side. Thus iommu debugging now works for iommupt along with some of the other debugging features. Fixes: 99fb8afa16ad ("iommupt: Directly call iommupt's unmap_range()") Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 05f78cfd1f1d..c21d1e3c4876 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2623,19 +2623,9 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, size_t orig_size = size; int ret = 0; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return -EINVAL; - - if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->map_pages)) return -ENODEV; - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; - /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2697,6 +2687,15 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); int ret; + might_sleep_if(gfpflags_allow_blocking(gfp)); + + /* Discourage passing strange GFP flags or illegal domains */ + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap || + (gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | + __GFP_HIGHMEM)))) + return -EINVAL; + if (pt) { size_t mapped = 0; @@ -2706,11 +2705,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, iommu_unmap(domain, iova, mapped); return ret; } - return 0; + } else { + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, + gfp); + if (ret) + return ret; } - ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (ret) - return ret; trace_map(iova, paddr, size); iommu_debug_map(domain, paddr, size); @@ -2742,10 +2742,7 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, size_t unmapped_page, unmapped = 0; unsigned int min_pagesz; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return 0; - - if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->unmap_pages)) return 0; /* find out the minimum page size supported */ @@ -2764,8 +2761,6 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); - iommu_debug_unmap_begin(domain, iova, size); - /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. @@ -2801,6 +2796,12 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); size_t unmapped; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap)) + return 0; + + iommu_debug_unmap_begin(domain, iova, size); + if (pt) unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); else From 0735c54804c709d1b292f3b6947cfb560b2ce552 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:15 -0300 Subject: [PATCH 350/377] iommu: Handle unmap error when iommu_debug is enabled Sashiko noticed a latent bug where the map error flow called iommu_unmap() which calls iommu_debug_unmap_begin()/iommu_debug_unmap_end() however since this is an error path the map flow never actually established the original iommu_debug_map() it will malfunction. Lift the unmap error handling into iommu_map_nosync() and reorder it so the trace_map()/iommu_debug_map() records the partial mapping and then immediately unmaps it. This avoid creating the unbalanced tracking and provides saner tracing instead of a unmap unmatched to any map. Fixes: ccc21213f013 ("iommu: Add calls for IOMMU_DEBUG_PAGEALLOC") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c21d1e3c4876..d1a9e713d3a0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2615,12 +2615,11 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, - size_t size, int prot, gfp_t gfp) + size_t size, int prot, gfp_t gfp, + size_t *mapped) { const struct iommu_domain_ops *ops = domain->ops; - unsigned long orig_iova = iova; unsigned int min_pagesz; - size_t orig_size = size; int ret = 0; if (WARN_ON(!ops->map_pages)) @@ -2643,31 +2642,25 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize, count, mapped = 0; + size_t pgsize, count, op_mapped = 0; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, &mapped); + gfp, &op_mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. */ - size -= mapped; - + *mapped += op_mapped; if (ret) - break; + return ret; - iova += mapped; - paddr += mapped; - } - - /* unroll mapping in case something went wrong */ - if (ret) { - iommu_unmap(domain, orig_iova, orig_size - size); - return ret; + size -= op_mapped; + iova += op_mapped; + paddr += op_mapped; } return 0; } @@ -2685,6 +2678,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { struct pt_iommu *pt = iommupt_from_domain(domain); + size_t mapped = 0; int ret; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2696,24 +2690,19 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, __GFP_HIGHMEM)))) return -EINVAL; - if (pt) { - size_t mapped = 0; - + if (pt) ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, &mapped); - if (ret) { - iommu_unmap(domain, iova, mapped); - return ret; - } - } else { + else ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, - gfp); - if (ret) - return ret; - } + gfp, &mapped); - trace_map(iova, paddr, size); - iommu_debug_map(domain, paddr, size); + trace_map(iova, paddr, mapped); + iommu_debug_map(domain, paddr, mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } return 0; } From 8ef3f77c440005c7f04229a75976bfc078364247 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:16 -0300 Subject: [PATCH 351/377] iommupt: Check for missing PAGE_SIZE in the pgsize_bitmap Sashiko pointed out that the driver could drop PAGE_SIZE from the pgsize_bitmap. That is technically allowed but nothing does it, and such an iommu_domain would not be used with the DMA API today. Still, it is against the design and it is trivial to fix up. Lift the PT_WARN_ON to the if branch and just skip the fast path. Fixes: dcd6a011a8d5 ("iommupt: Add map_pages op") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 19b6daf88f2a..4877b05291c9 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -920,8 +920,8 @@ static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && len == PAGE_SIZE) { - PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (pt_has_system_page_size(common) && len == PAGE_SIZE && + likely(pgsize_bitmap & PAGE_SIZE)) { if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; From 58829512ad461af8f35941069c209941e3a97b65 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:17 -0300 Subject: [PATCH 352/377] iommupt: Fix the end_index calculation in __map_range_leaf() Sashiko noticed a mismatch of units in this math: num_leaves is actually the number of leaf *entries* (so a 16-item contiguous leaf is one num_leaves), while index is in items. The mismatch in maths causes __map_range_leaf() to exit early instead of efficiently filling a larger range of contiguous PTEs. The early exit is caught by the functions above and then __map_range_leaf() is re-invoked, so there is no functional issue. Correct the misuse of units by adjusting num_leaves with the leaf size and avoid the performance cost of looping externally. There are also some mismatched types for num_leaves; simplify things to remove the duplicated calculations. Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Samiullah Khawaja Reviewd-by: Pranjal Shrivastava Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 4877b05291c9..dc91fb4e2f61 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -534,10 +534,12 @@ static int __map_range_leaf(struct pt_range *range, void *arg, struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int leaves_avail; unsigned int start_index; pt_oaddr_t oa = map->oa; - unsigned int num_leaves; + pt_vaddr_t num_leaves; unsigned int orig_end; + unsigned int step_lg2; pt_vaddr_t last_va; unsigned int step; bool need_contig; @@ -546,21 +548,25 @@ static int __map_range_leaf(struct pt_range *range, void *arg, PT_WARN_ON(map->leaf_level != level); PT_WARN_ON(!pt_can_have_leaf(&pts)); - step = log2_to_int_t(unsigned int, - leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); - need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + step_lg2 = leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts); + step = log2_to_int_t(unsigned int, step_lg2); + need_contig = step_lg2 != 0; _pt_iter_first(&pts); start_index = pts.index; orig_end = pts.end_index; - if (pts.index + map->num_leaves < pts.end_index) { + leaves_avail = + log2_div_t(unsigned int, pts.end_index - pts.index, step_lg2); + if (map->num_leaves <= leaves_avail) { /* Need to stop in the middle of the table to change sizes */ - pts.end_index = pts.index + map->num_leaves; + pts.end_index = pts.index + log2_mul(map->num_leaves, step_lg2); num_leaves = 0; } else { - num_leaves = map->num_leaves - (pts.end_index - pts.index); + num_leaves = map->num_leaves - leaves_avail; } + PT_WARN_ON( + log2_mod_t(unsigned int, pts.end_index - pts.index, step_lg2)); do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { From 1bb54043ff309795c90ccadd8a6e6b13ac40ec4e Mon Sep 17 00:00:00 2001 From: Tomasz Jeznach Date: Tue, 12 May 2026 10:37:44 -0700 Subject: [PATCH 353/377] MAINTAINERS: update Tomasz Jeznach's email address Switch from the previous work address to a linux.dev account, as the work address is no longer actively monitored. Signed-off-by: Tomasz Jeznach Signed-off-by: Joerg Roedel --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index eec4a740f7ca..bd6a72f29d9c 100644 --- a/.mailmap +++ b/.mailmap @@ -857,6 +857,7 @@ Tobias Klauser Tobias Klauser Tobias Klauser Todor Tomov +Tomasz Jeznach Tony Luck Trilok Soni TripleX Chung diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a386..55c6ab1a974a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22944,7 +22944,7 @@ N: riscv K: riscv RISC-V IOMMU -M: Tomasz Jeznach +M: Tomasz Jeznach L: iommu@lists.linux.dev L: linux-riscv@lists.infradead.org S: Maintained From c0e4fffc0f474b7ed10adee4ab2bc1a66d36fc72 Mon Sep 17 00:00:00 2001 From: Robertus Diawan Chris Date: Fri, 8 May 2026 10:39:14 +0700 Subject: [PATCH 354/377] ALSA: scarlett2: Add missing error check when initialise Autogain Status When initialise new control with scarlett2_add_new_ctl() function for Autogain Status, scarlett2_add_new_ctl() might throw an error. So, add error check after initialise new control for Autogain Status. This is reported by Coverity Scan with CID 1598781 as UNUSED_VALUE. Fixes: 0a995e38dc44 ("ALSA: scarlett2: Add support for software-controllable input gain") Signed-off-by: Robertus Diawan Chris Link: https://patch.msgid.link/20260508033914.111596-1-robertusdchris@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 8eaa96222759..0f83f8981213 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -6707,6 +6707,8 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) err = scarlett2_add_new_ctl( mixer, &scarlett2_autogain_status_ctl, i, 1, s, &private->autogain_status_ctls[i]); + if (err < 0) + return err; } /* Add autogain target controls */ From 2149c011510cbdcf183a13b26756e4a02071f0f2 Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Fri, 8 May 2026 12:49:34 +0000 Subject: [PATCH 355/377] ALSA: usb-audio: Add iface reset and delay quirk for TTGK Technology USB-C Audio Setting up the interface when suspended/resumeing fail on this card. Adding a reset and delay quirk will eliminate this problem. usb 1-1: new full-speed USB device number 2 using xhci-hcd usb 1-1: New USB device found, idVendor=3302, idProduct=17c2 usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 usb 1-1: Product: USB-C Audio usb 1-1: Manufacturer: TTGK Technology usb 1-1: SerialNumber: 170120210706 Signed-off-by: Lianqin Hu Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/TYUPR06MB621720E4E8F99A42E162FD51D23D2@TYUPR06MB6217.apcprd06.prod.outlook.com --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 17983d9774f8..31cbe383ae65 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2479,6 +2479,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x3255, 0x0000, /* Luxman D-10X */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), + DEVICE_FLG(0x3302, 0x17c2, /* TTGK Technology USB-C Audio */ + QUIRK_FLAG_FORCE_IFACE_RESET | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x339b, 0x3a07, /* Synaptics HONOR USB-C HEADSET */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x3443, 0x930d, /* NexiGo N930W 60fps Webcam */ From dd074f04e04648d89d9d10ae9846cd057c97b385 Mon Sep 17 00:00:00 2001 From: Nicholas Bonello Date: Fri, 8 May 2026 18:55:07 -0400 Subject: [PATCH 356/377] ALSA: hda/realtek: Fix Legion 7 16ITHG6 speaker amp binding The Lenovo Legion 7 16ITHG6 uses codec SSID 17aa:3855, but its PCI SSID is 17aa:3811. The latter is now also used by the Legion S7 15IMH05 quirk, which is matched before codec SSID fallback and incorrectly routes Legion 7 16ITHG6 machines to ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS. That fixup does not bind the CLSA0101 CS35L41 companion amplifiers, making the built-in speakers silent even though playback appears to be active. Add a codec SSID quirk for 17aa:3855 before the conflicting PCI SSID quirk so that the Legion 7 16ITHG6 uses ALC287_FIXUP_LEGION_16ITHG6. This restores CS35L41 firmware loading and binds both speaker amplifiers. Fixes: 67f4c61a73e9 ("ALSA: hda/realtek: Add quirk for Legion S7 15IMH") Cc: stable@vger.kernel.org Tested-by: Nicholas Bonello Assisted-by: Codex:GPT-5 Signed-off-by: Nicholas Bonello Link: https://patch.msgid.link/20260508225507.47667-1-hadobedo@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 55bb98e2e55a..4e2c8401c404 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7675,11 +7675,12 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3801, "Lenovo Yoga9 14IAP7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), HDA_CODEC_QUIRK(0x17aa, 0x3802, "DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga Pro 9 14IRP8", ALC287_FIXUP_TAS2781_I2C), - /* Yoga Pro 9 16IMH9 shares PCI SSID 17aa:3811 with Legion S7 15IMH05; - * use codec SSID to distinguish them + /* Yoga Pro 9 16IMH9 and Legion 7 16ITHG6 share PCI SSID 17aa:3811 + * with Legion S7 15IMH05; use codec SSID to distinguish them */ HDA_CODEC_QUIRK(0x17aa, 0x38d5, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), HDA_CODEC_QUIRK(0x17aa, 0x38d6, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), + HDA_CODEC_QUIRK(0x17aa, 0x3855, "Legion 7 16ITHG6", ALC287_FIXUP_LEGION_16ITHG6), SND_PCI_QUIRK(0x17aa, 0x3811, "Legion S7 15IMH05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), From 0a9c56dd387605d17dabeedd9fdd2c4c1d0bab7b Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Wed, 13 May 2026 15:57:00 +0900 Subject: [PATCH 357/377] drm/loongson: Use managed KMS polling lsdc_pci_probe() initializes KMS polling before setting up vblank support, requesting the IRQ and registering the DRM device. If any of those later steps fails, probe returns without finalizing polling. The driver also never finalizes polling on regular removal. Use drmm_kms_helper_poll_init() so polling is tied to the DRM device lifetime and automatically finalized on probe failure and device removal. This issue was identified during our ongoing static-analysis research while reviewing kernel code. Fixes: f39db26c5428 ("drm: Add kms driver for loongson display controller") Cc: stable@vger.kernel.org Co-developed-by: Ijae Kim Signed-off-by: Ijae Kim Reviewed-by: Thomas Zimmermann Acked-by: Jianmin Lv Reviewed-by: Huacai Chen Signed-off-by: Myeonghun Pak Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20260513065706.23803-1-mhun512@gmail.com --- drivers/gpu/drm/loongson/lsdc_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/loongson/lsdc_drv.c b/drivers/gpu/drm/loongson/lsdc_drv.c index 1ece1ea42f78..34405073c4d4 100644 --- a/drivers/gpu/drm/loongson/lsdc_drv.c +++ b/drivers/gpu/drm/loongson/lsdc_drv.c @@ -293,7 +293,7 @@ static int lsdc_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) vga_client_register(pdev, lsdc_vga_set_decode); - drm_kms_helper_poll_init(ddev); + drmm_kms_helper_poll_init(ddev); if (loongson_vblank) { ret = drm_vblank_init(ddev, descp->num_of_crtc); From 814b2c9b30e56074e11fc0a6e5419b3fee0639bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 11 May 2026 01:36:37 -0300 Subject: [PATCH 358/377] ALSA: usb-audio: qcom: Check offload mapping failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uaudio_transfer_buffer_setup() calls dma_get_sgtable() and then passes the sg_table to uaudio_iommu_map_xfer_buf() without checking whether sg table construction succeeded. If dma_get_sgtable() fails, the sg_table contents are not valid. uaudio_iommu_map_pa() also ignores iommu_map() failures for the event and transfer rings and still returns the allocated IOVA to the QMI response. That can expose an unmapped IOVA to the audio DSP. For transfer rings, the failed mapping also leaves the IOVA allocator state marked in use. Check both operations. Free the coherent transfer buffer when sg table construction fails, free the sg table when transfer-buffer IOMMU mapping fails, and release the transfer-ring IOVA if iommu_map() fails. Also return the existing event-ring IOVA when the event ring is already mapped, matching the pre-split helper behavior. Fixes: 326bbc348298 ("ALSA: usb-audio: qcom: Introduce QC USB SND offloading support") Fixes: 44499ecb4f28 ("ALSA: usb: qcom: Fix false-positive address space check") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260511-alsa-usb-qcom-offload-map-errors-v1-1-6502695e58bc@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index 5f993b88448c..a0009503b2c5 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -565,6 +565,7 @@ static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, unsigned long iova = 0; bool map = true; int prot = uaudio_iommu_map_prot(dma_coherent); + int ret; switch (mtype) { case MEM_EVENT_RING: @@ -582,10 +583,24 @@ static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype); } - if (!iova || !map) + if (!iova) return 0; - iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); + if (!map) + return iova; + + ret = iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, + GFP_KERNEL); + if (ret) { + dev_err(uaudio_qdev->data->dev, + "failed to map %zu bytes at iova 0x%08lx: %d\n", + size, iova, ret); + if (mtype == MEM_XFER_RING) + uaudio_put_iova(iova, size, + &uaudio_qdev->xfer_ring_list, + &uaudio_qdev->xfer_ring_iova_size); + return 0; + } return iova; } @@ -1054,15 +1069,17 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, if (!xfer_buf) return -ENOMEM; - dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, - xfer_buf_dma, len); + ret = dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, + xfer_buf_dma, len); + if (ret) + goto free_xfer_buf; /* map the physical buffer into sysdev as well */ xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent, len, &xfer_buf_sgt); if (!xfer_buf_dma_sysdev) { ret = -ENOMEM; - goto unmap_sync; + goto free_sgt; } mem_info->dma = xfer_buf_dma; @@ -1073,7 +1090,9 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, return 0; -unmap_sync: +free_sgt: + sg_free_table(&xfer_buf_sgt); +free_xfer_buf: usb_free_coherent(subs->dev, len, xfer_buf, xfer_buf_dma); return ret; From 74e8409821ac8cda70bf23eb593f2c7f6e3b5a2f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 11 May 2026 11:41:48 +0100 Subject: [PATCH 359/377] ALSA: doc: cs35l56: Update path to HDA driver source The HDA drivers were moved to sound/hda/... so update a Documentation reference that still pointed to the old location. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260511104148.36382-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- Documentation/sound/codecs/cs35l56.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index d5363b08f515..b3f8c1c23851 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -40,7 +40,7 @@ There are two drivers in the kernel *For systems using SoundWire*: sound/soc/codecs/cs35l56.c and associated files -*For systems using HDA*: sound/pci/hda/cs35l56_hda.c +*For systems using HDA*: sound/hda/codecs/side-codecs/cs35l56_hda.c Firmware ======== From d02d2d51a50d1bbf44a50eda094aa2b10fecf023 Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Mon, 11 May 2026 15:15:58 -0300 Subject: [PATCH 360/377] ALSA: hda/realtek: Limit mic boost on Positivo DN50E The internal mic boost on the Positivo DN50E is too high. Fix this by applying the ALC269_FIXUP_LIMIT_INT_MIC_BOOST fixup to the machine to limit the gain. Signed-off-by: Edson Juliano Drosdeck Link: https://patch.msgid.link/20260511181558.670563-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 4e2c8401c404..3323793cdc14 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7830,6 +7830,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1e39, 0xca14, "MEDION NM14LNL", ALC233_FIXUP_MEDION_MTL_SPK), + SND_PCI_QUIRK(0x1e50, 0x7007, "Positivo DN50E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1ee7, 0x2078, "HONOR BRB-X M1010", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1ee7, 0x2081, "HONOR MRB-XXX M1020", ALC256_FIXUP_HONOR_MRB_XXX_M1020_AUDIO), SND_PCI_QUIRK(0x1f4c, 0xe001, "Minisforum V3 (SE)", ALC245_FIXUP_BASS_HP_DAC), From 67c73815220784074ff13ec07df955911caf1b73 Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Wed, 13 May 2026 23:55:13 +0800 Subject: [PATCH 361/377] ALSA: hda/realtek: fix mic boost on Framework PTL In addition to the mic jack fix, also need to avoid boosting the internal mic too much, otherwise >50% input volume clips a lot. Also add a second SSID. We have one for the classic chassis/speaker and one for the new Pro chassis/speaker. To: Jaroslav Kysela To: Takashi Iwai To: linux-sound@vger.kernel.org Cc: Dustin L. Howett Cc: linux@frame.work Signed-off-by: Daniel Schaefer Link: https://patch.msgid.link/20260513155513.11683-1-dhs@frame.work Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 3323793cdc14..ef1eccda70df 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -4095,6 +4095,7 @@ enum { ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED, ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED, ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, + ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST, ALC287_FIXUP_LEGION_16ITHG6, ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN, @@ -6346,6 +6347,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_limit_int_mic_boost, + .chained = true, + .chain_id = ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, + }, [ALC287_FIXUP_LEGION_16ITHG6] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_legion_16ithg6_speakers, @@ -7856,7 +7863,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x000b, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0xf111, 0x000f, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000f, "Framework Laptop 13 Pro PTL", ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0xf111, 0x010f, "Framework Laptop 13 PTL", ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST), #if 0 /* Below is a quirk table taken from the old code. From 2891bb13ef158281736b6314b1ceaef6d08d57f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 May 2026 18:27:58 +0200 Subject: [PATCH 362/377] ALSA: hda/cs35l56: Drop malformed default N from Kconfig First of all, it has to be 'default n' (small letter n), otherwise it looks for CONFIG_N which is absent and in case of appearance will enable something unrelated. Second and most important is that 'n' *is* the default 'default' already. Hence just drop malformed line. Signed-off-by: Andy Shevchenko Reviewed-by: Richard Fitzgerald Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260513162758.365972-1-andriy.shevchenko@linux.intel.com --- sound/hda/codecs/side-codecs/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/Kconfig b/sound/hda/codecs/side-codecs/Kconfig index fc5651e555e3..e51964c0a091 100644 --- a/sound/hda/codecs/side-codecs/Kconfig +++ b/sound/hda/codecs/side-codecs/Kconfig @@ -94,7 +94,6 @@ menu "CS35L56 driver options" config SND_HDA_SCODEC_CS35L56_CAL_DEBUGFS bool "CS35L56 create debugfs for factory calibration" - default N depends on DEBUG_FS select SND_SOC_CS35L56_CAL_DEBUGFS_COMMON help From fd87b510f5f543125ecf51e7c706a9f4bc3352be Mon Sep 17 00:00:00 2001 From: Markus Kramer Date: Thu, 14 May 2026 00:28:18 +0200 Subject: [PATCH 363/377] ALSA: hda/realtek: Add quirk for Samsung Galaxy Book5 360 headphone The Samsung Galaxy Book5 360 (NP750QHA, PCI subsystem ID 0x144d:0xc902) has severe audio distortion on the 3.5mm headphone jack. Applying ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET corrects the output path configuration, consistent with fixes already applied to other Samsung Galaxy Book models using the same ALC256 codec. Cc: stable@vger.kernel.org Link: https://github.com/thesofproject/linux/issues/5648 Signed-off-by: Markus Kramer Link: https://patch.msgid.link/20260513222818.14351-1-linux@markus-kramer.de Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index ef1eccda70df..0c501e9fdc27 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7511,6 +7511,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc870, "Samsung Galaxy Book2 Pro (NP950XED)", ALC298_FIXUP_SAMSUNG_AMP_V2_2_AMPS), SND_PCI_QUIRK(0x144d, 0xc872, "Samsung Galaxy Book2 Pro (NP950XEE)", ALC298_FIXUP_SAMSUNG_AMP_V2_2_AMPS), SND_PCI_QUIRK(0x144d, 0xc886, "Samsung Galaxy Book3 Pro (NP964XFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), + SND_PCI_QUIRK(0x144d, 0xc902, "Samsung Galaxy Book5 360 (NP750QHA)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), SND_PCI_QUIRK(0x144d, 0xc1ca, "Samsung Galaxy Book3 Pro 360 (NP960QFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), SND_PCI_QUIRK(0x144d, 0xc1cb, "Samsung Galaxy Book3 Pro 360 (NP965QFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), SND_PCI_QUIRK(0x144d, 0xc1cc, "Samsung Galaxy Book3 Ultra (NT960XFH)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), From 9921941929ab014038c357b30567d93d20393a94 Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Thu, 14 May 2026 21:22:45 +0800 Subject: [PATCH 364/377] ALSA: hda: Fix NULL pointer dereference in snd_hda_ctl_add() snd_hda_ctl_add() dereferences kctl->id.subdevice without checking whether kctl is NULL. Multiple callers in sound/hda/codecs/ca0132.c pass the return value of snd_ctl_new1() directly to snd_hda_ctl_add() without a NULL check: return snd_hda_ctl_add(codec, nid, snd_ctl_new1(&knew, codec)); snd_ctl_new1() returns NULL when the underlying snd_ctl_new() fails on memory allocation (kzalloc_flex),which can occur under memory pressure or via fault injection. Add a NULL check at the entry of snd_hda_ctl_add(), matching the pattern already used by snd_ctl_add_replace() at the same call path (sound/core/control.c:515). Return -EINVAL to let callers handle the error gracefully. Fixes: 44f0c9782cc6 ("ALSA: hda/ca0132: Add tuning controls") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Link: https://patch.msgid.link/20260514132245.3062884-1-2022090917019@std.uestc.edu.cn Signed-off-by: Takashi Iwai --- sound/hda/common/codec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/hda/common/codec.c b/sound/hda/common/codec.c index c2af2511a831..81f266b9b850 100644 --- a/sound/hda/common/codec.c +++ b/sound/hda/common/codec.c @@ -1699,6 +1699,9 @@ int snd_hda_ctl_add(struct hda_codec *codec, hda_nid_t nid, unsigned short flags = 0; struct hda_nid_item *item; + if (!kctl) + return -EINVAL; + if (kctl->id.subdevice & HDA_SUBDEV_AMP_FLAG) { flags |= HDA_NID_ITEM_AMP; if (nid == 0) From 83dca2530fb3ba63f47bad339d890bc30aa06ab5 Mon Sep 17 00:00:00 2001 From: Jackie Dong Date: Thu, 14 May 2026 23:39:40 +0800 Subject: [PATCH 365/377] ALSA: hda/realtek: ALC269 fixup for Lenovo Yoga Pro 7 15ASH111 audio Volume control for the speakers on the Lenovo Yoga Pro 7 15ASH11 laptop doesn't work. The DAC routing is the same as on the ThinkPad X1 Gen7 function, so reuse the alc285_fixup_thinkpad_x1_gen7 to get it working. Signed-off-by: Jackie Dong Link: https://patch.msgid.link/20260514153940.7320-1-xy-jackie@139.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 0c501e9fdc27..9946ae185f4a 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7762,6 +7762,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38df, "Y990 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38f9, "Thinkbook 16P Gen5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38fa, "Thinkbook 16P Gen5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38fc, "Lenovo Yoga Pro 7 15ASH11", ALC245_FIXUP_BASS_HP_DAC), SND_PCI_QUIRK(0x17aa, 0x38fd, "ThinkBook plus Gen5 Hybrid", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x390d, "Lenovo Yoga Pro 7 14ASP10", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), From 7d1051ad68df3d584b5f24bfa1fb19f3a24db278 Mon Sep 17 00:00:00 2001 From: Adrien Burnett Date: Thu, 14 May 2026 18:59:05 +0200 Subject: [PATCH 366/377] ALSA: hda/realtek: Add mute LED quirk for HP Pavilion Laptop 16-ag0xxx Add a SND_PCI_QUIRK entry for the HP Pavilion Laptop 16-ag0xxx (subsystem 0x103c:0x8cbc, Realtek ALC245). The ALC245_FIXUP_HP_X360_MUTE_LEDS fixup is already used by the neighbouring HP Pavilion Aero Laptop 13-bg0xxx (0x103c:0x8cbd); it chains the master-mute COEF handler with the GPIO mic-mute LED handler, which is what this machine needs. Tested on the affected hardware: both the mute and mic-mute key LEDs respond correctly to the keyboard hotkeys after this change. Cc: Signed-off-by: Adrien Burnett Link: https://patch.msgid.link/20260514165905.21175-1-an.arctic.pigeon@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 9946ae185f4a..2f5d13032fd7 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7200,6 +7200,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8caf, "HP Elite mt645 G8 Mobile Thin Client", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8cbc, "HP Pavilion Laptop 16-ag0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8cbd, "HP Pavilion Aero Laptop 13-bg0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8cdd, "HP Spectre", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x8cde, "HP OmniBook Ultra Flip Laptop 14t", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), From 6fd9f6e870ea285f05102e8e00e6a7f4495a9a02 Mon Sep 17 00:00:00 2001 From: Matt DeVillier Date: Thu, 7 May 2026 09:58:41 -0500 Subject: [PATCH 367/377] ALSA: hda/ca0132: Disable auto-detect on manual output select Commit 778031e1658d ("ALSA: hda/ca0132: Set HP/Speaker auto-detect default from headphone pin verb") enables HP/Speaker auto-detect by default when the headphone pin supports presence detect. With auto-detect enabled, ca0132_select_out() and ca0132_alt_select_out() choose the output from jack presence instead of the manual HP/Speaker selection. This means selecting speaker output while headphones are plugged in updates the control state, but audio still routes to the headphones. Treat an explicit manual output selection as a request to leave auto-detect mode. Clear the HP/Speaker auto-detect switch before applying the manual selection, and notify userspace so the auto-detect control state is updated in mixers. Do this for both the normal HP/Speaker Playback Switch and the alternate Output Select control used by desktop cards. This keeps auto-detect enabled by default for devices with jack presence detection, while preserving the expected behavior that a manual output choice takes effect immediately. Fixes: 778031e1658d ("ALSA: hda/ca0132: Set HP/Speaker auto-detect default from headphone pin verb") Signed-off-by: Matt DeVillier Link: https://lore.kernel.org/CAFTm+6AfeXKf=b2frG4xC5yC4jjM9TkD6c8+dOWWFw6BDjDESw@mail.gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/ca0132.c | 44 +++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/sound/hda/codecs/ca0132.c b/sound/hda/codecs/ca0132.c index ad533b04ab29..be565ffaade0 100644 --- a/sound/hda/codecs/ca0132.c +++ b/sound/hda/codecs/ca0132.c @@ -5498,6 +5498,30 @@ static int zxr_headphone_gain_set(struct hda_codec *codec, long val) return 0; } +/* + * Manual output selection (HP/Speaker Playback Switch or alt Output Select) + * is meaningful only when HP/Speaker auto-detect is disabled, since the + * select_out path always prefers jack presence when auto-detect is on. When + * the user explicitly chooses an output, turn auto-detect off so the manual + * choice actually takes effect, and notify userspace so the auto-detect + * control reflects the new state. + */ +static void ca0132_disable_hp_auto_detect(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + struct snd_kcontrol *kctl; + + if (!spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]) + return; + + spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID] = 0; + kctl = snd_hda_find_mixer_ctl(codec, + "HP/Speaker Auto Detect Playback Switch"); + if (kctl) + snd_ctl_notify(codec->card, SNDRV_CTL_EVENT_MASK_VALUE, + &kctl->id); +} + static int ca0132_vnode_switch_set(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -5510,14 +5534,11 @@ static int ca0132_vnode_switch_set(struct snd_kcontrol *kcontrol, int auto_jack; if (nid == VNID_HP_SEL) { - auto_jack = - spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]; - if (!auto_jack) { - if (ca0132_use_alt_functions(spec)) - ca0132_alt_select_out(codec); - else - ca0132_select_out(codec); - } + ca0132_disable_hp_auto_detect(codec); + if (ca0132_use_alt_functions(spec)) + ca0132_alt_select_out(codec); + else + ca0132_select_out(codec); return 1; } @@ -5978,7 +5999,6 @@ static int ca0132_alt_output_select_put(struct snd_kcontrol *kcontrol, struct ca0132_spec *spec = codec->spec; int sel = ucontrol->value.enumerated.item[0]; unsigned int items = NUM_OF_OUTPUTS; - unsigned int auto_jack; if (sel >= items) return 0; @@ -5988,10 +6008,8 @@ static int ca0132_alt_output_select_put(struct snd_kcontrol *kcontrol, spec->out_enum_val = sel; - auto_jack = spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]; - - if (!auto_jack) - ca0132_alt_select_out(codec); + ca0132_disable_hp_auto_detect(codec); + ca0132_alt_select_out(codec); return 1; } From 96350db80e0acd733e9b9ef61c0d910790b27289 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 May 2026 12:57:09 +0200 Subject: [PATCH 368/377] ring-buffer remote: Avoid unexpected symbol warnings (arm, s390) The now more verbose check found more architecture specific symbol missing from the whitelist, during randconfig testing on s390 and 32-bit arm: Unexpected symbols in kernel/trace/simple_ring_buffer.o: U __aeabi_unwind_cpp_pr1 Unexpected symbols in kernel/trace/simple_ring_buffer.o: U __s390_indirect_jump_r1 U __s390_indirect_jump_r10 U __s390_indirect_jump_r14 U __s390_indirect_jump_r2 U __s390_indirect_jump_r5 U __s390_indirect_jump_r7 U __s390_indirect_jump_r8 U __s390_indirect_jump_r9 make[6]: *** [/home/arnd/arm-soc/kernel/trace/Makefile:160: kernel/trace/simple_ring_buffer.o.checked] Error 1 Add these to the list and keep it roughly sorted into sanitizer and architecture symbols. Cc: Masami Hiramatsu Cc: Marc Zyngier Cc: Nathan Chancellor Cc: Vincent Donnefort Cc: Mathieu Desnoyers Cc: Paolo Bonzini Link: https://patch.msgid.link/20260515105717.1023007-1-arnd@kernel.org Fixes: 1211907ac0b5 ("tracing: Generate undef symbols allowlist for simple_ring_buffer") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1decdce8cbef..9b0834134cae 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o targets += undefsyms_base.o KASAN_SANITIZE_undefsyms_base.o := y -UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ - __msan simple_ring_buffer \ +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') quiet_cmd_check_undefined = NM $< From dc366607c41c45fd0ae6f3db090f31dd611b644a Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 13 May 2026 12:30:50 +0800 Subject: [PATCH 369/377] drm: Replace old pointer to new idr Commit 5e28b7b94408 introduced a logical error by failing to replace the newly generated IDR pointer to old id's pointer at the correct location within the "change handle" logic; this resulted in the issue reported by syzbot [1]. Specifically, the new IDR object pointer is intended to replace the original id's pointer during the normal execution flow. Additionally, an unnecessary conditional check for the ret exit path has been removed. [1] !RB_EMPTY_ROOT(&prime_fpriv->dmabufs) WARNING: drivers/gpu/drm/drm_prime.c:224 at drm_prime_destroy_file_private+0x48/0x60 drivers/gpu/drm/drm_prime.c:224, CPU#0: syz.0.17/5833 Call Trace: drm_file_free.part.0+0x7e6/0xcc0 drivers/gpu/drm/drm_file.c:269 drm_file_free drivers/gpu/drm/drm_file.c:237 [inline] drm_close_helper.isra.0+0x186/0x200 drivers/gpu/drm/drm_file.c:290 drm_release+0x1ab/0x360 drivers/gpu/drm/drm_file.c:438 Fixes: 5e28b7b94408 ("drm: Set old handle to NULL before prime swap in change_handle") Reported-by: syzbot+d7c9eed171647e421013@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d7c9eed171647e421013 Cc: stable@vger.kernel.org Tested-by: syzbot+d7c9eed171647e421013@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Dave Airlie Link: https://patch.msgid.link/tencent_C267296443AAA4567771176886DFF364A305@qq.com --- drivers/gpu/drm/drm_gem.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 51a887cc7fd7..8afab57fc055 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1067,17 +1067,12 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, spin_unlock(&file_priv->table_lock); - if (ret < 0) - goto out_unlock; - if (obj->dma_buf) { ret = drm_prime_add_buf_handle(&file_priv->prime, obj->dma_buf, handle); if (ret < 0) { spin_lock(&file_priv->table_lock); idr_remove(&file_priv->object_idr, handle); - idrobj = idr_replace(&file_priv->object_idr, obj, handle); - WARN_ON(idrobj != NULL); spin_unlock(&file_priv->table_lock); goto out_unlock; } @@ -1089,7 +1084,9 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, spin_lock(&file_priv->table_lock); idr_remove(&file_priv->object_idr, args->handle); + idrobj = idr_replace(&file_priv->object_idr, obj, handle); spin_unlock(&file_priv->table_lock); + WARN_ON(idrobj != NULL); out_unlock: mutex_unlock(&file_priv->prime.lock); From b09a45601094c7f4ec4db8090b825fa61e169d93 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 14 May 2026 14:31:49 -0700 Subject: [PATCH 370/377] hwmon: (lm90) Stop work before releasing hwmon device Sashiko reports: In lm90_probe(), the devm action to cancel the alert_work and report_work (lm90_restore_conf) is registered in lm90_init_client() before devm_hwmon_device_register_with_info() is called. Because devm executes cleanup actions in reverse order during module unbind or probe failure, the hwmon device is unregistered and freed first. If lm90_alert_work() or lm90_report_alarms() runs in the window between the hwmon device being freed and the delayed works being cancelled, lm90_update_alarms() will dereference the freed data->hwmon_dev here. Fix the problem by canceling the workers separately after registering the hwmon device and before registering the interrupt handler. This ensures that the workers are canceled after interrupts are disabled and before the hwmon device is released. Add "shutdown" flag to indicate that device shutdown is in progress to prevent workers from being re-armed. Fixes: f6d0775119fb9 ("hwmon: (lm90) Rework alarm/status handling") Reported-by: Sashiko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index 3c10a5066b53..c4a9dafff81d 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -736,6 +736,7 @@ struct lm90_data { struct hwmon_chip_info chip; struct delayed_work alert_work; struct work_struct report_work; + bool shutdown; /* true if shutting down */ bool valid; /* true if register values are valid */ bool alarms_valid; /* true if status register values are valid */ unsigned long last_updated; /* in jiffies */ @@ -1154,6 +1155,9 @@ static void lm90_report_alarms(struct work_struct *work) static int lm90_update_alarms_locked(struct lm90_data *data, bool force) { + if (data->shutdown) + return 0; + if (force || !data->alarms_valid || time_after(jiffies, data->alarms_updated + msecs_to_jiffies(data->update_interval))) { struct i2c_client *client = data->client; @@ -2584,15 +2588,23 @@ static void lm90_restore_conf(void *_data) struct lm90_data *data = _data; struct i2c_client *client = data->client; - cancel_delayed_work_sync(&data->alert_work); - cancel_work_sync(&data->report_work); - /* Restore initial configuration */ if (data->flags & LM90_HAVE_CONVRATE) lm90_write_convrate(data, data->convrate_orig); lm90_write_reg(client, LM90_REG_CONFIG1, data->config_orig); } +static void lm90_stop_work(void *_data) +{ + struct lm90_data *data = _data; + + hwmon_lock(data->hwmon_dev); + data->shutdown = true; + hwmon_unlock(data->hwmon_dev); + cancel_delayed_work_sync(&data->alert_work); + cancel_work_sync(&data->report_work); +} + static int lm90_init_client(struct i2c_client *client, struct lm90_data *data) { struct device_node *np = client->dev.of_node; @@ -2902,6 +2914,10 @@ static int lm90_probe(struct i2c_client *client) data->hwmon_dev = hwmon_dev; + err = devm_add_action_or_reset(&client->dev, lm90_stop_work, data); + if (err) + return err; + if (client->irq) { dev_dbg(dev, "IRQ: %d\n", client->irq); err = devm_request_threaded_irq(dev, client->irq, @@ -2930,7 +2946,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, */ struct lm90_data *data = i2c_get_clientdata(client); - if ((data->flags & LM90_HAVE_BROKEN_ALERT) && + if (!data->shutdown && (data->flags & LM90_HAVE_BROKEN_ALERT) && (data->current_alarms & data->alert_alarms)) { if (!(data->config & 0x80)) { dev_dbg(&client->dev, "Disabling ALERT#\n"); From 873e919e3101063a7a75989510ccfc125a4391cf Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 14 May 2026 14:41:00 -0700 Subject: [PATCH 371/377] hwmon: (lm90) Add lock protection to lm90_alert Sashiko reports: lm90_alert() executes in the smbus alert context and calls lm90_update_confreg() to disable the hardware alert line, without acquiring hwmon_lock. Concurrently, sysfs write operations (such as lm90_write_convrate) hold the hwmon_lock, temporarily modify data->config, and then restore it. If an alert interrupt occurs concurrently with a sysfs write, the sysfs path will overwrite the alert handler's modifications to data->config and the hardware register. This unintentionally re-enables the hardware alert line while the alarm is still active, causing an interrupt storm. Add the missing lock to lm90_alert() to solve the problem. Fixes: 7a1d220ccb0cc ("hwmon: (lm90) Introduce function to update configuration register") Reported-by: Sashiko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index c4a9dafff81d..1eeb608e5903 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -2946,6 +2946,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, */ struct lm90_data *data = i2c_get_clientdata(client); + hwmon_lock(data->hwmon_dev); if (!data->shutdown && (data->flags & LM90_HAVE_BROKEN_ALERT) && (data->current_alarms & data->alert_alarms)) { if (!(data->config & 0x80)) { @@ -2955,6 +2956,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, schedule_delayed_work(&data->alert_work, max_t(int, HZ, msecs_to_jiffies(data->update_interval))); } + hwmon_unlock(data->hwmon_dev); } else { dev_dbg(&client->dev, "Everything OK\n"); } From 55a0005518195fdea1fd2991b07644f8dc97ea8e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 15 May 2026 21:16:16 +0100 Subject: [PATCH 372/377] tracing: Fix desc in error path for the trace remote test module During initialisation in remote_test_load(), if one of the simple_ring_buffer fails to initialise, the error path attempts to rollback initialised buffers. However, the rollback incorrectly uses the global pointer to the trace descriptor, which is only set upon successful load completion. Fix the error path by using the local pointer to the descriptor. Link: https://patch.msgid.link/20260515201616.337469-1-vdonnefort@google.com Fixes: ea908a2b79c8 ("tracing: Add a trace remote module for testing") Reported-by: Sashiko Signed-off-by: Vincent Donnefort base-commit: 5d6919055dec134de3c40167a490f33c74c12581 Signed-off-by: Steven Rostedt --- kernel/trace/remote_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c index 6c1b7701ddae..a3e2c9b606eb 100644 --- a/kernel/trace/remote_test.c +++ b/kernel/trace/remote_test.c @@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus return remote_test_buffer_desc; err_unload: - for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + for_each_ring_buffer_desc(rb_desc, cpu, desc) remote_test_unload_simple_rb(rb_desc->cpu); - trace_remote_free_buffer(remote_test_buffer_desc); + trace_remote_free_buffer(desc); err_free_desc: kfree(desc); From 23e6a1ca04ae44806439a5a446e62e4d42e80bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Tue, 12 May 2026 12:00:41 +0200 Subject: [PATCH 373/377] virt: sev-guest: Do not use host-controlled page order in cleanup path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When issuing an extended guest request (SVM_VMGEXIT_EXT_GUEST_REQUEST), get_ext_report() allocates a buffer to retrieve a certificate blob from the host, keeping track of its size in report_req->certs_len. However, the host may return SNP_GUEST_VMM_ERR_INVALID_LEN, indicating an invalid buffer size, as well as the expected length of such buffer. get_ext_report() subsequently updates report_req->certs_len with the host-controlled value, and cleans up the buffer by computing a page order from such value. This is incorrect, as the host-provided length may not match the page order of the original allocation, potentially resulting in corruption in the page allocator. Fix this by using alloc_pages_exact() instead, and reusing @npages to compute the size passed to free_pages_exact(). For consistency, also use @npages to compute the size when allocating the pages, even though this last change has no functional effect. Fixes: 3e385c0d6ce8 ("virt: sev-guest: Move SNP Guest Request data pages handling under snp_cmd_mutex") Signed-off-by: Carlos López Signed-off-by: Borislav Petkov (AMD) Tested-by: Michael Roth Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- drivers/virt/coco/sev-guest/sev-guest.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index e001e6769a43..910a1de0d5a7 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -176,7 +176,6 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques struct snp_guest_req req = {}; int ret, npages = 0, resp_len; sockptr_t certs_address; - struct page *page; if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data)) return -EINVAL; @@ -211,16 +210,15 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques * zeros to indicate that certificate data was not provided. */ npages = report_req->certs_len >> PAGE_SHIFT; - page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, - get_order(report_req->certs_len)); - if (!page) + req.certs_data = alloc_pages_exact(npages << PAGE_SHIFT, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req.certs_data) return -ENOMEM; - req.certs_data = page_address(page); ret = set_memory_decrypted((unsigned long)req.certs_data, npages); if (ret) { pr_err("failed to mark page shared, ret=%d\n", ret); - __free_pages(page, get_order(report_req->certs_len)); + free_pages_exact(req.certs_data, npages << PAGE_SHIFT); return -EFAULT; } @@ -277,7 +275,7 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques if (set_memory_encrypted((unsigned long)req.certs_data, npages)) WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); else - __free_pages(page, get_order(report_req->certs_len)); + free_pages_exact(req.certs_data, npages << PAGE_SHIFT); } return ret; } From 5200f5f493f79f14bbdc349e402a40dfb32f23c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 May 2026 13:59:58 -0700 Subject: [PATCH 374/377] Linux 7.1-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b7b80e84e1eb..9f59598d3a08 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 7 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 67a52d3ebb5a0ae0c0e23ffa99470d9463179c9f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 21 May 2026 13:25:09 +0100 Subject: [PATCH 375/377] ASoC: cs-amp-lib: Fix wrong sizeof() in _cs_amp_set_efi_calibration_data() When calculating data->count replace the incorrect sizeof(data) with use of struct_offset(). The faulty sizeof(data) was incorrectly calculating the size of the pointer instead of the size of the struct pointed to. As it happens, both values are 8 on a 64-bit CPU. In the unlikely event of using this code on a 32-bit CPU the number of available bytes would be calculated 4 larger than is actually available. Instead of changing to sizeof(*data) it has been replaced by struct_offset() because it has better chance of detecting these sorts of typos. Also the offset of the data[] array is actually what we want to know here anyway. Signed-off-by: Richard Fitzgerald Fixes: 2b62e66626f0 ("ASoC: cs-amp-lib: Add function to write calibration to UEFI") Link: https://patch.msgid.link/20260521122511.987322-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index b34b1f5f121f..881c6f2264f3 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -500,7 +500,7 @@ static int _cs_amp_set_efi_calibration_data(struct device *dev, int amp_index, i * must be set. */ if (data->count == 0) - data->count = (data->size - sizeof(data)) / sizeof(data->data[0]); + data->count = (data->size - struct_offset(data, data)) / sizeof(data->data[0]); if (amp_index < 0) { /* Is there already a slot for this target? */ From ba28a07a9a0b53a538c809e04e517e1ce1f1bee3 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 21 May 2026 13:25:10 +0100 Subject: [PATCH 376/377] ASoC: cs-amp-lib: Fix missing dput() after debugfs_lookup() Rewrite cs_amp_create_debugfs() so that dput() will be called on a valid dentry returned from debugfs_lookup(). The pointer returned from debugfs_lookup() must be released by dput(). The pointer returned from debugfs_create_dir() does not need to be passed to dput(). Signed-off-by: Richard Fitzgerald Fixes: cdd27fa3298a ("ASoC: cs-amp-lib: Add helpers for factory calibration") Link: https://patch.msgid.link/20260521122511.987322-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 881c6f2264f3..e97a125ebbb3 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -833,11 +833,18 @@ EXPORT_SYMBOL_NS_GPL(cs_amp_devm_get_vendor_specific_variant_id, "SND_SOC_CS_AMP */ struct dentry *cs_amp_create_debugfs(struct device *dev) { - struct dentry *dir; + struct dentry *dir, *created; + /* debugfs_lookup() can return NULL or ERR_PTR on error */ dir = debugfs_lookup("cirrus_logic", NULL); - if (!dir) - dir = debugfs_create_dir("cirrus_logic", NULL); + if (!IS_ERR_OR_NULL(dir)) { + created = debugfs_create_dir(dev_name(dev), dir); + dput(dir); + + return created; + } + + dir = debugfs_create_dir("cirrus_logic", NULL); return debugfs_create_dir(dev_name(dev), dir); } From a685252686851633b795bc30facac8a39344ae09 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 21 May 2026 13:25:11 +0100 Subject: [PATCH 377/377] ASoC: cs-amp-lib: Fix typo in error message: write -> read Fix the error message in cs_amp_read_cal_coeff() to say "Failed to read". It was incorrectly "Failed to write", probably a copy-paste error. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260521122511.987322-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index e97a125ebbb3..fb5b950e584c 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -118,7 +118,7 @@ static int cs_amp_read_cal_coeff(struct cs_dsp *dsp, } if (ret < 0) { - dev_err(dsp->dev, "Failed to write to '%s': %d\n", ctl_name, ret); + dev_err(dsp->dev, "Failed to read '%s': %d\n", ctl_name, ret); return ret; }