From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: [PATCH 01/25] mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++ include/linux/mm.h | 56 ++++++++++++++++++++++++++++++------------- mm/memfd.c | 2 +- mm/mmap.c | 4 ++++ 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..fb397918c43d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowing moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writess should be disallowed + * going forward. + */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; + if (!is_write_sealed(seals)) + return 0; - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a1..35a370d75c9a 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -static unsigned int *memfd_file_seals_ptr(struct file *file) +unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; diff --git a/mm/mmap.c b/mm/mmap.c index d32b7e701058..16f8e8be01f8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) @@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + else if (is_readonly_sealed(seals, vm_flags)) + vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) From ea0916e01d0b0f2cce1369ac1494239a79827270 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:18 +0000 Subject: [PATCH 02/25] selftests/memfd: add test for mapping write-sealed memfd read-only Now we have reinstated the ability to map F_SEAL_WRITE mappings read-only, assert that we are able to do this in a test to ensure that we do not regress this again. Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Jann Horn Cc: Julian Orth Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 0a0b55516028..c0c53451a16d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd) return p; } +static void *mfd_assert_mmap_read_shared(int fd) +{ + void *p; + + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + static void *mfd_assert_mmap_private(int fd) { void *p; @@ -980,6 +998,30 @@ static void test_seal_future_write(void) close(fd); } +static void test_seal_write_map_read_shared(void) +{ + int fd; + void *p; + + printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_write_map_read", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + p = mfd_assert_mmap_read_shared(fd); + + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -1593,6 +1635,7 @@ int main(int argc, char **argv) test_seal_write(); test_seal_future_write(); + test_seal_write_map_read_shared(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); From 6aaced5abd32e2a57cd94fd64f824514d0361da8 Mon Sep 17 00:00:00 2001 From: Seiji Nishikawa Date: Sun, 1 Dec 2024 01:12:34 +0900 Subject: [PATCH 03/25] mm: vmscan: account for free pages to prevent infinite Loop in throttle_direct_reclaim() The task sometimes continues looping in throttle_direct_reclaim() because allow_direct_reclaim(pgdat) keeps returning false. #0 [ffff80002cb6f8d0] __switch_to at ffff8000080095ac #1 [ffff80002cb6f900] __schedule at ffff800008abbd1c #2 [ffff80002cb6f990] schedule at ffff800008abc50c #3 [ffff80002cb6f9b0] throttle_direct_reclaim at ffff800008273550 #4 [ffff80002cb6fa20] try_to_free_pages at ffff800008277b68 #5 [ffff80002cb6fae0] __alloc_pages_nodemask at ffff8000082c4660 #6 [ffff80002cb6fc50] alloc_pages_vma at ffff8000082e4a98 #7 [ffff80002cb6fca0] do_anonymous_page at ffff80000829f5a8 #8 [ffff80002cb6fce0] __handle_mm_fault at ffff8000082a5974 #9 [ffff80002cb6fd90] handle_mm_fault at ffff8000082a5bd4 At this point, the pgdat contains the following two zones: NODE: 4 ZONE: 0 ADDR: ffff00817fffe540 NAME: "DMA32" SIZE: 20480 MIN/LOW/HIGH: 11/28/45 VM_STAT: NR_FREE_PAGES: 359 NR_ZONE_INACTIVE_ANON: 18813 NR_ZONE_ACTIVE_ANON: 0 NR_ZONE_INACTIVE_FILE: 50 NR_ZONE_ACTIVE_FILE: 0 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 NODE: 4 ZONE: 1 ADDR: ffff00817fffec00 NAME: "Normal" SIZE: 8454144 PRESENT: 98304 MIN/LOW/HIGH: 68/166/264 VM_STAT: NR_FREE_PAGES: 146 NR_ZONE_INACTIVE_ANON: 94668 NR_ZONE_ACTIVE_ANON: 3 NR_ZONE_INACTIVE_FILE: 735 NR_ZONE_ACTIVE_FILE: 78 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 In allow_direct_reclaim(), while processing ZONE_DMA32, the sum of inactive/active file-backed pages calculated in zone_reclaimable_pages() based on the result of zone_page_state_snapshot() is zero. Additionally, since this system lacks swap, the calculation of inactive/ active anonymous pages is skipped. crash> p nr_swap_pages nr_swap_pages = $1937 = { counter = 0 } As a result, ZONE_DMA32 is deemed unreclaimable and skipped, moving on to the processing of the next zone, ZONE_NORMAL, despite ZONE_DMA32 having free pages significantly exceeding the high watermark. The problem is that the pgdat->kswapd_failures hasn't been incremented. crash> px ((struct pglist_data *) 0xffff00817fffe540)->kswapd_failures $1935 = 0x0 This is because the node deemed balanced. The node balancing logic in balance_pgdat() evaluates all zones collectively. If one or more zones (e.g., ZONE_DMA32) have enough free pages to meet their watermarks, the entire node is deemed balanced. This causes balance_pgdat() to exit early before incrementing the kswapd_failures, as it considers the overall memory state acceptable, even though some zones (like ZONE_NORMAL) remain under significant pressure. The patch ensures that zone_reclaimable_pages() includes free pages (NR_FREE_PAGES) in its calculation when no other reclaimable pages are available (e.g., file-backed or anonymous pages). This change prevents zones like ZONE_DMA32, which have sufficient free pages, from being mistakenly deemed unreclaimable. By doing so, the patch ensures proper node balancing, avoids masking pressure on other zones like ZONE_NORMAL, and prevents infinite loops in throttle_direct_reclaim() caused by allow_direct_reclaim(pgdat) repeatedly returning false. The kernel hangs due to a task stuck in throttle_direct_reclaim(), caused by a node being incorrectly deemed balanced despite pressure in certain zones, such as ZONE_NORMAL. This issue arises from zone_reclaimable_pages() returning 0 for zones without reclaimable file- backed or anonymous pages, causing zones like ZONE_DMA32 with sufficient free pages to be skipped. The lack of swap or reclaimable pages results in ZONE_DMA32 being ignored during reclaim, masking pressure in other zones. Consequently, pgdat->kswapd_failures remains 0 in balance_pgdat(), preventing fallback mechanisms in allow_direct_reclaim() from being triggered, leading to an infinite loop in throttle_direct_reclaim(). This patch modifies zone_reclaimable_pages() to account for free pages (NR_FREE_PAGES) when no other reclaimable pages exist. This ensures zones with sufficient free pages are not skipped, enabling proper balancing and reclaim behavior. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241130164346.436469-1-snishika@redhat.com Link: https://lkml.kernel.org/r/20241130161236.433747-2-snishika@redhat.com Fixes: 5a1c84b404a7 ("mm: remove reclaim and compaction retry approximations") Signed-off-by: Seiji Nishikawa Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e3..9a859b7d18d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - + /* + * If there are no reclaimable file-backed or anonymous pages, + * ensure zones with sufficient free pages are not skipped. + * This prevents zones like DMA32 from being ignored in reclaim + * scenarios where they can still help alleviate memory pressure. + */ + if (nr == 0) + nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); return nr; } From 34d7cf637c437d5c2a8a6ef23ea45193bad8a91c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Dec 2024 15:03:45 +0800 Subject: [PATCH 04/25] mm: don't try THP alignment for FS without get_unmapped_area Commit ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") changes thp_get_unmapped_area() to thp_get_unmapped_area_vmflags() in __get_unmapped_area(), which doesn't initialize local get_area for anonymous mappings. This leads to us always trying THP alignment even for file_operations which have a NULL ->get_unmapped_area() callback. Since commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") we only want to enable THP alignment for anonymous mappings, so add a !file check to avoid attempting THP alignment for file mappings. Found issue by code inspection. THP alignment is used for easy or more pmd mappings, from vma side. This may cause unnecessary VMA fragmentation and potentially worse performance on filesystems that do not actually support THPs and thus cannot benefit from the alignment. Link: https://lkml.kernel.org/r/20241206070345.2526501-1-wangkefeng.wang@huawei.com Fixes: ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") Signed-off-by: Kefeng Wang Reviewed-by: Vlastimil Babka Reviewed-by: Yang Shi Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Rick Edgecombe Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 16f8e8be01f8..aec208f90337 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -892,7 +892,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ From 158cdce87c8c172787063998ad5dd3e2f658b963 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 6 Dec 2024 16:30:25 +0800 Subject: [PATCH 05/25] mm/readahead: fix large folio support in async readahead When testing large folio support with XFS on our servers, we observed that only a few large folios are mapped when reading large files via mmap. After a thorough analysis, I identified it was caused by the `/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this parameter is set to 128KB. After I tune it to 2MB, the large folio can work as expected. However, I believe the large folio behavior should not be dependent on the value of read_ahead_kb. It would be more robust if the kernel can automatically adopt to it. With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a sequential read on a 1GB file using MADV_HUGEPAGE, the differences in /proc/meminfo are as follows: - before this patch FileHugePages: 18432 kB FilePmdMapped: 4096 kB - after this patch FileHugePages: 1067008 kB FilePmdMapped: 1048576 kB This shows that after applying the patch, the entire 1GB file is mapped to huge pages. The stable list is CCed, as without this patch, large folios don't function optimally in the readahead path. It's worth noting that if read_ahead_kb is set to a larger value that isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail to map to hugepages. Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") Signed-off-by: Yafang Shao Tested-by: kernel test robot Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/readahead.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/readahead.c b/mm/readahead.c index ea650b8b02fb..e151f4b13ca4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl, 1UL << order); if (index == expected) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max_pages); + /* + * In the case of MADV_HUGEPAGE, the actual size might exceed + * the readahead window. + */ + ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } From 1fd8bc7cd889bd73d07a83cb32d674ac68f99153 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Sat, 14 Dec 2024 17:30:05 +0800 Subject: [PATCH 06/25] maple_tree: reload mas before the second call for mas_empty_area Change the LONG_MAX in simple_offset_add to 1024, and do latter: [root@fedora ~]# mkdir /tmp/dir [root@fedora ~]# for i in {1..1024}; do touch /tmp/dir/$i; done touch: cannot touch '/tmp/dir/1024': Device or resource busy [root@fedora ~]# rm /tmp/dir/123 [root@fedora ~]# touch /tmp/dir/1024 [root@fedora ~]# rm /tmp/dir/100 [root@fedora ~]# touch /tmp/dir/1025 touch: cannot touch '/tmp/dir/1025': Device or resource busy After we delete file 100, actually this is a empty entry, but the latter create failed unexpected. mas_alloc_cyclic has two chance to find empty entry. First find the entry with range range_lo and range_hi, if no empty entry exist, and range_lo > min, retry find with range min and range_hi. However, the first call mas_empty_area may mark mas as EBUSY, and the second call for mas_empty_area will return false directly. Fix this by reload mas before second call for mas_empty_area. [Liam.Howlett@Oracle.com: fix mas_alloc_cyclic() second search] Link: https://lore.kernel.org/all/20241216060600.287B4C4CED0@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241216190113.1226145-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20241214093005.72284-1-yangerkun@huaweicloud.com Fixes: 9b6713cc7522 ("maple_tree: Add mtree_alloc_cyclic()") Signed-off-by: Yang Erkun Signed-off-by: Liam R. Howlett Cc: Christian Brauner Cc: Chuck Lever says: Cc: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0ae808f3a14..047397136f15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4354,6 +4354,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, ret = 1; } if (ret < 0 && range_lo > min) { + mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: [PATCH 07/25] mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectly through try_get_folio() by caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 16 +++++++--------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cec4b121193f..c498874a7170 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7238,10 +7238,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7250,18 +7246,20 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } From cddc76b165161a02ff14c4d84d0f5266d9d32b9e Mon Sep 17 00:00:00 2001 From: Alessandro Carminati Date: Tue, 17 Dec 2024 14:20:33 +0000 Subject: [PATCH 08/25] mm/kmemleak: fix sleeping function called from invalid context at print message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address a bug in the kernel that triggers a "sleeping function called from invalid context" warning when /sys/kernel/debug/kmemleak is printed under specific conditions: - CONFIG_PREEMPT_RT=y - Set SELinux as the LSM for the system - Set kptr_restrict to 1 - kmemleak buffer contains at least one item BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 136, name: cat preempt_count: 1, expected: 0 RCU nest depth: 2, expected: 2 6 locks held by cat/136: #0: ffff32e64bcbf950 (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0xb8/0xe30 #1: ffffafe6aaa9dea0 (scan_mutex){+.+.}-{3:3}, at: kmemleak_seq_start+0x34/0x128 #3: ffff32e6546b1cd0 (&object->lock){....}-{2:2}, at: kmemleak_seq_show+0x3c/0x1e0 #4: ffffafe6aa8d8560 (rcu_read_lock){....}-{1:2}, at: has_ns_capability_noaudit+0x8/0x1b0 #5: ffffafe6aabbc0f8 (notif_lock){+.+.}-{2:2}, at: avc_compute_av+0xc4/0x3d0 irq event stamp: 136660 hardirqs last enabled at (136659): [] _raw_spin_unlock_irqrestore+0xa8/0xd8 hardirqs last disabled at (136660): [] _raw_spin_lock_irqsave+0x8c/0xb0 softirqs last enabled at (0): [] copy_process+0x11d8/0x3df8 softirqs last disabled at (0): [<0000000000000000>] 0x0 Preemption disabled at: [] kmemleak_seq_show+0x3c/0x1e0 CPU: 1 UID: 0 PID: 136 Comm: cat Tainted: G E 6.11.0-rt7+ #34 Tainted: [E]=UNSIGNED_MODULE Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0xa0/0x128 show_stack+0x1c/0x30 dump_stack_lvl+0xe8/0x198 dump_stack+0x18/0x20 rt_spin_lock+0x8c/0x1a8 avc_perm_nonode+0xa0/0x150 cred_has_capability.isra.0+0x118/0x218 selinux_capable+0x50/0x80 security_capable+0x7c/0xd0 has_ns_capability_noaudit+0x94/0x1b0 has_capability_noaudit+0x20/0x30 restricted_pointer+0x21c/0x4b0 pointer+0x298/0x760 vsnprintf+0x330/0xf70 seq_printf+0x178/0x218 print_unreferenced+0x1a4/0x2d0 kmemleak_seq_show+0xd0/0x1e0 seq_read_iter+0x354/0xe30 seq_read+0x250/0x378 full_proxy_read+0xd8/0x148 vfs_read+0x190/0x918 ksys_read+0xf0/0x1e0 __arm64_sys_read+0x70/0xa8 invoke_syscall.constprop.0+0xd4/0x1d8 el0_svc+0x50/0x158 el0t_64_sync+0x17c/0x180 %pS and %pK, in the same back trace line, are redundant, and %pS can void %pK service in certain contexts. %pS alone already provides the necessary information, and if it cannot resolve the symbol, it falls back to printing the raw address voiding the original intent behind the %pK. Additionally, %pK requires a privilege check CAP_SYSLOG enforced through the LSM, which can trigger a "sleeping function called from invalid context" warning under RT_PREEMPT kernels when the check occurs in an atomic context. This issue may also affect other LSMs. This change avoids the unnecessary privilege check and resolves the sleeping function warning without any loss of information. Link: https://lkml.kernel.org/r/20241217142032.55793-1-acarmina@redhat.com Fixes: 3a6f33d86baa ("mm/kmemleak: use %pK to display kernel pointers in backtrace") Signed-off-by: Alessandro Carminati Acked-by: Sebastian Andrzej Siewior Acked-by: Catalin Marinas Cc: Clément Léger Cc: Alessandro Carminati Cc: Eric Chanudet Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Steven Rostedt Cc: Thomas Weißschuh Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2a945c07ae99..737af23f4f4e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq, for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; - warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " %pS\n", ptr); } } From 4d9b90df2eb49ab9becdbfd1fd60071bb107406e Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Tue, 17 Dec 2024 11:09:21 +0100 Subject: [PATCH 09/25] mailmap: modify the entry for Mathieu Othacehe Set my gnu address as the main one. Link: https://lkml.kernel.org/r/20241217100924.7821-1-othacehe@gnu.org Signed-off-by: Mathieu Othacehe Cc: Alex Elder Cc: David S. Miller Cc: Geliang Tang Cc: Kees Cook Cc: Matthieu Baerts (NGI0) Cc: Neeraj Upadhyay Cc: Quentin Monnet Signed-off-by: Andrew Morton --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 7efe43237ca8..f5f97f947020 100644 --- a/.mailmap +++ b/.mailmap @@ -435,7 +435,7 @@ Martin Kepplinger Martin Kepplinger Martin Kepplinger Martyna Szapar-Mudlaw -Mathieu Othacehe +Mathieu Othacehe Mat Martineau Mat Martineau Matthew Wilcox From 472098f23323c39cc6269d7b7bf76cba62830a4c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 17 Dec 2024 16:55:39 +0800 Subject: [PATCH 10/25] docs: mm: fix the incorrect 'FileHugeMapped' field The '/proc/PID/smaps' does not have the 'FileHugeMapped' field to count the file transparent huge pages, instead, the 'FilePmdMapped' field should be used. Fix it. Link: https://lkml.kernel.org/r/d520ce3aba2b03b088be30bece732426a939049a.1734425264.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5034915f4e8e..8872203df088 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -436,7 +436,7 @@ AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields +is necessary to read ``/proc/PID/smaps`` and count the FilePmdMapped fields for each mapping. Note that reading the smaps file is expensive and reading it From cb0ca08b326aa03f87fe94bb91872ce8d2ef1ed8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 08:18:10 +0100 Subject: [PATCH 11/25] kcov: mark in_softirq_really() as __always_inline If gcc decides not to inline in_softirq_really(), objtool warns about a function call with UACCESS enabled: kernel/kcov.o: warning: objtool: __sanitizer_cov_trace_pc+0x1e: call to in_softirq_really() with UACCESS enabled kernel/kcov.o: warning: objtool: check_kcov_mode+0x11: call to in_softirq_really() with UACCESS enabled Mark this as __always_inline to avoid the problem. Link: https://lkml.kernel.org/r/20241217071814.2261620-1-arnd@kernel.org Fixes: 7d4df2dad312 ("kcov: properly check for softirq context") Signed-off-by: Arnd Bergmann Reviewed-by: Marco Elver Cc: Aleksandr Nogikh Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 28a6be6e64fd..187ba1b80bda 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, * Unlike in_serving_softirq(), this function returns false when called during * a hardirq or an NMI that happened in the softirq context. */ -static inline bool in_softirq_really(void) +static __always_inline bool in_softirq_really(void) { return in_serving_softirq() && !in_hardirq() && !in_nmi(); } From 3754137d263f52f4b507cf9ae913f8f0497d1b0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 17 Dec 2024 20:50:00 +0100 Subject: [PATCH 12/25] fs/proc/task_mmu: fix pagemap flags with PMD THP entries on 32bit Entries (including flags) are u64, even on 32bit. So right now we are cutting of the flags on 32bit. This way, for example the cow selftest complains about: # ./cow ... Bail Out! read and ioctl return unmatched results for populated: 0 1 Link: https://lkml.kernel.org/r/20241217195000.1734039-1-david@redhat.com Fixes: 2c1f057e5be6 ("fs/proc/task_mmu: properly detect PM_MMAP_EXCLUSIVE per page of PMD-mapped THPs") Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38a5a3e9cba2..f02cd362309a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && From 5f3fd772d152229d94602bca243fbb658068a597 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Tue, 17 Dec 2024 21:39:25 -0500 Subject: [PATCH 13/25] ocfs2: fix slab-use-after-free due to dangling pointer dqi_priv When mounting ocfs2 and then remounting it as read-only, a slab-use-after-free occurs after the user uses a syscall to quota_getnextquota. Specifically, sb_dqinfo(sb, type)->dqi_priv is the dangling pointer. During the remounting process, the pointer dqi_priv is freed but is never set as null leaving it to be accessed. Additionally, the read-only option for remounting sets the DQUOT_SUSPENDED flag instead of setting the DQUOT_USAGE_ENABLED flags. Moreover, later in the process of getting the next quota, the function ocfs2_get_next_id is called and only checks the quota usage flags and not the quota suspended flags. To fix this, I set dqi_priv to null when it is freed after remounting with read-only and put a check for DQUOT_SUSPENDED in ocfs2_get_next_id. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241218023924.22821-2-dennis.lamerice@gmail.com Fixes: 8f9e8f5fcc05 ("ocfs2: Fix Q_GETNEXTQUOTA for filesystem without quotas") Signed-off-by: Dennis Lam Reported-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Tested-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6731d26f.050a0220.1fb99c.014b.GAE@google.com/T/ Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/quota_local.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98eb..3404e7a30c33 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b..2956d888c131 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } From eaebeb93922ca6ab0dd92027b73d0112701706ef Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 19 Dec 2024 21:24:37 +0000 Subject: [PATCH 14/25] mm: zswap: fix race between [de]compression and CPU hotunplug In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead(). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Commit 8ba2f844f050 ("mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx") increased the UAF surface area by making the per-CPU buffers dynamic, adding yet another resource that can be freed from under zswap compression/decompression by CPU hotunplug. There are a few ways to fix this: (a) Add a refcount for acomp_ctx. (b) Disable migration while using the per-CPU acomp_ctx. (c) Disable CPU hotunplug while using the per-CPU acomp_ctx by holding the CPUs read lock. Implement (c) since it's simpler than (a), and (b) involves using migrate_disable() which is apparently undesired (see huge comment in include/linux/preempt.h). Link: https://lkml.kernel.org/r/20241219212437.2714151-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/ Reviewed-by: Chengming Zhou Acked-by: Barry Song Reviewed-by: Nhat Pham Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f6316b66fb23..5a27af8d86ea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -880,6 +880,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } +/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ +static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) +{ + cpus_read_lock(); + return raw_cpu_ptr(acomp_ctx); +} + +static void acomp_ctx_put_cpu(void) +{ + cpus_read_unlock(); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -893,8 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - + acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -950,6 +961,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -960,7 +972,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); @@ -990,6 +1002,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_cpu(); } /********************************* From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: [PATCH 15/25] percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP From d0e6983a6d1719738cf8d13982a68094f0a1872a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:08 +0800 Subject: [PATCH 16/25] mm: shmem: fix incorrect index alignment for within_size policy With enabling the shmem per-size within_size policy, using an incorrect 'order' size to round_up() the index can lead to incorrect i_size checks, resulting in an inappropriate large orders being returned. Changing to use '1 << order' to round_up() the index to fix this issue. Additionally, adding an 'aligned_index' variable to avoid affecting the index checks. Link: https://lkml.kernel.org/r/77d8ef76a7d3d646e9225e9af88a76549a68aab1.1734593154.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f6fb053ac50d..dec659e84562 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1689,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; + pgoff_t aligned_index; bool global_huge; loff_t i_size; int order; @@ -1723,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, /* Allow mTHP that will be fully within i_size. */ order = highest_order(within_size_orders); while (within_size_orders) { - index = round_up(index + 1, order); + aligned_index = round_up(index + 1, 1 << order); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { + if (i_size >> PAGE_SHIFT >= aligned_index) { mask |= within_size_orders; break; } From d77b90d2b2642655b5f60953c36ad887257e1802 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:09 +0800 Subject: [PATCH 17/25] mm: shmem: fix the update of 'shmem_falloc->nr_unswapped' The 'shmem_falloc->nr_unswapped' is used to record how many writepage refused to swap out because fallocate() is allocating, but after shmem supports large folio swap out, the update of 'shmem_falloc->nr_unswapped' does not use the correct number of pages in the large folio, which may lead to fallocate() not exiting as soon as possible. Anyway, this is found through code inspection, and I am not sure whether it would actually cause serious issues. Link: https://lkml.kernel.org/r/f66a0119d0564c2c37c84f045835b870d1b2196f.1734593154.git.baolin.wang@linux.alibaba.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index dec659e84562..ac58d4fb2e6f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1535,7 +1535,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); From adcfb264c3ed51fbbf5068ddf10d309a63683868 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 21 Dec 2024 12:33:20 +0900 Subject: [PATCH 18/25] vmstat: disable vmstat_work on vmstat_cpu_down_prep() Even after mm/vmstat:online teardown, shepherd may still queue work for the dying cpu until the cpu is removed from online mask. While it's quite rare, this means that after unbind_workers() unbinds a per-cpu kworker, it potentially runs vmstat_update for the dying CPU on an irrelevant cpu before entering atomic AP states. When CONFIG_DEBUG_PREEMPT=y, it results in the following error with the backtrace. BUG: using smp_processor_id() in preemptible [00000000] code: \ kworker/7:3/1702 caller is refresh_cpu_vm_stats+0x235/0x5f0 CPU: 0 UID: 0 PID: 1702 Comm: kworker/7:3 Tainted: G Tainted: [N]=TEST Workqueue: mm_percpu_wq vmstat_update Call Trace: dump_stack_lvl+0x8d/0xb0 check_preemption_disabled+0xce/0xe0 refresh_cpu_vm_stats+0x235/0x5f0 vmstat_update+0x17/0xa0 process_one_work+0x869/0x1aa0 worker_thread+0x5e5/0x1100 kthread+0x29e/0x380 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 So, for mm/vmstat:online, disable vmstat_work reliably on teardown and symmetrically enable it on startup. Link: https://lkml.kernel.org/r/20241221033321.4154409-1-koichiro.den@canonical.com Signed-off-by: Koichiro Den Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Andrew Morton --- mm/vmstat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c..0889b75cef14 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2148,13 +2148,14 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } From 98a6abc6cec186bdc3d94c162227cc8e003de76c Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 23 Dec 2024 23:09:07 +0800 Subject: [PATCH 19/25] mm/list_lru: fix false warning of negative counter commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline") removed sanity checks for the nr_items counter's value because it implemented list_lru re-parenting in a way that will redirect children's list_lru to the parent before re-parenting the items in list_lru. This will make item counter uncharging happen in the parent while the item is still being held by the child. As a result, the parent's counter value may become negative. This is acceptable because re-parenting will sum up the children's counter values, and the parent's counter will be fixed. Later commit fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") reworked the re-parenting process, and removed the redirect. So it added the sanity check back, assuming that as long as items are still in the children's list_lru, parent's counter will not be uncharged. But that assumption is incorrect. The xas_store in memcg_reparent_list_lrus will set children's list_lru to NULL before re-parenting the items, it redirects list_lru helpers to use parent's list_lru just like before. But still, it's not a problem as re-parenting will fix the counter. Therefore, remove this sanity check, but add a new check to ensure that the counter won't go negative in a different way: the child's list_lru being re-parented should never have a negative counter, since re-parenting should occur in order and fixes counters. Link: https://lkml.kernel.org/r/20241223150907.1591-1-ryncsn@gmail.com Fixes: fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") Signed-off-by: Kairui Song Closes: https://lore.kernel.org/lkml/Z2Bz9t92Be9l1xqj@lappy/ Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/list_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index f93ada6a207b..7d69434c70e0 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,7 +77,6 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { - WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } @@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, list_splice_init(&src->list, &dst->list); if (src->nr_items) { + WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } From 8debfc5b1aa569d3d2ac836af2553da037611c61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:21 -0800 Subject: [PATCH 20/25] mm/damon/core: fix new damon_target objects leaks on damon_commit_targets() Patch series "mm/damon/core: fix memory leaks and ignored inputs from damon_commit_ctx()". Due to two bugs in damon_commit_targets() and damon_commit_schemes(), which are called from damon_commit_ctx(), some user inputs can be ignored, and some mmeory objects can be leaked. Fix those. Note that only DAMON sysfs interface users are affected. Other DAMON core API user modules that more focused more on simple and dedicated production usages, including DAMON_RECLAIM and DAMON_LRU_SORT are not using the buggy function in the way, so not affected. This patch (of 2): When new DAMON targets are added via damon_commit_targets(), the newly created targets are not deallocated when updating the internal data (damon_commit_target()) is failed. Worse yet, even if the setup is successfully done, the new target is not linked to the context. Hence, the new targets are always leaked regardless of the internal data setup failure. Fix the leaks. Link: https://lkml.kernel.org/r/20241222231222.85060-2-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8b8e2933dcd4..dc52361f1863 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -961,8 +961,11 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); - if (err) + if (err) { + damon_destroy_target(new_target); return err; + } + damon_add_target(dst, new_target); } return 0; } From 7d390b53067ef745e2d9bee5a9683df4c96b80a0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:22 -0800 Subject: [PATCH 21/25] mm/damon/core: fix ignored quota goals and filters of newly committed schemes damon_commit_schemes() ignores quota goals and filters of the newly committed schemes. This makes users confused about the behaviors. Correctly handle those inputs. Link: https://lkml.kernel.org/r/20241222231222.85060-3-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index dc52361f1863..0776452a1abb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) NUMA_NO_NODE); if (!new_scheme) return -ENOMEM; + err = damos_commit(new_scheme, src_scheme); + if (err) { + damon_destroy_scheme(new_scheme); + return err; + } damon_add_scheme(dst, new_scheme); } return 0; From 62e72d2cf702a5e2fb53d9c46ed900d9384e4a06 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 22 Dec 2024 20:29:36 +0800 Subject: [PATCH 22/25] mm, madvise: fix potential workingset node list_lru leaks Since commit 5abc1e37afa0 ("mm: list_lru: allocate list_lru_one only when needed"), all list_lru users need to allocate the items using the new infrastructure that provides list_lru info for slab allocation, ensuring that the corresponding memcg list_lru is allocated before use. For workingset shadow nodes (which are xa_node), users are converted to use the new infrastructure by commit 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node"). The xas->xa_lru will be set correctly for filemap users. However, there is a missing case: xa_node allocations caused by madvise(..., MADV_COLLAPSE). madvise(..., MADV_COLLAPSE) will also read in the absent parts of file map, and there will be xa_nodes allocated for the caller's memcg (assuming it's not rootcg). However, these allocations won't trigger memcg list_lru allocation because the proper xas info was not set. If nothing else has allocated other xa_nodes for that memcg to trigger list_lru creation, and memory pressure starts to evict file pages, workingset_update_node will try to add these xa_nodes to their corresponding memcg list_lru, and it does not exist (NULL). So they will be added to rootcg's list_lru instead. This shouldn't be a significant issue in practice, but it is indeed unexpected behavior, and these xa_nodes will not be reclaimed effectively. And may lead to incorrect counting of the list_lru->nr_items counter. This problem wasn't exposed until recent commit 28e98022b31ef ("mm/list_lru: simplify reparenting and initial allocation") added a sanity check: only dying memcg could have a NULL list_lru when list_lru_{add,del} is called. This problem triggered this WARNING. So make madvise(..., MADV_COLLAPSE) also call xas_set_lru() to pass the list_lru which we may want to insert xa_node into later. And move mapping_set_update to mm/internal.h, and turn into a macro to avoid including extra headers in mm/internal.h. Link: https://lkml.kernel.org/r/20241222122936.67501-1-ryncsn@gmail.com Fixes: 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node") Reported-by: syzbot+38a0cbd267eff2d286ff@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/675d01e9.050a0220.37aaf.00be.GAE@google.com/ Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/filemap.c | 9 --------- mm/internal.h | 6 ++++++ mm/khugepaged.c | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f61cf51c2238..33b60d448fca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,15 +124,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/internal.h b/mm/internal.h index 3bd08bafad04..9826f7dce607 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..653dbb1ff05c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); From dd2a5b5514ab0e690f018595e34dd1fcb981d345 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 21 Dec 2024 16:47:29 +0900 Subject: [PATCH 23/25] mm/util: make memdup_user_nul() similar to memdup_user() Since the string data to copy from userspace is likely less than PAGE_SIZE bytes, replace GFP_KERNEL with GFP_USER like commit 6c2c97a24f09 ("memdup_user(): switch to GFP_USER") does and add __GFP_NOWARN like commit 6c8fcc096be9 ("mm: don't let userspace spam allocations warnings") does. Also, use dedicated slab buckets like commit d73778e4b867 ("mm/util: Use dedicated slab buckets for memdup_user()") does. Link: https://lkml.kernel.org/r/014cd694-cc27-4a07-a34a-2ae95d744515@I-love.SAKURA.ne.jp Reported-by: syzbot+7e12e97b36154c54414b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7e12e97b36154c54414b Signed-off-by: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/util.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/util.c b/mm/util.c index c1c3b06ab4f9..60aa40f612b8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len) { char *p; - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. - */ - p = kmalloc_track_caller(len + 1, GFP_KERNEL); + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); From 0210d251162f4033350a94a43f95b1c39ec84a90 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 26 Dec 2024 22:03:32 +0800 Subject: [PATCH 24/25] scripts/sorttable: fix orc_sort_cmp() to maintain symmetry and transitivity The orc_sort_cmp() function, used with qsort(), previously violated the symmetry and transitivity rules required by the C standard. Specifically, when both entries are ORC_TYPE_UNDEFINED, it could result in both a < b and b < a, which breaks the required symmetry and transitivity. This can lead to undefined behavior and incorrect sorting results, potentially causing memory corruption in glibc implementations [1]. Symmetry: If x < y, then y > x. Transitivity: If x < y and y < z, then x < z. Fix the comparison logic to return 0 when both entries are ORC_TYPE_UNDEFINED, ensuring compliance with qsort() requirements. Link: https://www.qualys.com/2024/01/30/qsort.txt [1] Link: https://lkml.kernel.org/r/20241226140332.2670689-1-visitorckw@gmail.com Fixes: 57fa18994285 ("scripts/sorttable: Implement build-time ORC unwind table sorting") Fixes: fb799447ae29 ("x86,objtool: Split UNWIND_HINT_EMPTY in two") Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Shile Zhang Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- scripts/sorttable.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7bd0184380d3..a7c5445baf00 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -110,7 +110,7 @@ static inline unsigned long orc_ip(const int *ip) static int orc_sort_cmp(const void *_a, const void *_b) { - struct orc_entry *orc_a; + struct orc_entry *orc_a, *orc_b; const int *a = g_orc_ip_table + *(int *)_a; const int *b = g_orc_ip_table + *(int *)_b; unsigned long a_val = orc_ip(a); @@ -128,6 +128,9 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = g_orc_table + (a - g_orc_ip_table); + orc_b = g_orc_table + (b - g_orc_ip_table); + if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) + return 0; return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; } From e7404921818d676da4d7143ce78659456b05e2af Mon Sep 17 00:00:00 2001 From: "Chester A. Unal" Date: Wed, 25 Dec 2024 15:50:41 +0300 Subject: [PATCH 25/25] =?UTF-8?q?MAINTAINERS:=20change=20Ar=C4=B1n=C3=A7?= =?UTF-8?q?=20=5FNAL's=20name=20and=20email=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My legal name now includes Chester. Change the name and the email address sections to reflect that. Link: https://lkml.kernel.org/r/20241225-for-unknown-upstream-v1-1-3e35e4d5e161@arinc9.com Signed-off-by: Chester A. Unal Signed-off-by: Andrew Morton --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..22fa261cb60e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14756,7 +14756,7 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Arınç ÜNAL +M: Chester A. Unal M: Daniel Golle M: DENG Qingfang M: Sean Wang @@ -18460,7 +18460,7 @@ F: Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml F: drivers/pinctrl/mediatek/ PIN CONTROLLER - MEDIATEK MIPS -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) L: linux-mips@vger.kernel.org @@ -19504,7 +19504,7 @@ S: Maintained F: arch/mips/ralink RALINK MT7621 MIPS ARCHITECTURE -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mips@vger.kernel.org S: Maintained