diff --git a/CREDITS b/CREDITS index bee8609d889f..49dfefcb7575 100644 --- a/CREDITS +++ b/CREDITS @@ -1451,6 +1451,14 @@ N: Andy Gospodarek E: andy@greyhouse.net D: Maintenance and contributions to the network interface bonding driver. +N: Vivek Goyal +E: vgoyal@redhat.com +D: KDUMP, KEXEC, and VIRTIO FILE SYSTEM + +N: Alexander Graf +E: graf@amazon.com +D: Kexec Handover (KHO) + N: Wolfgang Grandegger E: wg@grandegger.com D: Controller Area Network (device drivers) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index a7dea7c75a9b..14cc6b2db897 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -79,6 +79,10 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_LRU_SORT will be disabled. +Once ``Y`` is written to this parameter, the user must not write to any +parameters until reading ``commit_inputs`` again returns ``N``. If users +violate this rule, the kernel may exhibit undefined behavior. + active_mem_bp ------------- diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 47854c461706..d7a0225b4950 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -71,6 +71,10 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this parameter is set as ``N``. If invalid parameters are found while the re-reading, DAMON_RECLAIM will be disabled. +Once ``Y`` is written to this parameter, the user must not write to any +parameters until reading ``commit_inputs`` again returns ``N``. If users +violate this rule, the kernel may exhibit undefined behavior. + min_age ------- diff --git a/Documentation/admin-guide/mm/kho.rst b/Documentation/admin-guide/mm/kho.rst index cb9a20f64920..2c26e560bd78 100644 --- a/Documentation/admin-guide/mm/kho.rst +++ b/Documentation/admin-guide/mm/kho.rst @@ -42,6 +42,45 @@ For example, if you used ``reserve_mem`` command line parameter to create an early memory reservation, the new kernel will have that memory at the same physical address as the old kernel. +Kexec Metadata +============== + +KHO automatically tracks metadata about the kexec chain, passing information +about the previous kernel to the next kernel. This feature helps diagnose +bugs that only reproduce when kexecing from specific kernel versions. + +On each KHO kexec, the kernel logs the previous kernel's version and the +number of kexec reboots since the last cold boot:: + + [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107 (count 1) + +The metadata includes: + +``previous_release`` + The kernel version string (from ``uname -r``) of the kernel that + initiated the kexec. + +``kexec_count`` + The number of kexec boots since the last cold boot. On cold boot, + this counter starts at 0 and increments with each kexec. This helps + identify issues that only manifest after multiple consecutive kexec + reboots. + +Use Cases +--------- + +This metadata is particularly useful for debugging kexec transition bugs, +where a buggy kernel kexecs into a new kernel and the bug manifests only +in the second kernel. 
Examples of such bugs include: + +- Memory corruption from the previous kernel affecting the new kernel +- Incorrect hardware state left by the previous kernel +- Firmware/ACPI state issues that only appear in kexec scenarios + +At scale, correlating crashes to the previous kernel version enables +faster root cause analysis when issues only occur in specific kernel +transition scenarios. + debugfs Interfaces ================== @@ -80,5 +119,5 @@ stabilized. it finished to interpret their metadata. ``/sys/kernel/debug/kho/in/sub_fdts/`` - Similar to ``kho/out/sub_fdts/``, but contains sub FDT blobs + Similar to ``kho/out/sub_fdts/``, but contains sub blobs of KHO producers passed from the old kernel. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 7a4e67a04290..db6167befb7b 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -565,6 +565,10 @@ does not take into account swapped out page of underlying shmem objects. naturally aligned THP pages of any currently enabled size. 1 if true, 0 otherwise. +If both the kernel and the CPU support protection keys (pkeys), +"ProtectionKey" indicates the memory protection key associated with the +virtual memory area. + "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded manner. The codes are the following: diff --git a/MAINTAINERS b/MAINTAINERS index 13f49378b157..c00df84252aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13859,8 +13859,10 @@ F: scripts/Makefile.kcsan KDUMP M: Andrew Morton M: Baoquan He -R: Vivek Goyal -R: Dave Young +M: Mike Rapoport +M: Pasha Tatashin +M: Pratyush Yadav +R: Dave Young L: kexec@lists.infradead.org S: Maintained W: http://lse.sourceforge.net/kdump/ @@ -14175,6 +14177,9 @@ F: include/linux/kernfs.h KEXEC M: Andrew Morton M: Baoquan He +M: Mike Rapoport +M: Pasha Tatashin +M: Pratyush Yadav L: kexec@lists.infradead.org W: http://kernel.org/pub/linux/utils/kernel/kexec/ F: include/linux/kexec.h @@ -14182,18 +14187,18 @@ F: include/uapi/linux/kexec.h F: kernel/kexec* KEXEC HANDOVER (KHO) -M: Alexander Graf M: Mike Rapoport M: Pasha Tatashin -R: Pratyush Yadav +M: Pratyush Yadav +R: Alexander Graf L: kexec@lists.infradead.org L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h F: include/linux/kho/ -F: include/linux/kho/abi/ F: kernel/liveupdate/kexec_handover* F: lib/test_kho.c F: tools/testing/selftests/kho/ @@ -14892,15 +14897,15 @@ F: tools/testing/selftests/livepatch/ LIVE UPDATE M: Pasha Tatashin M: Mike Rapoport -R: Pratyush Yadav +M: Pratyush Yadav L: linux-kernel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git F: Documentation/core-api/liveupdate.rst F: Documentation/mm/memfd_preservation.rst F: Documentation/userspace-api/liveupdate.rst F: include/linux/kho/abi/ F: include/linux/liveupdate.h -F: include/linux/liveupdate/ F: include/uapi/linux/liveupdate.h F: kernel/liveupdate/ F: lib/tests/liveupdate.c @@ -16859,8 +16864,12 @@ F: mm/migrate_device.c MEMORY MANAGEMENT - MGLRU (MULTI-GEN LRU) M: Andrew Morton -M: Axel Rasmussen -M: Yuanchu Xie +R: Kairui Song +R: Qi Zheng +R: Shakeel Butt +R: Barry Song +R: Axel Rasmussen +R: Yuanchu Xie R: Wei Xu L: linux-mm@kvack.org S: Maintained @@ -20115,7 +20124,9 @@ F: 
kernel/padata.c PAGE CACHE M: Matthew Wilcox (Oracle) +R: Jan Kara L: linux-fsdevel@vger.kernel.org +L: linux-mm@kvack.org S: Supported T: git git://git.infradead.org/users/willy/pagecache.git F: Documentation/filesystems/locking.rst diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c2afd1c34f4a..aebc710f0d6a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2546,6 +2546,8 @@ static ssize_t recompress_store(struct device *dev, mode = RECOMPRESS_HUGE; if (!strcmp(val, "huge_idle")) mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE; + if (!mode) + return -EINVAL; continue; } @@ -2678,7 +2680,7 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio) */ if (offset) { if (n <= (PAGE_SIZE - offset)) - return; + goto end_bio; n -= (PAGE_SIZE - offset); index++; @@ -2693,6 +2695,7 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio) n -= PAGE_SIZE; } +end_bio: bio_endio(bio); } diff --git a/fs/buffer.c b/fs/buffer.c index 4d7f84e77d2f..d6e062c42a8d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -822,8 +822,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, long offset; struct mem_cgroup *memcg, *old_memcg; - /* The folio lock pins the memcg */ - memcg = folio_memcg(folio); + memcg = get_mem_cgroup_from_folio(folio); old_memcg = set_active_memcg(memcg); head = NULL; @@ -844,6 +843,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, } out: set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3c75ee025bda..e1fbdf9ee769 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -280,15 +280,13 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (folio) { - memcg_css = mem_cgroup_css_from_folio(folio); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - } else { - /* must pin memcg_css, see wb_get_create() */ + /* must pin memcg_css, see wb_get_create() */ + if (folio) + memcg_css = get_mem_cgroup_css_from_folio(folio); + else memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - css_put(memcg_css); - } + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); } if (!wb) @@ -979,16 +977,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_folio(folio); + css = get_mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!css_is_online(css)) - return; + goto out; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; - return; + goto out; } if (id == wbc->wb_lcand_id) @@ -1001,6 +999,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); +out: + css_put(css); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bdc84e5219cd..4b53dc4a3266 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1238,8 +1238,6 @@ static __always_inline int validate_unaligned_range( return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) - return -EINVAL; if (start >= task_size) return -EINVAL; if (len > task_size - start) diff --git 
a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index d40ac39bfbe8..02de2ede560f 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) { WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); } +void alloc_tag_add_early_pfn(unsigned long pfn); #else static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {} static inline void alloc_tag_sub_check(union codetag_ref *ref) {} +static inline void alloc_tag_add_early_pfn(unsigned long pfn) {} #endif /* Caller should verify both ref and tag to be valid */ diff --git a/include/linux/damon.h b/include/linux/damon.h index d9a3babbafc1..f2cdb7c3f5e6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -818,9 +818,11 @@ struct damon_ctx { /* lists of &struct damon_call_control */ struct list_head call_controls; + bool call_controls_obsolete; struct mutex call_controls_lock; struct damos_walk_control *walk_control; + bool walk_control_obsolete; struct mutex walk_control_lock; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e1d257e6da68..11559c513dfb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); -int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { - int err; - if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - err = file->f_op->mmap(file, vma); - if (err) - return err; - - return __vma_check_mmap_hook(vma); + return file->f_op->mmap(file, vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index ac4129d1d741..8968c56d2d73 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,9 +32,9 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt); -void kho_remove_subtree(void *fdt); -int kho_retrieve_subtree(const char *name, phys_addr_t *phys); +int kho_add_subtree(const char *name, void *blob, size_t size); +void kho_remove_subtree(void *blob); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size); void kho_memory_init(void); @@ -97,14 +97,15 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *blob, size_t size) { return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) { } +static inline void kho_remove_subtree(void *blob) { } -static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys, + size_t *size) { return -EOPNOTSUPP; } diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 6b7d8ef550f9..7e847a2339b0 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ 
-41,25 +41,28 @@ * restore the preserved data.:: * * / { - * compatible = "kho-v2"; + * compatible = "kho-v3"; * * preserved-memory-map = <0x...>; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * ... ... * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v2" + * - compatible: "kho-v3" * * Indentifies the overall KHO ABI version. * @@ -78,16 +81,25 @@ * * Physical address pointing to a subnode data blob that is also * being preserved. + * + * - blob-size: u64 + * + * Size in bytes of the preserved data blob. This is needed because + * blobs may use arbitrary formats (not just FDT), so the size + * cannot be determined from the blob content alone. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v2" +#define KHO_FDT_COMPATIBLE "kho-v3" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" /* The FDT property for preserved data blobs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" +#define KHO_SUB_TREE_PROP_NAME "preserved-data" + +/* The FDT property for the size of preserved data blobs. */ +#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size" /** * DOC: Kexec Handover ABI for vmalloc Preservation diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h new file mode 100644 index 000000000000..e9e3f7e38a7c --- /dev/null +++ b/include/linux/kho/abi/kexec_metadata.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/** + * DOC: Kexec Metadata ABI + * + * The "kexec-metadata" subtree stores optional metadata about the kexec chain. + * It is registered via kho_add_subtree(), keeping it independent from the core + * KHO ABI. This allows the metadata format to evolve without affecting other + * KHO consumers. + * + * The metadata is stored as a plain C struct rather than FDT format for + * simplicity and direct field access. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Breno Leitao + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H +#define _LINUX_KHO_ABI_KEXEC_METADATA_H + +#include +#include + +#define KHO_KEXEC_METADATA_VERSION 1 + +/** + * struct kho_kexec_metadata - Kexec metadata passed between kernels + * @version: ABI version of this struct (must be first field) + * @previous_release: Kernel version string that initiated the kexec + * @kexec_count: Number of kexec boots since last cold boot + * + * This structure is preserved across kexec and allows the new kernel to + * identify which kernel it was booted from and how many kexec reboots + * have occurred. + * + * __NEW_UTS_LEN is part of uABI, so it is safe to use it here. + */ +struct kho_kexec_metadata { + u32 version; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; +} __packed; + +#define KHO_METADATA_NODE_NAME "kexec-metadata" + +#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index dd11fdc76a5f..30c5a39ff9e9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,7 @@ struct liveupdate_file_op_args { * finish, in order to do successful finish calls for all * resources in the session. * @finish: Required. Final cleanup in the new kernel. + * @get_id: Optional. Returns a unique identifier for the file.
* @owner: Module reference * * All operations (except can_preserve) receive a pointer to a @@ -78,6 +80,7 @@ struct liveupdate_file_ops { int (*retrieve)(struct liveupdate_file_op_args *args); bool (*can_finish)(struct liveupdate_file_op_args *args); void (*finish)(struct liveupdate_file_op_args *args); + unsigned long (*get_id)(struct file *file); struct module *owner; }; @@ -228,12 +231,12 @@ bool liveupdate_enabled(void); int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb); +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); @@ -255,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle return -EOPNOTSUPP; } -static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - return -EOPNOTSUPP; } static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, @@ -266,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, return -EOPNOTSUPP; } -static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - return -EOPNOTSUPP; } static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5173a9f16721..dc3fa687759b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,16 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + /* + * objcg is wiped out as a part of the objcg reparenting process. + * orig_objcg preserves a pointer (and a reference) to the original + * objcg until the end of life of memcg. + */ + struct obj_cgroup __rcu *objcg; + struct obj_cgroup *orig_objcg; + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; + #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; @@ -179,6 +189,7 @@ struct obj_cgroup { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; + bool is_root; }; /* @@ -257,15 +268,6 @@ struct mem_cgroup { seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; - /* - * memcg->objcg is wiped out as a part of the objcg repaprenting - * process. memcg->orig_objcg preserves a pointer (and a reference) - * to the original objcg until the end of live of memcg.
- */ - struct obj_cgroup __rcu *objcg; - struct obj_cgroup *orig_objcg; - /* list of inherited objcgs, protected by objcg_lock */ - struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -367,9 +369,6 @@ enum objext_flags { #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG - -static inline bool folio_memcg_kmem(struct folio *folio); - /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. @@ -383,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __folio_memcg - Get the memory cgroup associated with a non-kmem folio - * @folio: Pointer to the folio. - * - * Returns a pointer to the memory cgroup associated with the folio, - * or NULL. This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * kmem folios. - */ -static inline struct mem_cgroup *__folio_memcg(struct folio *folio) -{ - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - -/* - * __folio_objcg - get the object cgroup associated with a kmem folio. + * folio_objcg - get the object cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a - * proper object cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * LRU folios. + * proper object cgroup pointer. */ -static inline struct obj_cgroup *__folio_objcg(struct folio *folio) +static inline struct obj_cgroup *folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -433,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * For a folio any of the following ensures folio and objcg binding stability: * * - the folio lock * - LRU isolation * - exclusive reference * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * Based on the stable binding of folio and objcg, for a folio any of the + * following ensures folio and memcg binding stability: + * + * - cgroup_mutex + * - the lruvec lock + * + * If the caller only want to ensure that the page counters of memcg are + * updated correctly, ensure that the binding stability of folio and objcg + * is sufficient. + * + * Note: The caller should hold an rcu read lock or cgroup_mutex to protect + * memcg associated with a folio from being released. 
*/ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return obj_cgroup_memcg(__folio_objcg(folio)); - return __folio_memcg(folio); + struct obj_cgroup *objcg = folio_objcg(folio); + + return objcg ? obj_cgroup_memcg(objcg) : NULL; } /* @@ -471,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio) * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * The page and objcg or memcg binding rules can refer to folio_memcg(). * - * - the folio lock - * - LRU isolation - * - exclusive reference - * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * A caller should hold an rcu read lock to protect memcg associated with a + * page from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { @@ -488,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); + struct obj_cgroup *objcg; if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return objcg ? obj_cgroup_memcg(objcg) : NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) @@ -548,6 +523,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return objcg->is_root; +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -735,7 +715,15 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * - * This function relies on folio->mem_cgroup being stable. + * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. + * Note that this alone will NOT guarantee the stability of the folio->lruvec + * association; the folio can be reparented to an ancestor if this races with + * cgroup deletion. + * + * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. + * Once a lruvec is locked, folio_lruvec() can be called on other folios, and + * their binding is stable if the returned lruvec matches the one the caller has + * locked. Useful for lock batching. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { @@ -758,15 +746,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); -#else -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} -#endif - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? 
container_of(css, struct mem_cgroup, css) : NULL; @@ -774,23 +753,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { + if (obj_cgroup_is_root(objcg)) + return true; return percpu_ref_tryget(&objcg->refcnt); } -static inline void obj_cgroup_get(struct obj_cgroup *objcg) -{ - percpu_ref_get(&objcg->refcnt); -} - static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { - percpu_ref_get_many(&objcg->refcnt, nr); + if (!obj_cgroup_is_root(objcg)) + percpu_ref_get_many(&objcg->refcnt, nr); +} + +static inline void obj_cgroup_get(struct obj_cgroup *objcg) +{ + obj_cgroup_get_many(objcg, 1); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - if (objcg) + if (objcg && !obj_cgroup_is_root(objcg)) percpu_ref_put(&objcg->refcnt); } @@ -885,7 +867,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) @@ -896,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages); + int zid, long nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, @@ -966,10 +948,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (memcg) - count_memcg_events(memcg, idx, nr); + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); + memcg = folio_memcg(folio); + count_memcg_events(memcg, idx, nr); + rcu_read_unlock(); } static inline void count_memcg_events_mm(struct mm_struct *mm, @@ -1087,6 +1074,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return true; } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; @@ -1179,11 +1171,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio) return &pgdat->__lruvec; } -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} - static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1242,6 +1229,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1250,6 +1238,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1259,6 +1248,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } @@ -1479,20 +1469,28 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } -static inline void unlock_page_lruvec(struct lruvec *lruvec) +static inline void 
lruvec_lock_irq(struct lruvec *lruvec) +{ + rcu_read_lock(); + spin_lock_irq(&lruvec->lru_lock); +} + +static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, - unsigned long flags) +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); + rcu_read_unlock(); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ @@ -1511,7 +1509,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; - unlock_page_lruvec_irq(locked_lruvec); + lruvec_unlock_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); @@ -1525,7 +1523,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio, if (folio_matches_lruvec(folio, *lruvecp)) return; - unlock_page_lruvec_irqrestore(*lruvecp, *flags); + lruvec_unlock_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); @@ -1549,9 +1547,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, if (mem_cgroup_disabled()) return; + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); memcg = folio_memcg(folio); - if (unlikely(memcg && &memcg->css != wb->memcg_css)) + if (unlikely(&memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); + rcu_read_unlock(); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 255e0f50ea32..0b776907152e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -758,6 +758,8 @@ struct vm_fault { */ }; +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -865,6 +867,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7fc2ced00f8f..a171070e15f0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, false)) return; @@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, true)) return; @@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_del_folio(lruvec, folio, false)) return; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3bcdda226a91..9adb2ad21da5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -694,6 +694,9 @@ void lru_gen_online_memcg(struct 
mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid); +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid); +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else /* !CONFIG_LRU_GEN */ @@ -735,6 +738,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } +static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ +} + +static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + return true; +} + +static inline +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ +} + #endif /* CONFIG_LRU_GEN */ struct lruvec { @@ -2053,21 +2070,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) extern size_t mem_section_usage_size(void); /* - * We use the lower bits of the mem_map pointer to store - * a little bit of information. The pointer is calculated - * as mem_map - section_nr_to_pfn(pnum). The result is - * aligned to the minimum alignment of the two values: - * 1. All mem_map arrays are page-aligned. - * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT - * lowest bits. PFN_SECTION_SHIFT is arch-specific - * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the - * worst combination is powerpc with 256k pages, - * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available on all architectures. - * However, we can exceed 6 bits on some other architectures except - * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available - * with the worst case of 64K pages on arm64) if we make sure the - * exceeded bit is not applicable to powerpc. + * We use the lower bits of the mem_map pointer to store a little bit of + * information. The pointer is calculated as mem_map - section_nr_to_pfn(). + * The result is aligned to the minimum alignment of the two values: + * + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits. + * + * We always expect a single section to cover full pages. Therefore, + * we can safely assume that PFN_SECTION_SHIFT is large enough to + * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. 
*/ enum { SECTION_MARKED_PRESENT_BIT, diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 38a82d65e58e..951d33362268 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) if (get_page_tag_ref(page, &ref, &handle)) { alloc_tag_sub_check(&ref); - if (ref.ct) + if (ref.ct && !is_codetag_empty(&ref)) tag = ct_to_alloc_tag(ref.ct); put_page_tag_ref(handle); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 004e6d56a499..368c7b4d7cb5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1535,7 +1535,7 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; - /* Cache for current->cgroups->memcg->objcg lookups: */ + /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f6a2d3402d76..93a0ba872ebe 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) extern bool shmem_charge(struct inode *inode, long pages); -#ifdef CONFIG_USERFAULTFD -#ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop); -#else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, foliop) ({ BUG(); 0; }) -#endif /* CONFIG_SHMEM */ -#endif /* CONFIG_USERFAULTFD */ - /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that diff --git a/include/linux/swap.h b/include/linux/swap.h index 4b1f13b5bbad..7a09df6977a5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) - __releases(lruvec->lru_lock); + unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); @@ -353,6 +352,7 @@ extern void swap_setup(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) @@ -547,6 +547,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return READ_ONCE(memcg->swappiness); } + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -611,5 +613,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the 
specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d83e349900a3..d2920f98ab86 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,6 +83,39 @@ struct userfaultfd_ctx { extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); + /* + * Called to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. + */ + struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); + /* + * Called during resolution of UFFDIO_COPY request. + * Should allocate and return a folio or NULL if allocation fails. + */ + struct folio *(*alloc_folio)(struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request. + * Should only be called with a folio returned by alloc_folio() above. + * The folio will be set to locked. + * Returns 0 on success, error code on failure. + */ + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request on the error + * handling path. + * Should revert the operation of ->filemap_add(). + */ + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); +}; + /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; @@ -114,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags); - extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); @@ -211,39 +239,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - vm_flags_t vm_flags, - bool wp_async) -{ - vm_flags &= __VM_UFFD_FLAGS; - - if (vma->vm_flags & VM_DROPPABLE) - return false; - - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; - - /* - * If wp async enabled, and WP is the only mode enabled, allow any - * memory type. - */ - if (wp_async && (vm_flags == VM_UFFD_WP)) - return true; - - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. 
- */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async); static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h index dfe2f51019b4..51b62c5931fc 100644 --- a/include/trace/events/memcg.h +++ b/include/trace/events/memcg.h @@ -11,14 +11,14 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val), TP_STRUCT__entry( __field(u64, id) __field(int, item) - __field(int, val) + __field(long, val) ), TP_fast_assign( @@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats, __entry->val = val; ), - TP_printk("memcg_id=%llu item=%d val=%d", + TP_printk("memcg_id=%llu item=%d val=%ld", __entry->id, __entry->item, __entry->val) ); DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val) ); DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val) ); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e5cd2b80fd29..bdac0d685a98 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -294,7 +294,10 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); + + rcu_read_lock(); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); + rcu_read_unlock(); ), TP_printk("bdi %s[%llu]: ino=%llu memcg_id=%u cgroup_ino=%llu page_cgroup_ino=%llu", diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1f084ee71443..43adc96c7f1a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5999,8 +5999,9 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) */ static void css_killed_work_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css; + + css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); cgroup_lock(); @@ -6021,8 +6022,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_offline_wq, &css->destroy_work); + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn); + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork); } } diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 532f455c5d4f..94762de1fe5f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -724,12 +726,13 @@ static void __init kho_reserve_scratch(void) } /** - * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. 
+ * kho_add_subtree - record the physical address of a sub blob in KHO root tree. * @name: name of the sub tree. - * @fdt: the sub tree blob. + * @blob: the sub tree blob. + * @size: size of the blob in bytes. * * Creates a new child node named @name in KHO root FDT and records - * the physical address of @fdt. The pages of @fdt must also be preserved + * the physical address of @blob. The pages of @blob must also be preserved * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at @@ -738,10 +741,11 @@ static void __init kho_reserve_scratch(void) * * Return: 0 on success, error code on failure */ -int kho_add_subtree(const char *name, void *fdt) +int kho_add_subtree(const char *name, void *blob, size_t size) { - phys_addr_t phys = virt_to_phys(fdt); + phys_addr_t phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; + u64 size_u64 = size; int err = -ENOMEM; int off, fdt_err; @@ -758,12 +762,18 @@ int kho_add_subtree(const char *name, void *fdt) goto out_pack; } - err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, + err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &phys, sizeof(phys)); if (err < 0) goto out_pack; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, + &size_u64, sizeof(size_u64)); + if (err < 0) + goto out_pack; + + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, + size, false)); out_pack: fdt_pack(root_fdt); @@ -772,9 +782,9 @@ int kho_add_subtree(const char *name, void *fdt) } EXPORT_SYMBOL_GPL(kho_add_subtree); -void kho_remove_subtree(void *fdt) +void kho_remove_subtree(void *blob) { - phys_addr_t target_phys = virt_to_phys(fdt); + phys_addr_t target_phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; int off; int err; @@ -790,13 +800,13 @@ void kho_remove_subtree(void *fdt) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(phys_addr_t)) continue; if ((phys_addr_t)*val == target_phys) { fdt_del_node(root_fdt, off); - kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + kho_debugfs_blob_remove(&kho_out.dbg, blob); break; } } @@ -1260,6 +1270,8 @@ EXPORT_SYMBOL_GPL(kho_restore_free); struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; struct kho_debugfs dbg; }; @@ -1292,16 +1304,17 @@ bool is_kho_boot(void) EXPORT_SYMBOL_GPL(is_kho_boot); /** - * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. - * @name: the name of the sub FDT passed to kho_add_subtree(). - * @phys: if found, the physical address of the sub FDT is stored in @phys. + * kho_retrieve_subtree - retrieve a preserved sub blob by its name. + * @name: the name of the sub blob passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub blob is stored in @phys. + * @size: if not NULL and found, the size of the sub blob is stored in @size. * - * Retrieve a preserved sub FDT named @name and store its physical - * address in @phys. + * Retrieve a preserved sub blob named @name and store its physical + * address in @phys and optionally its size in @size. 
* * Return: 0 on success, error code on failure */ -int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size) { const void *fdt = kho_get_fdt(); const u64 *val; @@ -1317,12 +1330,22 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; *phys = (phys_addr_t)*val; + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_SIZE_PROP_NAME, &len); + if (!val || len != sizeof(*val)) { + pr_warn("broken KHO subnode '%s': missing or invalid blob-size property\n", + name); + return -EINVAL; + } + + if (size) + *size = (size_t)*val; + return 0; } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); @@ -1373,6 +1396,96 @@ static __init int kho_out_fdt_setup(void) return err; } +static void __init kho_in_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + phys_addr_t metadata_phys; + size_t blob_size; + int err; + + err = kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &metadata_phys, + &blob_size); + if (err) + /* This is fine, previous kernel didn't export metadata */ + return; + + /* Check that, at least, "version" is present */ + if (blob_size < sizeof(u32)) { + pr_warn("kexec-metadata blob too small (%zu bytes)\n", + blob_size); + return; + } + + metadata = phys_to_virt(metadata_phys); + + if (metadata->version != KHO_KEXEC_METADATA_VERSION) { + pr_warn("kexec-metadata version %u not supported (expected %u)\n", + metadata->version, KHO_KEXEC_METADATA_VERSION); + return; + } + + if (blob_size < sizeof(*metadata)) { + pr_warn("kexec-metadata blob too small for v%u (%zu < %zu)\n", + metadata->version, blob_size, sizeof(*metadata)); + return; + } + + /* + * Copy data to the kernel structure that will persist during + * kernel lifetime. + */ + kho_in.kexec_count = metadata->kexec_count; + strscpy(kho_in.previous_release, metadata->previous_release, + sizeof(kho_in.previous_release)); + + pr_info("exec from: %s (count %u)\n", + kho_in.previous_release, kho_in.kexec_count); +} + +/* + * Create kexec metadata to pass kernel version and boot count to the + * next kernel. This keeps the core KHO ABI minimal and allows the + * metadata format to evolve independently. 
+ */ +static __init int kho_out_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + int err; + + metadata = kho_alloc_preserve(sizeof(*metadata)); + if (IS_ERR(metadata)) + return PTR_ERR(metadata); + + metadata->version = KHO_KEXEC_METADATA_VERSION; + strscpy(metadata->previous_release, init_uts_ns.name.release, + sizeof(metadata->previous_release)); + /* kho_in.kexec_count is set to 0 on cold boot */ + metadata->kexec_count = kho_in.kexec_count + 1; + + err = kho_add_subtree(KHO_METADATA_NODE_NAME, metadata, + sizeof(*metadata)); + if (err) + kho_unpreserve_free(metadata); + + return err; +} + +static int __init kho_kexec_metadata_init(const void *fdt) +{ + int err; + + if (fdt) + kho_in_kexec_metadata(); + + /* Populate kexec metadata for the possible next kexec */ + err = kho_out_kexec_metadata(); + if (err) + pr_warn("failed to initialize kexec-metadata subtree: %d\n", + err); + + return err; +} + static __init int kho_init(void) { struct kho_radix_tree *tree = &kho_out.radix_tree; @@ -1406,6 +1519,10 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + err = kho_kexec_metadata_init(fdt); + if (err) + goto err_free_fdt; + if (fdt) { kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; @@ -1430,8 +1547,9 @@ static __init int kho_init(void) init_cma_reserved_pageblock(pfn_to_page(pfn)); } - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, true)); + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, "fdt", + kho_out.fdt, + fdt_totalsize(kho_out.fdt), true)); return 0; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index acf368222682..257ee8a52be6 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -24,8 +24,9 @@ struct fdt_debugfs { struct dentry *file; }; -static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) +static int __kho_debugfs_blob_add(struct list_head *list, struct dentry *dir, + const char *name, const void *blob, + size_t size) { struct fdt_debugfs *f; struct dentry *file; @@ -34,8 +35,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, if (!f) return -ENOMEM; - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); + f->wrapper.data = (void *)blob; + f->wrapper.size = size; file = debugfs_create_blob(name, 0400, dir, &f->wrapper); if (IS_ERR(file)) { @@ -49,8 +50,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, return 0; } -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root) { struct dentry *dir; @@ -59,15 +60,15 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, else dir = dbg->sub_fdt_dir; - return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); + return __kho_debugfs_blob_add(&dbg->fdt_list, dir, name, blob, size); } -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob) { struct fdt_debugfs *ff; list_for_each_entry(ff, &dbg->fdt_list, list) { - if (ff->wrapper.data == fdt) { + if (ff->wrapper.data == blob) { debugfs_remove(ff->file); list_del(&ff->list); kfree(ff); @@ -113,28 +114,42 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) goto err_rmdir; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, 
"fdt", fdt); + err = __kho_debugfs_blob_add(&dbg->fdt_list, dir, "fdt", fdt, + fdt_totalsize(fdt)); if (err) goto err_rmdir; fdt_for_each_subnode(child, fdt, 0) { int len = 0; const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; + const u64 *blob_phys; + const u64 *blob_size; + void *blob; - fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); - if (!fdt_phys) + blob_phys = fdt_getprop(fdt, child, + KHO_SUB_TREE_PROP_NAME, &len); + if (!blob_phys) continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node %s prop fdt has invalid length: %d\n", - name, len); + if (len != sizeof(*blob_phys)) { + pr_warn("node %s prop %s has invalid length: %d\n", + name, KHO_SUB_TREE_PROP_NAME, len); continue; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); + + blob_size = fdt_getprop(fdt, child, + KHO_SUB_TREE_SIZE_PROP_NAME, &len); + if (!blob_size || len != sizeof(*blob_size)) { + pr_warn("node %s missing or invalid %s property\n", + name, KHO_SUB_TREE_SIZE_PROP_NAME); + continue; + } + + blob = phys_to_virt(*blob_phys); + err = __kho_debugfs_blob_add(&dbg->fdt_list, sub_fdt_dir, name, + blob, *blob_size); if (err) { - pr_warn("failed to add fdt %s to debugfs: %pe\n", name, - ERR_PTR(err)); + pr_warn("failed to add blob %s to debugfs: %pe\n", + name, ERR_PTR(err)); continue; } } diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 9a832a35254c..0399ff107775 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -26,18 +26,19 @@ extern unsigned int kho_scratch_cnt; int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root); -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root); +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) { } static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } -static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) { return 0; } -static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, - void *fdt) { } +static inline int kho_debugfs_blob_add(struct kho_debugfs *dbg, + const char *name, const void *blob, + size_t size, bool root) { return 0; } +static inline void kho_debugfs_blob_remove(struct kho_debugfs *dbg, + void *blob) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ #ifdef CONFIG_KEXEC_HANDOVER_DEBUG diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 84ac728d63ba..803f51c84275 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,11 @@ static struct { u64 liveupdate_num; } luo_global; +/* + * luo_register_rwlock - Protects registration of file handlers and FLBs. 
+ */ +DECLARE_RWSEM(luo_register_rwlock); + static int __init early_liveupdate_param(char *buf) { return kstrtobool(buf, &luo_global.enabled); @@ -88,7 +94,7 @@ static int __init luo_early_startup(void) } /* Retrieve LUO subtree, and verify its format. */ - err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys, NULL); if (err) { if (err != -ENOENT) { pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", @@ -172,7 +178,8 @@ static int __init luo_fdt_setup(void) if (err) goto exit_free; - err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out, + fdt_totalsize(fdt_out)); if (err) goto exit_free; luo_global.fdt_out = fdt_out; diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 5acee4174bf0..a0a419085e28 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -108,12 +108,16 @@ #include #include #include +#include #include #include #include "luo_internal.h" static LIST_HEAD(luo_file_handler_list); +/* Keep track of files being preserved by LUO */ +static DEFINE_XARRAY(luo_preserved_files); + /* 2 4K pages, give space for 128 files per file_set */ #define LUO_FILE_PGCNT 2ul #define LUO_FILE_MAX \ @@ -203,6 +207,12 @@ static void luo_free_files_mem(struct luo_file_set *file_set) file_set->files = NULL; } +static unsigned long luo_get_id(struct liveupdate_file_handler *fh, + struct file *file) +{ + return fh->ops->get_id ? fh->ops->get_id(file) : (unsigned long)file; +} + static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) { struct luo_file *iter; @@ -248,6 +258,7 @@ static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) * Context: Can be called from an ioctl handler during normal system operation. * Return: 0 on success. Returns a negative errno on failure: * -EEXIST if the token is already used. + * -EBUSY if the file descriptor is already preserved by another session. * -EBADF if the file descriptor is invalid. * -ENOSPC if the file_set is full. * -ENOENT if no compatible handler is found. 
@@ -277,20 +288,28 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_fput; err = -ENOENT; + down_read(&luo_register_rwlock); list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (fh->ops->can_preserve(fh, file)) { - err = 0; + if (try_module_get(fh->ops->owner)) + err = 0; break; } } + up_read(&luo_register_rwlock); /* err is still -ENOENT if no handler was found */ if (err) goto err_free_files_mem; + err = xa_insert(&luo_preserved_files, luo_get_id(fh, file), + file, GFP_KERNEL); + if (err) + goto err_module_put; + err = luo_flb_file_preserve(fh); if (err) - goto err_free_files_mem; + goto err_erase_xa; luo_file = kzalloc_obj(*luo_file); if (!luo_file) { @@ -320,6 +339,10 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) kfree(luo_file); err_flb_unpreserve: luo_flb_file_unpreserve(fh); +err_erase_xa: + xa_erase(&luo_preserved_files, luo_get_id(fh, file)); +err_module_put: + module_put(fh->ops->owner); err_free_files_mem: luo_free_files_mem(file_set); err_fput: @@ -362,7 +385,10 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) args.private_data = luo_file->private_data; luo_file->fh->ops->unpreserve(&args); luo_flb_file_unpreserve(luo_file->fh); + module_put(luo_file->fh->ops->owner); + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); list_del(&luo_file->list); file_set->count--; @@ -606,6 +632,11 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, luo_file->file = args.file; /* Get reference so we can keep this file in LUO until finish */ get_file(luo_file->file); + + WARN_ON(xa_insert(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file), + luo_file->file, GFP_KERNEL)); + *filep = luo_file->file; luo_file->retrieve_status = 1; @@ -646,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, luo_file->fh->ops->finish(&args); luo_flb_file_finish(luo_file->fh); + module_put(luo_file->fh->ops->owner); } /** @@ -701,8 +733,11 @@ int luo_file_finish(struct luo_file_set *file_set) luo_file_finish_one(file_set, luo_file); - if (luo_file->file) + if (luo_file->file) { + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); fput(luo_file->file); + } list_del(&luo_file->list); file_set->count--; mutex_destroy(&luo_file->mutex); @@ -777,22 +812,28 @@ int luo_file_deserialize(struct luo_file_set *file_set, bool handler_found = false; struct luo_file *luo_file; + down_read(&luo_register_rwlock); list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (!strcmp(fh->compatible, file_ser[i].compatible)) { - handler_found = true; + if (try_module_get(fh->ops->owner)) + handler_found = true; break; } } + up_read(&luo_register_rwlock); if (!handler_found) { - pr_warn("No registered handler for compatible '%s'\n", + pr_warn("No registered handler for compatible '%.*s'\n", + (int)sizeof(file_ser[i].compatible), file_ser[i].compatible); return -ENOENT; } luo_file = kzalloc_obj(*luo_file); - if (!luo_file) + if (!luo_file) { + module_put(fh->ops->owner); return -ENOMEM; + } luo_file->fh = fh; luo_file->file = NULL; @@ -842,41 +883,28 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) return -EINVAL; } - /* - * Ensure the system is quiescent (no active sessions). - * This prevents registering new handlers while sessions are active or - * while deserialization is in progress. 
- */ - if (!luo_session_quiesce()) - return -EBUSY; - + down_write(&luo_register_rwlock); /* Check for duplicate compatible strings */ list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) { if (!strcmp(fh_iter->compatible, fh->compatible)) { pr_err("File handler registration failed: Compatible string '%s' already registered.\n", fh->compatible); err = -EEXIST; - goto err_resume; + goto err_unlock; } } - /* Pin the module implementing the handler */ - if (!try_module_get(fh->ops->owner)) { - err = -EAGAIN; - goto err_resume; - } - INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list)); INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); - luo_session_resume(); + up_write(&luo_register_rwlock); liveupdate_test_register(fh); return 0; -err_resume: - luo_session_resume(); +err_unlock: + up_write(&luo_register_rwlock); return err; } @@ -886,41 +914,13 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) * * Unregisters the file handler from the liveupdate core. This function * reverses the operations of liveupdate_register_file_handler(). - * - * It ensures safe removal by checking that: - * No live update session is currently in progress. - * No FLB registered with this file handler. - * - * If the unregistration fails, the internal test state is reverted. - * - * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live - * update is in progress, can't quiesce live update or FLB is registred with - * this file handler. */ -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - int err = -EBUSY; - if (!liveupdate_enabled()) - return -EOPNOTSUPP; - - liveupdate_test_unregister(fh); - - if (!luo_session_quiesce()) - goto err_register; - - if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) - goto err_resume; + return; + guard(rwsem_write)(&luo_register_rwlock); + luo_flb_unregister_all(fh); list_del(&ACCESS_PRIVATE(fh, list)); - module_put(fh->ops->owner); - luo_session_resume(); - - return 0; - -err_resume: - luo_session_resume(); -err_register: - liveupdate_test_register(fh); - return err; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index f52e8114837e..00f5494812c4 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -89,13 +89,18 @@ struct luo_flb_link { static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb) { struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private); + static DEFINE_SPINLOCK(luo_flb_init_lock); + if (smp_load_acquire(&private->initialized)) + return private; + + guard(spinlock)(&luo_flb_init_lock); if (!private->initialized) { mutex_init(&private->incoming.lock); mutex_init(&private->outgoing.lock); INIT_LIST_HEAD(&private->list); private->users = 0; - private->initialized = true; + smp_store_release(&private->initialized, true); } return private; @@ -110,10 +115,15 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) struct liveupdate_flb_op_args args = {0}; int err; + if (!try_module_get(flb->ops->owner)) + return -ENODEV; + args.flb = flb; err = flb->ops->preserve(&args); - if (err) + if (err) { + module_put(flb->ops->owner); return err; + } private->outgoing.data = args.data; private->outgoing.obj = args.obj; } @@ -141,6 +151,7 @@ static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) private->outgoing.data = 0; private->outgoing.obj = NULL; + 
module_put(flb->ops->owner); } } } @@ -176,12 +187,17 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) if (!found) return -ENOENT; + if (!try_module_get(flb->ops->owner)) + return -ENODEV; + args.flb = flb; args.data = private->incoming.data; err = flb->ops->retrieve(&args); - if (err) + if (err) { + module_put(flb->ops->owner); return err; + } private->incoming.obj = args.obj; private->incoming.retrieved = true; @@ -215,6 +231,7 @@ static void luo_flb_file_finish_one(struct liveupdate_flb *flb) private->incoming.data = 0; private->incoming.obj = NULL; private->incoming.finished = true; + module_put(flb->ops->owner); } } } @@ -240,17 +257,20 @@ int luo_flb_file_preserve(struct liveupdate_file_handler *fh) struct luo_flb_link *iter; int err = 0; + down_read(&luo_register_rwlock); list_for_each_entry(iter, flb_list, list) { err = luo_flb_file_preserve_one(iter->flb); if (err) goto exit_err; } + up_read(&luo_register_rwlock); return 0; exit_err: list_for_each_entry_continue_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); + up_read(&luo_register_rwlock); return err; } @@ -272,6 +292,7 @@ void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); } @@ -292,10 +313,67 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_finish_one(iter->flb); } +static void luo_flb_unregister_one(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + bool found = false; + + /* Find and remove the link from the file handler's list */ + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) { + list_del(&iter->list); + kfree(iter); + found = true; + break; + } + } + + if (!found) { + pr_warn("Failed to unregister FLB '%s': not found in file handler '%s'\n", + flb->compatible, fh->compatible); + return; + } + + private->users--; + + /* + * If this is the last file-handler with which we are registred, remove + * from the global list. + */ + if (!private->users) { + list_del_init(&private->list); + luo_flb_global.count--; + } +} + +/** + * luo_flb_unregister_all - Unregister all FLBs associated with a file handler. + * @fh: The file handler whose FLBs should be unregistered. + * + * This function iterates through the list of FLBs associated with the given + * file handler and unregisters them all one by one. + */ +void luo_flb_unregister_all(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter, *tmp; + + if (!liveupdate_enabled()) + return; + + lockdep_assert_held_write(&luo_register_rwlock); + list_for_each_entry_safe(iter, tmp, flb_list, list) + luo_flb_unregister_one(fh, iter->flb); +} + /** * liveupdate_register_flb - Associate an FLB with a file handler and register it globally. * @fh: The file handler that will now depend on the FLB. 
@@ -326,7 +404,6 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct luo_flb_link *link __free(kfree) = NULL; struct liveupdate_flb *gflb; struct luo_flb_link *iter; - int err; if (!liveupdate_enabled()) return -EOPNOTSUPP; @@ -347,19 +424,12 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, if (!link) return -ENOMEM; - /* - * Ensure the system is quiescent (no active sessions). - * This acts as a global lock for registration: no other thread can - * be in this section, and no sessions can be creating/using FDs. - */ - if (!luo_session_quiesce()) - return -EBUSY; + guard(rwsem_write)(&luo_register_rwlock); /* Check that this FLB is not already linked to this file handler */ - err = -EEXIST; list_for_each_entry(iter, flb_list, list) { if (iter->flb == flb) - goto err_resume; + return -EEXIST; } /* @@ -367,25 +437,16 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, * is registered */ if (!private->users) { - if (WARN_ON(!list_empty(&private->list))) { - err = -EINVAL; - goto err_resume; - } + if (WARN_ON(!list_empty(&private->list))) + return -EINVAL; - if (luo_flb_global.count == LUO_FLB_MAX) { - err = -ENOSPC; - goto err_resume; - } + if (luo_flb_global.count == LUO_FLB_MAX) + return -ENOSPC; /* Check that compatible string is unique in global list */ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { if (!strcmp(gflb->compatible, flb->compatible)) - goto err_resume; - } - - if (!try_module_get(flb->ops->owner)) { - err = -EAGAIN; - goto err_resume; + return -EEXIST; } list_add_tail(&private->list, &luo_flb_global.list); @@ -396,13 +457,8 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, private->users++; link->flb = flb; list_add_tail(&no_free_ptr(link)->list, flb_list); - luo_session_resume(); return 0; - -err_resume: - luo_session_resume(); - return err; } /** @@ -418,63 +474,17 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, * the FLB is removed from the global registry and the reference to its * owner module (acquired during registration) is released. * - * Context: This function ensures the session is quiesced (no active FDs - * being created) during the update. It is typically called from a - * subsystem's module exit function. - * Return: 0 on success. - * -EOPNOTSUPP if live update is disabled. - * -EBUSY if the live update session is active and cannot be quiesced. - * -ENOENT if the FLB was not found in the file handler's list. + * Context: It is typically called from a subsystem's module exit function. */ -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - struct luo_flb_private *private = luo_flb_get_private(flb); - struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); - struct luo_flb_link *iter; - int err = -ENOENT; - if (!liveupdate_enabled()) - return -EOPNOTSUPP; + return; - /* - * Ensure the system is quiescent (no active sessions). - * This acts as a global lock for unregistration. 
- */ - if (!luo_session_quiesce()) - return -EBUSY; + guard(rwsem_write)(&luo_register_rwlock); - /* Find and remove the link from the file handler's list */ - list_for_each_entry(iter, flb_list, list) { - if (iter->flb == flb) { - list_del(&iter->list); - kfree(iter); - err = 0; - break; - } - } - - if (err) - goto err_resume; - - private->users--; - /* - * If this is the last file-handler with which we are registred, remove - * from the global list, and relese module reference. - */ - if (!private->users) { - list_del_init(&private->list); - luo_flb_global.count--; - module_put(flb->ops->owner); - } - - luo_session_resume(); - - return 0; - -err_resume: - luo_session_resume(); - return err; + luo_flb_unregister_one(fh, flb); } /** @@ -492,7 +502,8 @@ int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, * * Return: 0 on success, or a negative errno on failure. -ENODATA means no * incoming FLB data, -ENOENT means specific flb not found in the incoming - * data, and -EOPNOTSUPP when live update is disabled or not configured. + * data, -ENODEV if the FLB's module is unloading, and -EOPNOTSUPP when + * live update is disabled or not configured. */ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) { @@ -638,6 +649,7 @@ void luo_flb_serialize(void) struct liveupdate_flb *gflb; int i = 0; + guard(rwsem_read)(&luo_register_rwlock); list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { struct luo_flb_private *private = luo_flb_get_private(gflb); diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 8083d8739b09..875844d7a41d 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -77,14 +77,14 @@ struct luo_session { struct mutex mutex; }; +extern struct rw_semaphore luo_register_rwlock; + int luo_session_create(const char *name, struct file **filep); int luo_session_retrieve(const char *name, struct file **filep); int __init luo_session_setup_outgoing(void *fdt); int __init luo_session_setup_incoming(void *fdt); int luo_session_serialize(void); int luo_session_deserialize(void); -bool luo_session_quiesce(void); -void luo_session_resume(void); int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); void luo_file_unpreserve_files(struct luo_file_set *file_set); @@ -103,16 +103,15 @@ void luo_file_set_destroy(struct luo_file_set *file_set); int luo_flb_file_preserve(struct liveupdate_file_handler *fh); void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh); void luo_flb_file_finish(struct liveupdate_file_handler *fh); +void luo_flb_unregister_all(struct liveupdate_file_handler *fh); int __init luo_flb_setup_outgoing(void *fdt); int __init luo_flb_setup_incoming(void *fdt); void luo_flb_serialize(void); #ifdef CONFIG_LIVEUPDATE_TEST void liveupdate_test_register(struct liveupdate_file_handler *fh); -void liveupdate_test_unregister(struct liveupdate_file_handler *fh); #else static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { } -static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { } #endif #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index 25ae704d7787..a3327a28fc1f 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -544,7 +544,8 @@ int luo_session_deserialize(void) session = luo_session_alloc(sh->ser[i].name); if (IS_ERR(session)) { - pr_warn("Failed to allocate session [%s] during 
deserialization %pe\n", + pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n", + (int)sizeof(sh->ser[i].name), sh->ser[i].name, session); return PTR_ERR(session); } @@ -606,46 +607,3 @@ int luo_session_serialize(void) return err; } -/** - * luo_session_quiesce - Ensure no active sessions exist and lock session lists. - * - * Acquires exclusive write locks on both incoming and outgoing session lists. - * It then validates no sessions exist in either list. - * - * This mechanism is used during file handler un/registration to ensure that no - * sessions are currently using the handler, and no new sessions can be created - * while un/registration is in progress. - * - * This prevents registering new handlers while sessions are active or - * while deserialization is in progress. - * - * Return: - * true - System is quiescent (0 sessions) and locked. - * false - Active sessions exist. The locks are released internally. - */ -bool luo_session_quiesce(void) -{ - down_write(&luo_session_global.incoming.rwsem); - down_write(&luo_session_global.outgoing.rwsem); - - if (luo_session_global.incoming.count || - luo_session_global.outgoing.count) { - up_write(&luo_session_global.outgoing.rwsem); - up_write(&luo_session_global.incoming.rwsem); - return false; - } - - return true; -} - -/** - * luo_session_resume - Unlock session lists and resume normal activity. - * - * Releases the exclusive locks acquired by a successful call to - * luo_session_quiesce(). - */ -void luo_session_resume(void) -{ - up_write(&luo_session_global.outgoing.rwsem); - up_write(&luo_session_global.incoming.rwsem); -} diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 58991ab09d84..ed1bdcf1f8ab 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -6,7 +6,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -758,8 +760,115 @@ static __init bool need_page_alloc_tagging(void) return mem_profiling_support; } +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +/* + * Track page allocations before page_ext is initialized. + * Some pages are allocated before page_ext becomes available, leaving + * their codetag uninitialized. Track these early PFNs so we can clear + * their codetag refs later to avoid warnings when they are freed. + * + * Early allocations include: + * - Base allocations independent of CPU count + * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init, + * such as trace ring buffers, scheduler per-cpu data) + * + * For simplicity, we fix the size to 8192. + * If insufficient, a warning will be triggered to alert the user. + * + * TODO: Replace fixed-size array with dynamic allocation using + * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion. 
+ */ +#define EARLY_ALLOC_PFN_MAX 8192 + +static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata; +static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0); + +static void __init __alloc_tag_add_early_pfn(unsigned long pfn) +{ + int old_idx, new_idx; + + do { + old_idx = atomic_read(&early_pfn_count); + if (old_idx >= EARLY_ALLOC_PFN_MAX) { + pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n", + EARLY_ALLOC_PFN_MAX); + return; + } + new_idx = old_idx + 1; + } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx)); + + early_pfns[old_idx] = pfn; +} + +typedef void alloc_tag_add_func(unsigned long pfn); +static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata = + RCU_INITIALIZER(__alloc_tag_add_early_pfn); + +void alloc_tag_add_early_pfn(unsigned long pfn) +{ + alloc_tag_add_func *alloc_tag_add; + + if (static_key_enabled(&mem_profiling_compressed)) + return; + + rcu_read_lock(); + alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr); + if (alloc_tag_add) + alloc_tag_add(pfn); + rcu_read_unlock(); +} + +static void __init clear_early_alloc_pfn_tag_refs(void) +{ + unsigned int i; + + if (static_key_enabled(&mem_profiling_compressed)) + return; + + rcu_assign_pointer(alloc_tag_add_early_pfn_ptr, NULL); + /* Make sure we are not racing with __alloc_tag_add_early_pfn() */ + synchronize_rcu(); + + for (i = 0; i < atomic_read(&early_pfn_count); i++) { + unsigned long pfn = early_pfns[i]; + + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + /* + * An early-allocated page could be freed and reallocated + * after its page_ext is initialized but before we clear it. + * In that case, it already has a valid tag set. + * We should not overwrite that valid tag with CODETAG_EMPTY. + * + * Note: there is still a small race window between checking + * ref.ct and calling set_codetag_empty(). We accept this + * race as it's unlikely and the extra complexity of atomic + * cmpxchg is not worth it for this debug-only code path. 
+ */ + if (ref.ct) { + put_page_tag_ref(handle); + continue; + } + + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } + } + + } +} +#else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */ +static inline void __init clear_early_alloc_pfn_tag_refs(void) {} +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + static __init void init_page_alloc_tagging(void) { + clear_early_alloc_pfn_tag_refs(); } struct page_ext_operations page_alloc_tagging_ops = { diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 0964d53365e6..213504915737 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -185,11 +185,73 @@ static int dmirror_fops_open(struct inode *inode, struct file *filp) return 0; } +static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) +{ + unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT; + unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT; + unsigned long npages = end_pfn - start_pfn + 1; + unsigned long i; + unsigned long *src_pfns; + unsigned long *dst_pfns; + unsigned int order = 0; + + src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); + dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); + + migrate_device_range(src_pfns, start_pfn, npages); + for (i = 0; i < npages; i++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(src_pfns[i]); + if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_device_private_page(spage) && + !is_device_coherent_page(spage))) + continue; + + order = folio_order(page_folio(spage)); + spage = BACKING_PAGE(spage); + if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { + dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE, + order), 0); + } else { + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + order = 0; + } + + /* TODO Support splitting here */ + lock_page(dpage); + dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + if (src_pfns[i] & MIGRATE_PFN_WRITE) + dst_pfns[i] |= MIGRATE_PFN_WRITE; + if (order) + dst_pfns[i] |= MIGRATE_PFN_COMPOUND; + folio_copy(page_folio(dpage), page_folio(spage)); + } + migrate_device_pages(src_pfns, dst_pfns, npages); + migrate_device_finalize(src_pfns, dst_pfns, npages); + kvfree(src_pfns); + kvfree(dst_pfns); +} + static int dmirror_fops_release(struct inode *inode, struct file *filp) { struct dmirror *dmirror = filp->private_data; + struct dmirror_device *mdevice = dmirror->mdevice; + int i; mmu_interval_notifier_remove(&dmirror->notifier); + + if (mdevice->devmem_chunks) { + for (i = 0; i < mdevice->devmem_count; i++) { + struct dmirror_chunk *devmem = + mdevice->devmem_chunks[i]; + + dmirror_device_evict_chunk(devmem); + } + } + xa_destroy(&dmirror->pt); kfree(dmirror); return 0; @@ -1377,56 +1439,6 @@ static int dmirror_snapshot(struct dmirror *dmirror, return ret; } -static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) -{ - unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT; - unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT; - unsigned long npages = end_pfn - start_pfn + 1; - unsigned long i; - unsigned long *src_pfns; - unsigned long *dst_pfns; - unsigned int order = 0; - - src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); - dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); - - migrate_device_range(src_pfns, start_pfn, npages); - for (i = 0; i < npages; i++) { - struct page *dpage, *spage; - - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & 
MIGRATE_PFN_MIGRATE)) - continue; - - if (WARN_ON(!is_device_private_page(spage) && - !is_device_coherent_page(spage))) - continue; - - order = folio_order(page_folio(spage)); - spage = BACKING_PAGE(spage); - if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { - dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE, - order), 0); - } else { - dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); - order = 0; - } - - /* TODO Support splitting here */ - lock_page(dpage); - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); - if (src_pfns[i] & MIGRATE_PFN_WRITE) - dst_pfns[i] |= MIGRATE_PFN_WRITE; - if (order) - dst_pfns[i] |= MIGRATE_PFN_COMPOUND; - folio_copy(page_folio(dpage), page_folio(spage)); - } - migrate_device_pages(src_pfns, dst_pfns, npages); - migrate_device_finalize(src_pfns, dst_pfns, npages); - kvfree(src_pfns); - kvfree(dst_pfns); -} - /* Removes free pages from the free list so they can't be re-allocated */ static void dmirror_remove_free_pages(struct dmirror_chunk *devmem) { @@ -1726,6 +1738,13 @@ static const struct dev_pagemap_ops dmirror_devmem_ops = { .folio_split = dmirror_devmem_folio_split, }; +static void dmirror_device_release(struct device *dev) +{ + struct dmirror_device *mdevice = container_of(dev, struct dmirror_device, device); + + dmirror_device_remove_chunks(mdevice); +} + static int dmirror_device_init(struct dmirror_device *mdevice, int id) { dev_t dev; @@ -1737,6 +1756,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) cdev_init(&mdevice->cdevice, &dmirror_fops); mdevice->cdevice.owner = THIS_MODULE; + mdevice->device.release = dmirror_device_release; + device_initialize(&mdevice->device); mdevice->device.devt = dev; @@ -1744,12 +1765,16 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) if (ret) goto put_device; + /* Build a list of free ZONE_DEVICE struct pages */ + ret = dmirror_allocate_chunk(mdevice, NULL, false); + if (ret) + goto put_device; + ret = cdev_device_add(&mdevice->cdevice, &mdevice->device); if (ret) goto put_device; - /* Build a list of free ZONE_DEVICE struct pages */ - return dmirror_allocate_chunk(mdevice, NULL, false); + return 0; put_device: put_device(&mdevice->device); @@ -1758,7 +1783,6 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) static void dmirror_device_remove(struct dmirror_device *mdevice) { - dmirror_device_remove_chunks(mdevice); cdev_device_del(&mdevice->cdevice, &mdevice->device); put_device(&mdevice->device); } diff --git a/lib/test_kho.c b/lib/test_kho.c index 7ef9e4061869..aa6a0956bb8b 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -143,7 +143,8 @@ static int kho_test_preserve(struct kho_test_state *state) if (err) goto err_unpreserve_data; - err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); + err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt), + fdt_totalsize(folio_address(state->fdt))); if (err) goto err_unpreserve_data; @@ -318,7 +319,7 @@ static int __init kho_test_init(void) if (!kho_is_enabled()) return 0; - err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys, NULL); if (!err) { err = kho_test_restore(fdt_phys); if (err) diff --git a/lib/tests/liveupdate.c b/lib/tests/liveupdate.c index 496d6ef91a30..e4b0ecbee32f 100644 --- a/lib/tests/liveupdate.c +++ b/lib/tests/liveupdate.c @@ -135,24 +135,6 @@ void liveupdate_test_register(struct liveupdate_file_handler *fh) TEST_NFLBS, fh->compatible); } -void liveupdate_test_unregister(struct liveupdate_file_handler *fh) -{ - 
int err, i; - - for (i = 0; i < TEST_NFLBS; i++) { - struct liveupdate_flb *flb = &test_flbs[i]; - - err = liveupdate_unregister_flb(fh, flb); - if (err) { - pr_err("Failed to unregister %s %pe\n", - flb->compatible, ERR_PTR(err)); - } - } - - pr_info("Unregistered %d FLBs from file handler: [%s]\n", - TEST_NFLBS, fh->compatible); -} - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pasha Tatashin "); MODULE_DESCRIPTION("In-kernel test for LUO mechanism"); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 7638d75b27db..91b3e027b753 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -297,6 +297,17 @@ config DEBUG_KMEMLEAK_AUTO_SCAN If unsure, say Y. +config DEBUG_KMEMLEAK_VERBOSE + bool "Default kmemleak to verbose mode" + depends on DEBUG_KMEMLEAK_AUTO_SCAN + help + Say Y here to have kmemleak print unreferenced object details + (backtrace, hex dump, address) to dmesg when new memory leaks are + detected during automatic scanning. This can also be toggled at + runtime via /sys/module/kmemleak/parameters/verbose. + + If unsure, say N. + config PER_VMA_LOCK_STATS bool "Statistics for per-vma locks" depends on PER_VMA_LOCK diff --git a/mm/compaction.c b/mm/compaction.c index 1e8f8eca318c..3648ce22c807 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +static struct lruvec * +compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, + struct compact_control *cc) +{ + struct lruvec *lruvec; + + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + + return lruvec; +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
The lock should be periodically unlocked to avoid @@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; struct lruvec *locked = NULL; struct folio *folio = NULL; @@ -913,7 +931,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -964,7 +982,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* for alloc_contig case */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1053,7 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(page_has_movable_ops(page)) && !PageMovableOpsIsolated(page)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(folio); + if (locked) + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ - if (lruvec != locked) { + if (lruvec != locked || !locked) { if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, folio); - /* * Try get exclusive access under lock. If marked for * skip, the scan is aborted unless the current context @@ -1226,7 +1243,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } folio_put(folio); @@ -1242,7 +1259,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (nr_isolated) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } putback_movable_pages(&cc->migratepages); @@ -1274,7 +1291,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_abort: if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); if (folio) { folio_set_lru(folio); folio_put(folio); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7f04fc3f8c8c..fa9531d8e7f8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1573,35 +1573,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx) return pid; } -/* - * damon_call_handle_inactive_ctx() - handle DAMON call request that added to - * an inactive context. - * @ctx: The inactive DAMON context. - * @control: Control variable of the call request. - * - * This function is called in a case that @control is added to @ctx but @ctx is - * not running (inactive). See if @ctx handled @control or not, and cleanup - * @control if it was not handled. - * - * Returns 0 if @control was handled by @ctx, negative error code otherwise. 
- */ -static int damon_call_handle_inactive_ctx( - struct damon_ctx *ctx, struct damon_call_control *control) -{ - struct damon_call_control *c; - - mutex_lock(&ctx->call_controls_lock); - list_for_each_entry(c, &ctx->call_controls, list) { - if (c == control) { - list_del(&control->list); - mutex_unlock(&ctx->call_controls_lock); - return -EINVAL; - } - } - mutex_unlock(&ctx->call_controls_lock); - return 0; -} - /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. @@ -1619,6 +1590,10 @@ static int damon_call_handle_inactive_ctx( * synchronization. The return value of the function will be saved in * &damon_call_control->return_code. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. */ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) @@ -1629,10 +1604,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) INIT_LIST_HEAD(&control->list); mutex_lock(&ctx->call_controls_lock); + if (ctx->call_controls_obsolete) { + mutex_unlock(&ctx->call_controls_lock); + return -ECANCELED; + } list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); - if (!damon_is_running(ctx)) - return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -1660,6 +1637,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) * passed at least one &damos->apply_interval_us, kdamond marks the request as * completed so that damos_walk() can wakeup and return. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. 
*/ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) @@ -1667,19 +1648,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) init_completion(&control->completion); control->canceled = false; mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control_obsolete) { + mutex_unlock(&ctx->walk_control_lock); + return -ECANCELED; + } if (ctx->walk_control) { mutex_unlock(&ctx->walk_control_lock); return -EBUSY; } ctx->walk_control = control; mutex_unlock(&ctx->walk_control_lock); - if (!damon_is_running(ctx)) { - mutex_lock(&ctx->walk_control_lock); - if (ctx->walk_control == control) - ctx->walk_control = NULL; - mutex_unlock(&ctx->walk_control_lock); - return -EINVAL; - } wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -2239,12 +2217,24 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ #ifdef CONFIG_NUMA +static bool invalid_mem_node(int nid) +{ + return nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY); +} + static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { struct sysinfo i; __kernel_ulong_t numerator; + if (invalid_mem_node(goal->nid)) { + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + return 10000; + } + si_meminfo_node(&i, goal->nid); if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) numerator = i.totalram - i.freeram; @@ -2261,6 +2251,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; + if (invalid_mem_node(goal->nid)) { + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + return 10000; + } + memcg = mem_cgroup_get_from_id(goal->memcg_id); if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) @@ -2452,7 +2449,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) } /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + + if (!time_in_range_open(jiffies, quota->charged_from, + quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { if (damos_quota_is_set(quota) && quota->charged_sz >= quota->esz) @@ -2952,6 +2950,12 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = false; + mutex_unlock(&ctx->call_controls_lock); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = false; + mutex_unlock(&ctx->walk_control_lock); complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); @@ -3062,7 +3066,13 @@ static int kdamond_fn(void *data) damon_destroy_targets(ctx); kfree(ctx->regions_score_histogram); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = true; + mutex_unlock(&ctx->call_controls_lock); kdamond_call(ctx, true); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = true; + mutex_unlock(&ctx->walk_control_lock); damos_walk_cancel(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 60351a719460..99ba346f9e32 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -255,8 +255,11 @@ static int damon_stat_start(void) if (!damon_stat_context) return -ENOMEM; err = damon_start(&damon_stat_context, 1, true); - if (err) + if (err) { + damon_destroy_ctx(damon_stat_context); + damon_stat_context = NULL; return err; + } damon_stat_last_refresh_jiffies = jiffies; call_control.data = 
damon_stat_context; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 42c983821c03..970e077019b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1218,13 +1218,29 @@ split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags static struct deferred_split *folio_split_queue_lock(struct folio *folio) { - return split_queue_lock(folio_nid(folio), folio_memcg(folio)); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); + /* + * The memcg destruction path is acquiring the split queue lock for + * reparenting. Once you have it locked, it's safe to drop the rcu lock. + */ + rcu_read_unlock(); + + return queue; } static struct deferred_split * folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) { - return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + rcu_read_unlock(); + + return queue; } static inline void split_queue_unlock(struct deferred_split *queue) @@ -3994,7 +4010,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); if (do_lru) - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); if (ci) swap_cluster_unlock(ci); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9413ed497be5..f24bf49be047 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4218,6 +4218,9 @@ static __init int hugetlb_add_param(char *s, int (*setup)(char *)) size_t len; char *p; + if (!s) + return -EINVAL; + if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; @@ -4784,6 +4787,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops = { + .can_userfault = hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 
@@ -4797,6 +4812,9 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &hugetlb_uffd_ops, +#endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fa8201e23222..2eff0d6b622b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -241,7 +241,7 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; -static bool kmemleak_verbose; +static bool kmemleak_verbose = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_VERBOSE); module_param_named(verbose, kmemleak_verbose, bool, 0600); static void kmemleak_disable(void); diff --git a/mm/memblock.c b/mm/memblock.c index 5629844b2804..a6a1c91e276d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2601,7 +2601,7 @@ static int __init prepare_kho_fdt(void) if (err) goto err_unpreserve_fdt; - err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt)); if (err) goto err_unpreserve_fdt; @@ -2646,7 +2646,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void) if (fdt) return fdt; - err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL); if (err) { if (err != -ENOENT) pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 437cd25784fe..433bba9dfe71 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) void memcg1_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + struct obj_cgroup *objcg; unsigned int nr_entries; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -624,12 +625,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) if (!do_memsw_account()) return; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* * In case the memcg owning these pages has been offlined and doesn't * have an ID allocated to it anymore, charge the closest online @@ -644,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) folio_unqueue_deferred_split(folio); folio->memcg_data = 0; - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { @@ -665,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) preempt_enable_nested(); memcg1_check_events(memcg, folio_nid(folio)); - css_put(&memcg->css); + rcu_read_unlock(); + obj_cgroup_put(objcg); } /* @@ -1884,6 +1887,22 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) + reparent_memcg_state_local(memcg, parent, memcg1_stats[i]); +} + +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < NR_LRU_LISTS; i++) + reparent_memcg_lruvec_state_local(memcg, parent, i); +} + void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { unsigned long memory, memsw; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 
1b969294ea6a..f92f81108d5e 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid); void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages); static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 051b82ebf371..c3d98ab41f1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -206,26 +206,100 @@ static struct obj_cgroup *obj_cgroup_alloc(void) return objcg; } -static void memcg_reparent_objcgs(struct mem_cgroup *memcg, - struct mem_cgroup *parent) +static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg, + struct mem_cgroup *parent, + int nid) { struct obj_cgroup *objcg, *iter; + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid]; - objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - - spin_lock_irq(&objcg_lock); - + objcg = rcu_replace_pointer(pn->objcg, NULL, true); /* 1) Ready to reparent active objcg. */ - list_add(&objcg->list, &memcg->objcg_list); + list_add(&objcg->list, &pn->objcg_list); /* 2) Reparent active objcg and already reparented objcgs to parent. */ - list_for_each_entry(iter, &memcg->objcg_list, list) + list_for_each_entry(iter, &pn->objcg_list, list) WRITE_ONCE(iter->memcg, parent); /* 3) Move already reparented objcgs to the parent's list */ - list_splice(&memcg->objcg_list, &parent->objcg_list); + list_splice(&pn->objcg_list, &parent_pn->objcg_list); + return objcg; +} + +#ifdef CONFIG_MEMCG_V1 +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force); + +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + /* + * Reparent stats exposed non-hierarchically. Flush @memcg's stats first + * to read its stats accurately , and conservatively flush @parent's + * stats after reparenting to avoid hiding a potentially large stat + * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()). + */ + __mem_cgroup_flush_stats(memcg, true); + + /* The following counts are all non-hierarchical and need to be reparented. 
*/ + reparent_memcg1_state_local(memcg, parent); + reparent_memcg1_lruvec_state_local(memcg, parent); + + __mem_cgroup_flush_stats(parent, true); +} +#else +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ +} +#endif + +static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_lock_irq(&objcg_lock); + spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1); + spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2); +} + +static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock); + spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock); spin_unlock_irq(&objcg_lock); +} - percpu_ref_kill(&objcg->refcnt); +static void memcg_reparent_objcgs(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + int nid; + + for_each_node(nid) { +retry: + if (lru_gen_enabled()) + max_lru_gen_memcg(parent, nid); + + reparent_locks(memcg, parent, nid); + + if (lru_gen_enabled()) { + if (!recheck_lru_gen_max_memcg(parent, nid)) { + reparent_unlocks(memcg, parent, nid); + cond_resched(); + goto retry; + } + lru_gen_reparent_memcg(memcg, parent, nid); + } else { + lru_reparent_memcg(memcg, parent, nid); + } + + objcg = __memcg_reparent_objcgs(memcg, parent, nid); + + reparent_unlocks(memcg, parent, nid); + + percpu_ref_kill(&objcg->refcnt); + } + + reparent_state_local(memcg, parent); } /* @@ -241,7 +315,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); /** - * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated @@ -251,14 +325,16 @@ EXPORT_SYMBOL(memcg_bpf_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) - memcg = root_mem_cgroup; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - return &memcg->css; + memcg = get_mem_cgroup_from_folio(folio); + + return memcg ? 
&memcg->css : &root_mem_cgroup->css; } /** @@ -449,6 +525,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +#ifdef CONFIG_MEMCG_V1 +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val); + +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + int nid; + + for_each_node(nid) { + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + unsigned long value = lruvec_page_state_local(child_lruvec, idx); + struct mem_cgroup_per_node *child_pn, *parent_pn; + + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec); + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec); + + __mod_memcg_lruvec_state(child_pn, idx, -value); + __mod_memcg_lruvec_state(parent_pn, idx, value); + } +} +#endif + /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { #ifdef CONFIG_MEMCG_V1 @@ -508,7 +608,7 @@ static inline int memcg_events_index(enum vm_event_item idx) struct memcg_vmstats_percpu { /* Stats updates since the last flush */ - unsigned int stats_updates; + unsigned long stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ struct memcg_vmstats_percpu __percpu *parent_pcpu; @@ -539,7 +639,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic_t stats_updates; + atomic_long_t stats_updates; }; /* @@ -565,16 +665,16 @@ static u64 flush_last_time; static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic_read(&vmstats->stats_updates) > + return atomic_long_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, long val, int cpu) { struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - unsigned int stats_updates; + unsigned long stats_updates; if (!val) return; @@ -597,7 +697,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, continue; stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); - atomic_add(stats_updates, &statc->vmstats->stats_updates); + atomic_long_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -605,7 +705,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_long_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -684,14 +784,76 @@ static int memcg_page_state_unit(int item); * Normalize the value passed into memcg_rstat_updated() to be in pages. Round * up non-zero sub-page updates to 1 page as zero page updates are ignored. */ -static int memcg_state_val_in_pages(int idx, int val) +static long memcg_state_val_in_pages(int idx, long val) { int unit = memcg_page_state_unit(idx); + long res; if (!val || unit == PAGE_SIZE) return val; - else - return max(val * unit / PAGE_SIZE, 1UL); + + /* Get the absolute value of (val * unit / PAGE_SIZE). */ + res = mult_frac(abs(val), unit, PAGE_SIZE); + /* Round up zero values. */ + res = res ? : 1; + + return val < 0 ? 
-res : res; +} + +#ifdef CONFIG_MEMCG_V1 +/* + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with + * reparenting of non-hierarchical state_locals. + */ +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return memcg; + + rcu_read_lock(); + + while (memcg_is_dying(memcg)) + memcg = parent_mem_cgroup(memcg); + + return memcg; +} + +static inline void get_non_dying_memcg_end(void) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + rcu_read_unlock(); +} +#else +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg) +{ + return memcg; +} + +static inline void get_non_dying_memcg_end(void) +{ +} +#endif + +static void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, long val) +{ + int i = memcg_stats_index(idx); + int cpu; + + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) + return; + + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->state[i], val); + val = memcg_state_val_in_pages(idx, val); + memcg_rstat_updated(memcg, val, cpu); + + trace_mod_memcg_state(memcg, idx, val); + + put_cpu(); } /** @@ -703,23 +865,12 @@ static int memcg_state_val_in_pages(int idx, int val) void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { - int i = memcg_stats_index(idx); - int cpu; - if (mem_cgroup_disabled()) return; - if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) - return; - - cpu = get_cpu(); - - this_cpu_add(memcg->vmstats_percpu->state[i], val); - val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val, cpu); - trace_mod_memcg_state(memcg, idx, val); - - put_cpu(); + memcg = get_non_dying_memcg_start(memcg); + __mod_memcg_state(memcg, idx, val); + get_non_dying_memcg_end(); } #ifdef CONFIG_MEMCG_V1 @@ -739,23 +890,27 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) #endif return x; } + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + unsigned long value = memcg_page_state_local(memcg, idx); + + __mod_memcg_state(memcg, idx, -value); + __mod_memcg_state(parent, idx, value); +} #endif -static void mod_memcg_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, - int val) +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val) { - struct mem_cgroup_per_node *pn; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = pn->memcg; int i = memcg_stats_index(idx); int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - memcg = pn->memcg; - cpu = get_cpu(); /* Update memcg */ @@ -771,6 +926,23 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, put_cpu(); } +static void mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, + int val) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = get_non_dying_memcg_start(pn->memcg); + pn = memcg->nodeinfo[pgdat->node_id]; + + __mod_memcg_lruvec_state(pn, idx, val); + + get_non_dying_memcg_end(); +} + /** * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec @@ -991,17 +1163,23 @@ struct mem_cgroup *get_mem_cgroup_from_current(void) /** * 
get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. * @folio: folio from which memcg should be extracted. + * + * See folio_memcg() for folio->objcg/memcg binding rules. */ struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return NULL; + if (!folio_memcg_charged(folio)) + return root_mem_cgroup; + rcu_read_lock(); - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; + do { + memcg = folio_memcg(folio); + } while (unlikely(!css_tryget(&memcg->css))); rcu_read_unlock(); return memcg; } @@ -1198,23 +1376,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } } -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return; - - memcg = folio_memcg(folio); - - if (!memcg) - VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); - else - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); -} -#endif - /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. @@ -1224,14 +1385,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) * - folio_test_lru false * - folio frozen (refcount of 0) * - * Return: The lruvec this folio is on with its lock held. + * Return: The lruvec this folio is on with its lock held and rcu read lock held. */ struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1246,14 +1413,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irq(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1269,15 +1442,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irqsave(&lruvec->lru_lock, *flags); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } return lruvec; } @@ -1293,7 +1472,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * to or just after a page is removed from an lru list. 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages) + int zid, long nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; @@ -1310,7 +1489,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, size = *lru_size; if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", + "%s(%p, %d, %ld): lru_size %ld\n", __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; @@ -2581,17 +2760,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct obj_cgroup *objcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* - * Any of the following ensures page's memcg stability: + * Any of the following ensures folio's objcg stability: * * - the page lock * - LRU isolation * - exclusive reference */ - folio->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)objcg; } #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC @@ -2693,14 +2872,26 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p) static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg = NULL; + int nid = numa_node_id(); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg); - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); if (likely(objcg && obj_cgroup_tryget(objcg))) - break; - objcg = NULL; + return objcg; } + + return NULL; +} + +static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + + rcu_read_lock(); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + return objcg; } @@ -2759,6 +2950,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) { struct mem_cgroup *memcg; struct obj_cgroup *objcg; + int nid = numa_node_id(); if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) return NULL; @@ -2775,53 +2967,39 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) * Objcg reference is kept by the task, so it's safe * to use the objcg by the current task. */ - return objcg; + return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } memcg = this_cpu_read(int_active_memcg); if (unlikely(memcg)) goto from_memcg; - return NULL; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); from_memcg: - objcg = NULL; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { /* * Memcg pointer is protected by scope (see set_active_memcg()) * and is pinning the corresponding objcg, so objcg can't go * away and can be used within the scope without any additional * protection. 
*/ - objcg = rcu_dereference_check(memcg->objcg, 1); + objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1); if (likely(objcg)) - break; + return objcg; } - return objcg; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; - if (!memcg_kmem_online()) - return NULL; - - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); + objcg = folio_objcg(folio); + if (objcg) obj_cgroup_get(objcg); - } else { - struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = __folio_memcg(folio); - if (memcg) - objcg = __get_obj_cgroup_from_memcg(memcg); - else - objcg = NULL; - rcu_read_unlock(); - } return objcg; } @@ -2922,7 +3100,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) int ret = 0; objcg = current_obj_cgroup(); - if (objcg) { + if (objcg && !obj_cgroup_is_root(objcg)) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { obj_cgroup_get(objcg); @@ -3251,7 +3429,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * obj_cgroup_get() is used to get a permanent reference. */ objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; /* @@ -3383,33 +3561,20 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, return; new_refs = (1 << (old_order - new_order)) - 1; - css_get_many(&__folio_memcg(folio)->css, new_refs); + obj_cgroup_get_many(folio_objcg(folio), new_refs); } -static int memcg_online_kmem(struct mem_cgroup *memcg) +static void memcg_online_kmem(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg; - if (mem_cgroup_kmem_disabled()) - return 0; + return; if (unlikely(mem_cgroup_is_root(memcg))) - return 0; - - objcg = obj_cgroup_alloc(); - if (!objcg) - return -ENOMEM; - - objcg->memcg = memcg; - rcu_assign_pointer(memcg->objcg, objcg); - obj_cgroup_get(objcg); - memcg->orig_objcg = objcg; + return; static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; - - return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) @@ -3423,16 +3588,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) return; parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - memcg_reparent_list_lrus(memcg, parent); - - /* - * Objcg's reparenting must be after list_lru's, make sure list_lru - * helpers won't use parent's list_lru until child is drained. 
- */ - memcg_reparent_objcgs(memcg, parent); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -3705,8 +3861,6 @@ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, un break; } memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; } return memcg; } @@ -3771,6 +3925,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn->lruvec_stats_percpu) goto fail; + INIT_LIST_HEAD(&pn->objcg_list); + lruvec_init(&pn->lruvec); pn->memcg = memcg; @@ -3785,10 +3941,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + continue; - for_each_node(node) - free_mem_cgroup_per_node_info(memcg->nodeinfo[node]); + obj_cgroup_put(pn->orig_objcg); + free_mem_cgroup_per_node_info(pn); + } memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); @@ -3859,7 +4019,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) #endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->objcg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -3935,9 +4094,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct obj_cgroup *objcg; + int nid; - if (memcg_online_kmem(memcg)) - goto remove_id; + memcg_online_kmem(memcg); /* * A memcg must be visible for expand_shrinker_info() @@ -3947,6 +4107,20 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; + for_each_node(nid) { + objcg = obj_cgroup_alloc(); + if (!objcg) + goto free_objcg; + + if (unlikely(mem_cgroup_is_root(memcg))) + objcg->is_root = true; + + objcg->memcg = memcg; + rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg); + obj_cgroup_get(objcg); + memcg->nodeinfo[nid]->orig_objcg = objcg; + } + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); @@ -3969,9 +4143,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; +free_objcg: + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + + objcg = rcu_replace_pointer(pn->objcg, NULL, true); + if (objcg) + percpu_ref_kill(&objcg->refcnt); + + if (pn->orig_objcg) { + obj_cgroup_put(pn->orig_objcg); + /* + * Reset pn->orig_objcg to NULL to prevent + * obj_cgroup_put() from being called again in + * __mem_cgroup_free(). + */ + pn->orig_objcg = NULL; + } + } + free_shrinker_info(memcg); offline_kmem: memcg_offline_kmem(memcg); -remove_id: mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3989,6 +4181,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); + /* + * The reparenting of objcg must be after the reparenting of the + * list_lru and deferred_split_queue above, which ensures that they will + * not mistakenly get the parent list_lru and deferred_split_queue. 
+ */ + memcg_reparent_objcgs(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); @@ -4221,8 +4419,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic_read(&memcg->vmstats->stats_updates)) - atomic_set(&memcg->vmstats->stats_updates, 0); + if (atomic_long_read(&memcg->vmstats->stats_updates)) + atomic_long_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) @@ -4799,16 +4997,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - int ret; + int ret = 0; + struct obj_cgroup *objcg; - ret = try_charge(memcg, gfp, folio_nr_pages(folio)); - if (ret) - goto out; - - css_get(&memcg->css); - commit_charge(folio, memcg); + objcg = get_obj_cgroup_from_memcg(memcg); + /* Do not account at the root objcg level. */ + if (!obj_cgroup_is_root(objcg)) + ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio)); + if (ret) { + obj_cgroup_put(objcg); + return ret; + } + commit_charge(folio, objcg); memcg1_commit_charge(folio, memcg); -out: + return ret; } @@ -4894,7 +5096,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, } struct uncharge_gather { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; @@ -4908,58 +5110,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(ug->objcg); if (ug->nr_memory) { - memcg_uncharge(ug->memcg, ug->nr_memory); + memcg_uncharge(memcg, ug->nr_memory); if (ug->nr_kmem) { - mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); - memcg1_account_kmem(ug->memcg, -ug->nr_kmem); + mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem); + memcg1_account_kmem(memcg, -ug->nr_kmem); } - memcg1_oom_recover(ug->memcg); + memcg1_oom_recover(memcg); } - memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); + memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid); + rcu_read_unlock(); /* drop reference from uncharge_folio */ - css_put(&ug->memcg->css); + obj_cgroup_put(ug->objcg); } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { long nr_pages; - struct mem_cgroup *memcg; struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * folio memcg or objcg at this point, we have fully - * exclusive access to the folio. + * folio objcg at this point, we have fully exclusive + * access to the folio. */ - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); - /* - * This get matches the put at the end of the function and - * kmem pages do not hold memcg references anymore. 
- */ - memcg = get_mem_cgroup_from_objcg(objcg); - } else { - memcg = __folio_memcg(folio); - } - - if (!memcg) + objcg = folio_objcg(folio); + if (!objcg) return; - if (ug->memcg != memcg) { - if (ug->memcg) { + if (ug->objcg != objcg) { + if (ug->objcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = memcg; + ug->objcg = objcg; ug->nid = folio_nid(folio); - /* pairs with css_put in uncharge_batch */ - css_get(&memcg->css); + /* pairs with obj_cgroup_put in uncharge_batch */ + obj_cgroup_get(objcg); } nr_pages = folio_nr_pages(folio); @@ -4967,20 +5163,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - - folio->memcg_data = 0; - obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) ug->nr_memory += nr_pages; ug->pgpgout++; WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); - folio->memcg_data = 0; } - css_put(&memcg->css); + folio->memcg_data = 0; + obj_cgroup_put(objcg); } void __mem_cgroup_uncharge(struct folio *folio) @@ -5004,7 +5197,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) uncharge_gather_clear(&ug); for (i = 0; i < folios->nr; i++) uncharge_folio(folios->folios[i], &ug); - if (ug.memcg) + if (ug.objcg) uncharge_batch(&ug); } @@ -5021,6 +5214,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(new); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); @@ -5035,21 +5229,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) if (folio_memcg_charged(new)) return; - memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); - if (!memcg) + objcg = folio_objcg(old); + VM_WARN_ON_ONCE_FOLIO(!objcg, old); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* Force-charge the new page. The old one will be freed soon */ - if (!mem_cgroup_is_root(memcg)) { + if (!obj_cgroup_is_root(objcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); } - css_get(&memcg->css); - commit_charge(new, memcg); + obj_cgroup_get(objcg); + commit_charge(new, objcg); memcg1_commit_charge(new, memcg); + rcu_read_unlock(); } /** @@ -5065,7 +5262,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) */ void mem_cgroup_migrate(struct folio *old, struct folio *new) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -5076,18 +5273,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) if (mem_cgroup_disabled()) return; - memcg = folio_memcg(old); + objcg = folio_objcg(old); /* - * Note that it is normal to see !memcg for a hugetlb folio. + * Note that it is normal to see !objcg for a hugetlb folio. * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. 
*/ - VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); - if (!memcg) + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old); + if (!objcg) return; - /* Transfer the charge and the css ref */ - commit_charge(new, memcg); + /* Transfer the charge and the objcg ref */ + commit_charge(new, objcg); /* Warning should never happen, so don't worry about refcount non-0 */ WARN_ON_ONCE(folio_unqueue_deferred_split(old)); @@ -5270,22 +5467,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; if (do_memsw_account()) return 0; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return 0; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + rcu_read_unlock(); return 0; } memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); + /* memcg is pined by memcg ID. */ + rcu_read_unlock(); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { @@ -5343,27 +5545,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; + bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; - if (do_memsw_account()) - return false; + if (do_memsw_account() || !folio_memcg_charged(folio)) + return ret; + rcu_read_lock(); memcg = folio_memcg(folio); - if (!memcg) - return false; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || - usage * 2 >= READ_ONCE(memcg->swap.max)) - return true; + usage * 2 >= READ_ONCE(memcg->swap.max)) { + ret = true; + break; + } } + rcu_read_unlock(); - return false; + return ret; } static int __init setup_swap_account(char *s) @@ -5559,6 +5763,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); /* PF_MEMALLOC context, charging must succeed */ @@ -5588,6 +5795,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + obj_cgroup_uncharge(objcg, size); rcu_read_lock(); diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index bc7f4f045edf..b02b503c750d 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -105,7 +105,6 @@ static int memfd_luo_preserve_folios(struct file *file, if (!size) { *nr_foliosp = 0; *out_folios_ser = NULL; - memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); return 0; } @@ -410,6 +409,7 @@ static int memfd_luo_retrieve_folios(struct file *file, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; struct folio *folio; + long npages, nr_added_pages = 0; int err = -EIO; long i; @@ -456,21 +456,26 @@ static int memfd_luo_retrieve_folios(struct file *file, if (flags & MEMFD_LUO_FOLIO_DIRTY) folio_mark_dirty(folio); - err = shmem_inode_acct_blocks(inode, 1); + npages = folio_nr_pages(folio); + err = shmem_inode_acct_blocks(inode, npages); if (err) { - pr_err("shmem: failed to account folio index %ld: %d\n", - i, err); - goto 
unlock_folio; + pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n", + i, npages, err); + goto remove_from_cache; } - shmem_recalc_inode(inode, 1, 0); + nr_added_pages += npages; folio_add_lru(folio); folio_unlock(folio); folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return 0; +remove_from_cache: + filemap_remove_folio(folio); unlock_folio: folio_unlock(folio); folio_put(folio); @@ -481,12 +486,19 @@ static int memfd_luo_retrieve_folios(struct file *file, */ for (long j = i + 1; j < nr_folios; j++) { const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; + phys_addr_t phys; - folio = kho_restore_folio(pfolio->pfn); + if (!pfolio->pfn) + continue; + + phys = PFN_PHYS(pfolio->pfn); + folio = kho_restore_folio(phys); if (folio) folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return err; } @@ -525,7 +537,7 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) } vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); - file->f_inode->i_size = ser->size; + i_size_write(file_inode(file), ser->size); if (ser->nr_folios) { folios_ser = kho_restore_vmalloc(&ser->folios); @@ -560,6 +572,11 @@ static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, return shmem_file(file) && !inode->i_nlink; } +static unsigned long memfd_luo_get_id(struct file *file) +{ + return (unsigned long)file_inode(file); +} + static const struct liveupdate_file_ops memfd_luo_file_ops = { .freeze = memfd_luo_freeze, .finish = memfd_luo_finish, @@ -567,6 +584,7 @@ static const struct liveupdate_file_ops memfd_luo_file_ops = { .preserve = memfd_luo_preserve, .unpreserve = memfd_luo_unpreserve, .can_preserve = memfd_luo_can_preserve, + .get_id = memfd_luo_get_id, .owner = THIS_MODULE, }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2e136b738889..99179314a444 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3706,18 +3706,19 @@ static ssize_t weighted_interleave_auto_store(struct kobject *kobj, new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); - if (!input) { - old_wi_state = rcu_dereference_protected(wi_state, - lockdep_is_held(&wi_state_lock)); - if (!old_wi_state) - goto update_wi_state; - if (input == old_wi_state->mode_auto) { - mutex_unlock(&wi_state_lock); - return count; - } + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); - memcpy(new_wi_state->iw_table, old_wi_state->iw_table, - nr_node_ids * sizeof(u8)); + if (old_wi_state && input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return count; + } + + if (!input) { + if (old_wi_state) + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); goto update_wi_state; } diff --git a/mm/migrate.c b/mm/migrate.c index 76142a02192b..8a64291ab5b4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -672,6 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; + rcu_read_lock(); memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); @@ -699,6 +700,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } + rcu_read_unlock(); } local_irq_enable(); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 2912eba575d5..fbfe5715f635 100644 --- a/mm/migrate_device.c +++ 
b/mm/migrate_device.c @@ -175,12 +175,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, return migrate_vma_collect_skip(start, end, walk); } - if (softleaf_is_migration(entry)) { - softleaf_entry_wait_on_locked(entry, ptl); - spin_unlock(ptl); - return -EAGAIN; - } - if (softleaf_is_device_private_write(entry)) write = MIGRATE_PFN_WRITE; } else { diff --git a/mm/mlock.c b/mm/mlock.c index fdbd1434a35f..8c227fefa2df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) } if (lruvec) - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folios_put(fbatch); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 110d47a36d4b..9cbf932b028c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, } /* Set nr_ptes number of ptes, starting from idx */ -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, - int idx, bool set_write, struct mmu_gather *tlb) +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb) { /* * Advance the position in the batch by idx; note that if idx > 0, @@ -143,7 +143,7 @@ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long add * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce * that the ptes point to consecutive pages of the same anon large folio. */ -static int page_anon_exclusive_sub_batch(int start_idx, int max_len, +static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len, struct page *first_page, bool expected_anon_exclusive) { int idx; @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len, * pte of the batch. Therefore, we must individually check all pages and * retrieve sub-batches. 
*/ -static void commit_anon_folio_batch(struct vm_area_struct *vma, +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma, struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -188,7 +188,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma, } } -static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, +static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -211,6 +211,111 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb); } +static long change_softleaf_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags) +{ + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + softleaf_t entry = softleaf_from_pte(oldpte); + pte_t newpte; + + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); + + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + } else if (softleaf_is_device_private_write(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_nonpresent_pte() for explanation. + */ + entry = make_readable_device_private_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (softleaf_is_marker(entry)) { + /* + * Ignore error swap entries unconditionally, + * because any access should sigbus/sigsegv + * anyway. + */ + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) + return 0; + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. + */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + return 1; + } + return 0; + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + return 1; + } + return 0; +} + +static __always_inline void change_present_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + int nr_ptes, unsigned long end, pgprot_t newprot, + struct folio *folio, struct page *page, unsigned long cp_flags) +{ + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + pte_t ptent, oldpte; + + oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes); + ptent = pte_modify(oldpte, newprot); + + if (uffd_wp) + ptent = pte_mkuffd_wp(ptent); + else if (uffd_wp_resolve) + ptent = pte_clear_uffd_wp(ptent); + + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. 
+ * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. + */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent)) + set_write_prot_commit_flush_ptes(vma, folio, page, + addr, ptep, oldpte, ptent, nr_ptes, tlb); + else + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, + nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); +} + static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -221,7 +326,6 @@ static long change_pte_range(struct mmu_gather *tlb, bool is_private_single_threaded; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; - bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; int nr_ptes; tlb_change_page_size(tlb, PAGE_SIZE); @@ -242,7 +346,6 @@ static long change_pte_range(struct mmu_gather *tlb, int max_nr_ptes = (end - addr) >> PAGE_SHIFT; struct folio *folio = NULL; struct page *page; - pte_t ptent; /* Already in the desired state. */ if (prot_numa && pte_protnone(oldpte)) @@ -268,34 +371,20 @@ static long change_pte_range(struct mmu_gather *tlb, nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); - oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); - ptent = pte_modify(oldpte, newprot); - - if (uffd_wp) - ptent = pte_mkuffd_wp(ptent); - else if (uffd_wp_resolve) - ptent = pte_clear_uffd_wp(ptent); - /* - * In some writable, shared mappings, we might want - * to catch actual write access -- see - * vma_wants_writenotify(). - * - * In all writable, private mappings, we have to - * properly handle COW. - * - * In both cases, we can sometimes still change PTEs - * writable and avoid the write-fault handler, for - * example, if a PTE is already dirty and no other - * COW or special handling is required. + * Optimize for the small-folio common case by + * special-casing it here. Compiler constant propagation + * plus copious amounts of __always_inline does wonders. */ - if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent)) - set_write_prot_commit_flush_ptes(vma, folio, page, - addr, pte, oldpte, ptent, nr_ptes, tlb); - else - prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, - nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); + if (likely(nr_ptes == 1)) { + change_present_ptes(tlb, vma, addr, pte, 1, + end, newprot, folio, page, cp_flags); + } else { + change_present_ptes(tlb, vma, addr, pte, + nr_ptes, end, newprot, folio, page, + cp_flags); + } + pages += nr_ptes; } else if (pte_none(oldpte)) { /* @@ -317,66 +406,7 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } else { - softleaf_t entry = softleaf_from_pte(oldpte); - pte_t newpte; - - if (softleaf_is_migration_write(entry)) { - const struct folio *folio = softleaf_to_folio(entry); - - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - } else if (softleaf_is_device_private_write(entry)) { - /* - * We do not preserve soft-dirtiness. See - * copy_nonpresent_pte() for explanation. 
- */ - entry = make_readable_device_private_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (softleaf_is_marker(entry)) { - /* - * Ignore error swap entries unconditionally, - * because any access should sigbus/sigsegv - * anyway. - */ - if (softleaf_is_poison_marker(entry) || - softleaf_is_guard_marker(entry)) - continue; - /* - * If this is uffd-wp pte marker and we'd like - * to unprotect it, drop it; the next page - * fault will trigger without uffd trapping. - */ - if (uffd_wp_resolve) { - pte_clear(vma->vm_mm, addr, pte); - pages++; - } - continue; - } else { - newpte = oldpte; - } - - if (uffd_wp) - newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - - if (!pte_same(oldpte, newpte)) { - set_pte_at(vma->vm_mm, addr, pte, newpte); - pages++; - } + pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags); } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); lazy_mmu_mode_disable(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f349ca85b70e..65e205111553 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1242,10 +1242,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task, union pgtag_ref_handle handle; union codetag_ref ref; - if (get_page_tag_ref(page, &ref, &handle)) { + if (likely(get_page_tag_ref(page, &ref, &handle))) { alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); update_page_tag_ref(handle, &ref); put_page_tag_ref(handle); + } else { + /* + * page_ext is not available yet, record the pfn so we can + * clear the tag ref later when page_ext is initialized. + */ + alloc_tag_add_early_pfn(page_to_pfn(page)); + if (task->alloc_tag) + alloc_tag_set_inaccurate(task->alloc_tag); } } diff --git a/mm/page_io.c b/mm/page_io.c index 330abc5ab7b4..70cea9e24d2f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -276,10 +276,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; } + + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { + rcu_read_unlock(); folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } + rcu_read_unlock(); __swap_writepage(folio, swap_plug); return 0; @@ -307,11 +311,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct cgroup_subsys_state *css; struct mem_cgroup *memcg; - memcg = folio_memcg(folio); - if (!memcg) + if (!folio_memcg_charged(folio)) return; rcu_read_lock(); + memcg = folio_memcg(folio); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); @@ -493,7 +497,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) folio_mark_uptodate(folio); folio_unlock(folio); } - count_vm_events(PSWPIN, sio->pages); + count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); diff --git a/mm/percpu.c b/mm/percpu.c index a2107bdebf0b..b0676b8054ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, return true; objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) diff --git a/mm/shmem.c b/mm/shmem.c index 19bf77925fa1..3b5dc21b323c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c 
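The pcpu_memcg_pre_alloc_hook() hunk above repeats a pattern used throughout this series (see also the __memcg_kmem_charge_page() and __memcg_slab_post_alloc_hook() hunks): current_obj_cgroup() now hands back the root memcg's per-node objcg instead of NULL, so callers have to skip accounting for the root explicitly. A minimal sketch of the predicate, assuming it does nothing more than test the is_root flag set in mem_cgroup_css_online() (the real definition lives in a header outside these hunks):

	static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
	{
		/* Set once at css_online time for the root memcg's per-node objcgs. */
		return objcg->is_root;
	}

With that helper, the charge paths reduce to the shape shown in the hunk above:

	objcg = current_obj_cgroup();
	if (!objcg || obj_cgroup_is_root(objcg))
		return true;	/* the root cgroup is never accounted */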
@@ -3177,119 +3177,99 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) { - struct inode *inode = file_inode(dst_vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); + struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - void *page_kaddr; struct folio *folio; - int ret; - pgoff_t max_off; - if (shmem_inode_acct_blocks(inode, 1)) { - /* - * We may have got a page, returned -ENOENT triggering a retry, - * and now we find ourselves with -ENOMEM. Release the page, to - * avoid a BUG_ON in our caller. - */ - if (unlikely(*foliop)) { - folio_put(*foliop); - *foliop = NULL; - } - return -ENOMEM; + if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) + return NULL; + + folio = shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; } - if (!*foliop) { - ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, 0, info, pgoff); - if (!folio) - goto out_unacct_blocks; + return folio; +} - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { - page_kaddr = kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret = copy_from_user(page_kaddr, - (const void __user *)src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(page_kaddr); +static int shmem_mfill_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + pgoff_t pgoff = linear_page_index(vma, addr); + gfp_t gfp = mapping_gfp_mask(mapping); + int err; - /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - *foliop = folio; - ret = -ENOENT; - /* don't free the page */ - goto out_unacct_blocks; - } - - flush_dcache_folio(folio); - } else { /* ZEROPAGE */ - clear_user_highpage(&folio->page, dst_addr); - } - } else { - folio = *foliop; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *foliop = NULL; - } - - VM_BUG_ON(folio_test_locked(folio)); - VM_BUG_ON(folio_test_swapbacked(folio)); __folio_set_locked(folio); __folio_set_swapbacked(folio); - __folio_mark_uptodate(folio); - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(pgoff >= max_off)) - goto out_release; + err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (err) + goto err_unlock; - ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); - if (ret) - goto out_release; - - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); - if (ret) - goto out_delete_from_cache; + if (shmem_inode_acct_blocks(inode, 1)) { + err = -ENOMEM; + goto err_delete_from_cache; + } + folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); - folio_unlock(folio); + return 0; -out_delete_from_cache: + +err_delete_from_cache: filemap_remove_folio(folio); -out_release: +err_unlock: folio_unlock(folio); - folio_put(folio); -out_unacct_blocks: - shmem_inode_unacct_blocks(inode, 1); - return ret; + return err; } + +static void shmem_mfill_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); + folio_unlock(folio); +} + +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) +{ + struct folio *folio; + int err; + + err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (err) + return ERR_PTR(err); + + return folio; +} + +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops = { + .can_userfault = shmem_can_userfault, + .get_folio_noalloc = shmem_get_folio_noalloc, + .alloc_folio = shmem_mfill_folio_alloc, + .filemap_add = shmem_mfill_filemap_add, + .filemap_remove = shmem_mfill_filemap_remove, +}; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS @@ -5325,6 +5305,9 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { @@ -5334,6 +5317,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; int shmem_init_fs_context(struct fs_context *fc) diff --git a/mm/shrinker.c b/mm/shrinker.c index c23086bccf4d..76b3f750cf65 100644 --- 
a/mm/shrinker.c +++ b/mm/shrinker.c @@ -288,14 +288,10 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; - struct mem_cgroup *parent; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { diff --git a/mm/sparse.c b/mm/sparse.c index 007fd52c621e..effdac6b0ab1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -403,7 +403,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, ms = __nr_to_section(pnum); if (!preinited_vmemmap_section(ms)) ms->section_mem_map = 0; - ms->section_mem_map = 0; } } diff --git a/mm/swap.c b/mm/swap.c index 78b4aa811fc6..5cc44f0de987 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio) __page_cache_release(folio, &lruvec, &flags); if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) @@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } @@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio) void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) + __releases(rcu) { unsigned long cost; @@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); return; } @@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); - if (!lruvec) + if (!lruvec) { + rcu_read_unlock(); break; + } spin_lock_irq(&lruvec->lru_lock); } } @@ -349,7 +353,7 @@ void folio_activate(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif @@ -412,18 +416,20 @@ static void lru_gen_inc_refs(struct folio *folio) static bool lru_gen_clear_refs(struct folio *folio) { - struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); + unsigned long seq; if (gen < 0) return true; set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); - lrugen = &folio_lruvec(folio)->lrugen; + rcu_read_lock(); + seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]); + rcu_read_unlock(); /* whether can do without shuffling under the LRU lock */ - return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); + return gen == lru_gen_from_seq(seq); } #else /* !CONFIG_LRU_GEN */ @@ -963,7 +969,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (folio_is_zone_device(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) @@ -977,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { - 
unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); @@ -991,7 +997,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) j++; } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; @@ -1084,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +#ifdef CONFIG_MEMCG +static void lruvec_reparent_lru(struct lruvec *child_lruvec, + struct lruvec *parent_lruvec, + enum lru_list lru, int nid) +{ + int zid; + struct zone *zone; + + if (lru != LRU_UNEVICTABLE) + list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } +} + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + enum lru_list lru; + struct lruvec *child_lruvec, *parent_lruvec; + + child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + parent_lruvec->anon_cost += child_lruvec->anon_cost; + parent_lruvec->file_cost += child_lruvec->file_cost; + + for_each_lru(lru) + lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid); +} +#endif + static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 89879c3ba344..885da1e56466 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,12 +14,61 @@ #include #include #include -#include #include #include #include "internal.h" #include "swap.h" +struct mfill_state { + struct userfaultfd_ctx *ctx; + unsigned long src_start; + unsigned long dst_start; + unsigned long len; + uffd_flags_t flags; + + struct vm_area_struct *vma; + unsigned long src_addr; + unsigned long dst_addr; + pmd_t *pmd; +}; + +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static struct folio *anon_alloc_folio(struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr); + + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +static const struct vm_uffd_ops anon_uffd_ops = { + .can_userfault = anon_can_userfault, + .alloc_folio = anon_alloc_folio, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? 
vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { @@ -143,6 +192,128 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma) } #endif +static void mfill_put_vma(struct mfill_state *state) +{ + if (!state->vma) + return; + + up_read(&state->ctx->map_changing_lock); + uffd_mfill_unlock(state->vma); + state->vma = NULL; +} + +static int mfill_get_vma(struct mfill_state *state) +{ + struct userfaultfd_ctx *ctx = state->ctx; + uffd_flags_t flags = state->flags; + struct vm_area_struct *dst_vma; + const struct vm_uffd_ops *ops; + int err; + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len); + if (IS_ERR(dst_vma)) + return PTR_ERR(dst_vma); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + state->vma = dst_vma; + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err = -EINVAL; + + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + if (is_vm_hugetlb_page(dst_vma)) + return 0; + + ops = vma_uffd_ops(dst_vma); + if (!ops) + goto out_unlock; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && + !ops->get_folio_noalloc) + goto out_unlock; + + return 0; + +out_unlock: + mfill_put_vma(state); + return err; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); +} + +static int mfill_establish_pmd(struct mfill_state *state) +{ + struct mm_struct *dst_mm = state->ctx->mm; + pmd_t *dst_pmd, dst_pmdval; + + dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr); + if (unlikely(!dst_pmd)) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval))) + return -EEXIST; + if (unlikely(pmd_bad(dst_pmdval))) + return -EFAULT; + + state->pmd = dst_pmd; + return 0; +} + /* Check if dst_addr is outside of file's size. Must be called with ptl held. 
*/ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -165,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ -int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags) +static int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -212,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, goto out_unlock; if (page_in_cache) { - /* Usually, cache pages are already added to LRU */ - if (newly_allocated) - folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); @@ -229,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + if (page_in_cache) + folio_unlock(folio); + /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -238,58 +409,100 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, return ret; } -static int mfill_atomic_pte_copy(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) { void *kaddr; int ret; + + kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. 
+ */ + pagefault_disable(); + ret = copy_from_user(kaddr, (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(kaddr); + + if (ret) + return -EFAULT; + + flush_dcache_folio(folio); + return ret; +} + +static int mfill_copy_folio_retry(struct mfill_state *state, struct folio *folio) +{ + unsigned long src_addr = state->src_addr; + void *kaddr; + int err; + + /* retry copying with mm_lock dropped */ + mfill_put_vma(state); + + kaddr = kmap_local_folio(folio, 0); + err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); + kunmap_local(kaddr); + if (unlikely(err)) + return -EFAULT; + + flush_dcache_folio(folio); + + /* reget VMA and PMD, they could change underneath us */ + err = mfill_get_vma(state); + if (err) + return err; + + err = mfill_establish_pmd(state); + if (err) + return err; + + return 0; +} + +static int __mfill_atomic_pte(struct mfill_state *state, + const struct vm_uffd_ops *ops) +{ + unsigned long dst_addr = state->dst_addr; + unsigned long src_addr = state->src_addr; + uffd_flags_t flags = state->flags; struct folio *folio; + int ret; - if (!*foliop) { - ret = -ENOMEM; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr); - if (!folio) - goto out; + folio = ops->alloc_folio(state->vma, state->dst_addr); + if (!folio) + return -ENOMEM; - kaddr = kmap_local_folio(folio, 0); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + ret = mfill_copy_folio_locked(folio, src_addr); /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. + * Fallback to copy_from_user outside mmap_lock. + * If retry is successful, mfill_copy_folio_locked() returns + * with locks retaken by mfill_get_vma(). + * If there was an error, we must mfill_put_vma() anyway and it + * will take care of unlocking if needed. 
*/ - pagefault_disable(); - ret = copy_from_user(kaddr, (const void __user *) src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(kaddr); - - /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - ret = -ENOENT; - *foliop = folio; - /* don't free the page */ - goto out; + ret = mfill_copy_folio_retry(state, folio); + if (ret) + goto err_folio_put; } - - flush_dcache_folio(folio); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + clear_user_highpage(&folio->page, state->dst_addr); } else { - folio = *foliop; - *foliop = NULL; + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); } /* @@ -299,63 +512,65 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, */ __folio_mark_uptodate(folio); - ret = -ENOMEM; - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_release; + if (ops->filemap_add) { + ret = ops->filemap_add(folio, state->vma, state->dst_addr); + if (ret) + goto err_folio_put; + } - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); + ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, + &folio->page, flags); if (ret) - goto out_release; -out: - return ret; -out_release: - folio_put(folio); - goto out; -} - -static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct folio *folio; - int ret = -ENOMEM; - - folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); - if (!folio) - return ret; - - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_put; - - /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * zeroing out the folio become visible before mapping the page - * using set_pte_at(). See do_anonymous_page(). - */ - __folio_mark_uptodate(folio); - - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, 0); - if (ret) - goto out_put; + goto err_filemap_remove; return 0; -out_put: + +err_filemap_remove: + if (ops->filemap_remove) + ops->filemap_remove(folio, state->vma); +err_folio_put: folio_put(folio); return ret; } -static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_copy(struct mfill_state *state) { + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); + + /* + * The normal page fault path for a MAP_PRIVATE mapping in a + * file-backed VMA will invoke the fault, fill the hole in the file and + * COW it right away. The result generates plain anonymous memory. + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll + * generate anonymous memory directly without actually filling the + * hole. For the MAP_PRIVATE case the robustness check only happens in + * the pagetable (to verify it's still none) and not in the page cache. 
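+ * That is why a !VM_SHARED VMA is switched to anon_uffd_ops below,
+ * rather than using the ops reported by vma_uffd_ops() for the
+ * backing file.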
+ */ + if (!(state->vma->vm_flags & VM_SHARED)) + ops = &anon_uffd_ops; + + return __mfill_atomic_pte(state, ops); +} + +static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); + + return __mfill_atomic_pte(state, ops); +} + +static int mfill_atomic_pte_zeropage(struct mfill_state *state) +{ + struct vm_area_struct *dst_vma = state->vma; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; - if (mm_forbids_zeropage(dst_vma->vm_mm)) - return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + if (mm_forbids_zeropage(dst_vma->vm_mm) || + (dst_vma->vm_flags & VM_SHARED)) + return mfill_atomic_pte_zeroed_folio(state); _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); @@ -381,28 +596,29 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, } /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ -static int mfill_atomic_pte_continue(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_continue(struct mfill_state *state) { - struct inode *inode = file_inode(dst_vma->vm_file); + struct vm_area_struct *dst_vma = state->vma; + const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); + unsigned long dst_addr = state->dst_addr; pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct inode *inode = file_inode(dst_vma->vm_file); + uffd_flags_t flags = state->flags; + pmd_t *dst_pmd = state->pmd; struct folio *folio; struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find folio */ - if (ret == -ENOENT) - ret = -EFAULT; - if (ret) - goto out; - if (!folio) { - ret = -EFAULT; - goto out; + if (!ops) { + VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); + return -EOPNOTSUPP; } + folio = ops->get_folio_noalloc(inode, pgoff); + /* Our caller expects us to return -EFAULT if we failed to find folio */ + if (IS_ERR_OR_NULL(folio)) + return -EFAULT; + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; @@ -410,30 +626,28 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, flags); + page, flags); if (ret) goto out_release; - folio_unlock(folio); - ret = 0; -out: - return ret; + return 0; + out_release: folio_unlock(folio); folio_put(folio); - goto out; + return ret; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. 
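 * (The poison is recorded by installing a PTE_MARKER_POISONED pte
 * marker at dst_addr.)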
*/ -static int mfill_atomic_pte_poison(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_poison(struct mfill_state *state) { - int ret; + struct vm_area_struct *dst_vma = state->vma; struct mm_struct *dst_mm = dst_vma->vm_mm; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; + int ret; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); ret = -EAGAIN; @@ -462,27 +676,6 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, return ret; } -static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - pgd = pgd_offset(mm, address); - p4d = p4d_alloc(mm, pgd, address); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, address); - if (!pud) - return NULL; - /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. - */ - return pmd_alloc(mm, pud, address); -} - #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is @@ -657,48 +850,21 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ -static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { - ssize_t err; + uffd_flags_t flags = state->flags; - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { - return mfill_atomic_pte_continue(dst_pmd, dst_vma, - dst_addr, flags); - } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { - return mfill_atomic_pte_poison(dst_pmd, dst_vma, - dst_addr, flags); - } + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + return mfill_atomic_pte_continue(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) + return mfill_atomic_pte_poison(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + return mfill_atomic_pte_copy(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) + return mfill_atomic_pte_zeropage(state); - /* - * The normal page fault path for a shmem will invoke the - * fault, fill the hole in the file and COW it right away. The - * result generates plain anonymous memory. So when we are - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll - * generate anonymous memory directly without actually filling - * the hole. For the MAP_PRIVATE case the robustness check - * only happens in the pagetable (to verify it's still none) - * and not in the radix tree. 
- */ - if (!(dst_vma->vm_flags & VM_SHARED)) { - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err = mfill_atomic_pte_copy(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - else - err = mfill_atomic_pte_zeropage(dst_pmd, - dst_vma, dst_addr); - } else { - err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - } - - return err; + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); + return -EOPNOTSUPP; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, @@ -707,13 +873,17 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long len, uffd_flags_t flags) { - struct mm_struct *dst_mm = ctx->mm; - struct vm_area_struct *dst_vma; + struct mfill_state state = (struct mfill_state){ + .ctx = ctx, + .dst_start = dst_start, + .src_start = src_start, + .flags = flags, + .len = len, + .src_addr = src_start, + .dst_addr = dst_start, + }; + long copied = 0; ssize_t err; - pmd_t *dst_pmd; - unsigned long src_addr, dst_addr; - long copied; - struct folio *folio; /* * Sanitize the command parameters: @@ -725,125 +895,35 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(src_start + len <= src_start); VM_WARN_ON_ONCE(dst_start + len <= dst_start); - src_addr = src_start; - dst_addr = dst_start; - copied = 0; - folio = NULL; -retry: - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); - if (IS_ERR(dst_vma)) { - err = PTR_ERR(dst_vma); + err = mfill_get_vma(&state); + if (err) goto out; - } - - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - down_read(&ctx->map_changing_lock); - err = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) - goto out_unlock; - - err = -EINVAL; - /* - * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but - * it will overwrite vm_ops, so vma_is_anonymous must return false. - */ - if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && - dst_vma->vm_flags & VM_SHARED)) - goto out_unlock; - - /* - * validate 'mode' now that we know the dst_vma: don't allow - * a wrprotect copy if the userfaultfd didn't register as WP. - */ - if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) - goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ - if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + if (is_vm_hugetlb_page(state.vma)) + return mfill_atomic_hugetlb(ctx, state.vma, dst_start, src_start, len, flags); - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) - goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) - goto out_unlock; + while (state.src_addr < src_start + len) { + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len); - while (src_addr < src_start + len) { - pmd_t dst_pmdval; + err = mfill_establish_pmd(&state); + if (err) + break; - VM_WARN_ON_ONCE(dst_addr >= dst_start + len); - - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); - if (unlikely(!dst_pmd)) { - err = -ENOMEM; - break; - } - - dst_pmdval = pmdp_get_lockless(dst_pmd); - if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd))) { - err = -ENOMEM; - break; - } - dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is THP don't override it and just be strict. 
- * (This includes the case where the PMD used to be THP and - * changed back to none after __pte_alloc().) - */ - if (unlikely(!pmd_present(dst_pmdval) || - pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } - if (unlikely(pmd_bad(dst_pmdval))) { - err = -EFAULT; - break; - } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. */ - err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, - src_addr, flags, &folio); + err = mfill_atomic_pte(&state); cond_resched(); - if (unlikely(err == -ENOENT)) { - void *kaddr; - - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); - VM_WARN_ON_ONCE(!folio); - - kaddr = kmap_local_folio(folio, 0); - err = copy_from_user(kaddr, - (const void __user *) src_addr, - PAGE_SIZE); - kunmap_local(kaddr); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - flush_dcache_folio(folio); - goto retry; - } else - VM_WARN_ON_ONCE(folio); - if (!err) { - dst_addr += PAGE_SIZE; - src_addr += PAGE_SIZE; + state.dst_addr += PAGE_SIZE; + state.src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) @@ -853,12 +933,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, break; } -out_unlock: - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); + mfill_put_vma(&state); out: - if (folio) - folio_put(folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); @@ -1938,6 +2014,38 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, return moved ? moved : err; } +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(vma); + + if (vma->vm_flags & VM_DROPPABLE) + return false; + + vm_flags &= __VM_UFFD_FLAGS; + + /* + * If WP is the only mode enabled and context is wp async, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + + /* For any other mode reject VMAs that don't implement vm_uffd_ops */ + if (!ops) + return false; + + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then only anonymous memory is supported + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) + return false; + + return ops->can_userfault(vma, vm_flags); +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t vm_flags) { diff --git a/mm/util.c b/mm/util.c index f063fd4de1e8..232c3930a662 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1281,16 +1281,6 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) } EXPORT_SYMBOL(compat_vma_mmap); -int __vma_check_mmap_hook(struct vm_area_struct *vma) -{ - /* vm_ops->mapped is not valid if mmap() is specified. 
*/ - if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL(__vma_check_mmap_hook); - static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 4bf091b1c8af..bd1b1aa12581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif -/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to - * and including the specified highidx - * @zone: The current zone in the iterator - * @pgdat: The pgdat which node_zones are being iterated - * @idx: The index variable - * @highidx: The index of the highest zone to return - * - * This macro iterates through all managed zones up to and including the specified highidx. - * The zone iterator enters an invalid state after macro call and must be reinitialized - * before it can be used again. - */ -#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ - for ((idx) = 0, (zone) = (pgdat)->node_zones; \ - (idx) <= (highidx); \ - (idx)++, (zone)++) \ - if (!managed_zone(zone)) \ - continue; \ - else - static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -409,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; @@ -1831,7 +1811,7 @@ bool folio_isolate_lru(struct folio *folio) folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); ret = true; } @@ -1885,24 +1865,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * - * Returns the number of pages moved to the given lruvec. + * Returns the number of pages moved to the appropriate lruvec. + * + * Note: The caller must not hold any lruvec lock. */ -static unsigned int move_folios_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct list_head *list) { int nr_pages, nr_moved = 0; + struct lruvec *lruvec = NULL; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); + lruvec = folio_lruvec_relock_irq(folio, lruvec); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); folio_putback_lru(folio); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; continue; } @@ -1924,20 +1907,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; } continue; } - /* - * All pages were isolated from the same lruvec (and isolation - * inhibits memcg migration). 
- */ - VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; @@ -1945,11 +1923,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } + if (lruvec) + lruvec_unlock_irq(lruvec); + if (free_folios.nr) { - spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); } return nr_moved; @@ -1998,7 +1977,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); @@ -2008,7 +1987,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, item, nr_scanned); mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (nr_taken == 0) return 0; @@ -2016,16 +1995,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, lruvec_memcg(lruvec)); - spin_lock_irq(&lruvec->lru_lock); - move_folios_to_lru(lruvec, &folio_list); + move_folios_to_lru(&folio_list); mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, nr_reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2104,7 +2083,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2113,7 +2092,7 @@ static void shrink_active_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, PGREFILL, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); while (!list_empty(&l_hold)) { struct folio *folio; @@ -2162,16 +2141,14 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move folios back to the lru list. */ - spin_lock_irq(&lruvec->lru_lock); + nr_activate = move_folios_to_lru(&l_active); + nr_deactivate = move_folios_to_lru(&l_inactive); - nr_activate = move_folios_to_lru(lruvec, &l_active); - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - - __count_vm_events(PGDEACTIVATE, nr_deactivate); + count_vm_events(PGDEACTIVATE, nr_deactivate); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); @@ -2886,8 +2863,9 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) return NULL; clear_bit(key, &mm->lru_gen.bitmap); + mmgrab(mm); - return mmget_not_zero(mm) ? 
mm : NULL; + return mm; } void lru_gen_add_mm(struct mm_struct *mm) @@ -3087,7 +3065,7 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) - mmput_async(*iter); + mmdrop(*iter); *iter = mm; @@ -3442,8 +3420,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, if (folio_nid(folio) != pgdat->node_id) return NULL; + rcu_read_lock(); if (folio_memcg(folio) != memcg) - return NULL; + folio = NULL; + rcu_read_unlock(); return folio; } @@ -3803,9 +3783,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) } if (walk->batched) { - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); reset_batch_size(walk); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -3965,7 +3945,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness if (seq < READ_ONCE(lrugen->max_seq)) return false; - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -3980,7 +3960,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness if (inc_min_seq(lruvec, type, swappiness)) continue; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); goto restart; } @@ -4015,7 +3995,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); return success; } @@ -4213,12 +4193,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - DEFINE_MAX_SEQ(lruvec); - int gen = lru_gen_from_seq(max_seq); + struct lruvec *lruvec; + struct lru_gen_mm_state *mm_state; + unsigned long max_seq; + int gen; lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4253,6 +4233,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) } } + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + max_seq = READ_ONCE((lruvec)->lrugen.max_seq); + gen = lru_gen_from_seq(max_seq); + mm_state = get_mm_state(lruvec); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4302,6 +4288,8 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); + mem_cgroup_put(memcg); + return true; } @@ -4437,6 +4425,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + return false; + } + + return true; +} + +static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, + struct lruvec *lruvec) +{ + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct 
lru_gen_mm_state *mm_state = get_mm_state(lruvec); + int swappiness = mem_cgroup_swappiness(memcg); + DEFINE_MAX_SEQ(lruvec); + bool success = false; + + /* + * We are not iterating the mm_list here, updating mm_state->seq is just + * to make mm walkers work properly. + */ + if (mm_state) { + spin_lock(&mm_list->lock); + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + if (max_seq > mm_state->seq) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + success = true; + } + spin_unlock(&mm_list->lock); + } else { + success = true; + } + + if (success) + inc_max_seq(lruvec, max_seq, swappiness); +} + +/* + * We need to ensure that the folios of child memcg can be reparented to the + * same gen of the parent memcg, so the gens of the parent memcg needed be + * incremented to the MAX_NR_GENS before reparenting. + */ +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) { + try_to_inc_max_seq_nowalk(memcg, lruvec); + cond_resched(); + } + } +} + +/* + * Compared to traditional LRU, MGLRU faces the following challenges: + * + * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the + * number of generations of the parent and child memcg may be different, + * so we cannot simply transfer MGLRU folios in the child memcg to the + * parent memcg as we did for traditional LRU folios. + * 2. The generation information is stored in folio->flags, but we cannot + * traverse these folios while holding the lru lock, otherwise it may + * cause softlockup. + * 3. In walk_update_folio(), the gen of folio and corresponding lru size + * may be updated, but the folio is not immediately moved to the + * corresponding lru list. Therefore, there may be folios of different + * generations on an LRU list. + * 4. In lru_gen_del_folio(), the generation to which the folio belongs is + * found based on the generation information in folio->flags, and the + * corresponding LRU size will be updated. Therefore, we need to update + * the lru size correctly during reparenting, otherwise the lru size may + * be updated incorrectly in lru_gen_del_folio(). + * + * Finally, we choose a compromise method, which is to splice the lru list in + * the child memcg to the lru list of the same generation in the parent memcg + * during reparenting. + * + * The same generation has different meanings in the parent and child memcg, + * so this compromise method will cause the LRU inversion problem. But as the + * system runs, this problem will be fixed automatically. + */ +static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec, + int zone, int type) +{ + struct lru_gen_folio *child_lrugen, *parent_lrugen; + enum lru_list lru = type * LRU_INACTIVE_FILE; + int i; + + child_lrugen = &child_lruvec->lrugen; + parent_lrugen = &parent_lruvec->lrugen; + + for (i = 0; i < get_nr_gens(child_lruvec, type); i++) { + int gen = lru_gen_from_seq(child_lrugen->max_seq - i); + long nr_pages = child_lrugen->nr_pages[gen][type][zone]; + int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0; + int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? 
LRU_ACTIVE : 0; + + /* Assuming that child pages are colder than parent pages */ + list_splice_tail_init(&child_lrugen->folios[gen][type][zone], + &parent_lrugen->folios[gen][type][zone]); + + WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0); + WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], + parent_lrugen->nr_pages[gen][type][zone] + nr_pages); + + if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { + __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); + __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); + } + } +} + +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + struct lruvec *child_lruvec, *parent_lruvec; + int type, zid; + struct zone *zone; + enum lru_list lru; + + child_lruvec = get_lruvec(memcg, nid); + parent_lruvec = get_lruvec(parent, nid); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) + for (type = 0; type < ANON_AND_FILE; type++) + __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type); + + for_each_lru(lru) { + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } + } +} + #endif /* CONFIG_MEMCG */ /****************************************************************************** @@ -4630,7 +4760,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; /* * To leave a margin for fluctuations, use a larger gain factor (2:3). @@ -4649,7 +4779,7 @@ static int get_tier_idx(struct lruvec *lruvec, int type) static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE; @@ -4707,7 +4837,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); @@ -4716,7 +4846,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (list_empty(&list)) return scanned; @@ -4749,14 +4879,14 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } - spin_lock_irq(&lruvec->lru_lock); - - move_folios_to_lru(lruvec, &list); + move_folios_to_lru(&list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; + lruvec_lock_irq(lruvec); reset_batch_size(walk); + lruvec_unlock_irq(lruvec); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), @@ -4766,8 +4896,6 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, mod_lruvec_state(lruvec, item, reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - list_splice_init(&clean, &list); if (!list_empty(&list)) { @@ -4843,10 +4971,6 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) int i; enum 
zone_watermarks mark; - /* don't abort memcg reclaim to ensure fairness */ - if (!root_reclaim(sc)) - return false; - if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; @@ -4900,9 +5024,24 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * If too many file cache in the coldest generation can't be evicted * due to being dirty, wake up the flusher. */ - if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5196,7 +5335,7 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); @@ -5204,12 +5343,12 @@ static void lru_gen_change_state(bool enabled) lruvec->lrugen.enabled = enabled; while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); } - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -7898,7 +8037,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } diff --git a/mm/vmstat.c b/mm/vmstat.c index c360c1b29ac9..f534972f517d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2141,7 +2141,7 @@ static void vmstat_shepherd(struct work_struct *w) if (cpu_is_isolated(cpu)) continue; - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!work_busy(&dw->work) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } diff --git a/mm/workingset.c b/mm/workingset.c index 37a94979900f..07e6836d0502 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -244,12 +244,15 @@ static void *lru_gen_eviction(struct folio *folio) int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); + unsigned short memcg_id; BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); + rcu_read_lock(); + memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); @@ -257,8 +260,10 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + memcg_id = mem_cgroup_private_id(memcg); + 
rcu_read_unlock(); - return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type); + return pack_shadow(memcg_id, pgdat, token, workingset, type); } /* @@ -541,7 +546,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); - struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; @@ -564,14 +568,12 @@ void workingset_refault(struct folio *folio, void *shadow) * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio)); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) - return; + goto out; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); @@ -587,6 +589,8 @@ void workingset_refault(struct folio *folio, void *shadow) lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } +out: + mem_cgroup_put(memcg); } /** @@ -599,8 +603,11 @@ void workingset_activation(struct folio *folio) * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. */ - if (mem_cgroup_disabled() || folio_memcg_charged(folio)) + if (mem_cgroup_disabled() || folio_memcg_charged(folio)) { + rcu_read_lock(); workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); + rcu_read_unlock(); + } } /* @@ -684,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state_local(lruvec, - NR_LRU_BASE + i); + pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); + pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( diff --git a/mm/zswap.c b/mm/zswap.c index 0823cadd02b6..4b5149173b0e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -242,6 +242,34 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp) **********************************/ static void __zswap_pool_empty(struct percpu_ref *ref); +static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx) +{ + if (!acomp_ctx) + return; + + /* + * If there was an error in allocating @acomp_ctx->req, it + * would be set to NULL. + */ + if (acomp_ctx->req) + acomp_request_free(acomp_ctx->req); + + acomp_ctx->req = NULL; + + /* + * We have to handle both cases here: an error pointer return from + * crypto_alloc_acomp_node(); and a) NULL initialization by zswap, or + * b) NULL assignment done in a previous call to acomp_ctx_free(). + */ + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + + acomp_ctx->acomp = NULL; + + kfree(acomp_ctx->buffer); + acomp_ctx->buffer = NULL; +} + static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; @@ -263,19 +291,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor) strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + /* Many things rely on the zero-initialization. 
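+ * In particular, zswap_cpu_comp_prepare() treats a non-NULL ->acomp
+ * as "already initialized", and acomp_ctx_free() relies on ->req,
+ * ->acomp and ->buffer staying NULL until they are allocated.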
*/ + pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx, + GFP_KERNEL | __GFP_ZERO); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } - for_each_possible_cpu(cpu) - mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); - + /* + * This is serialized against CPU hotplug operations. Hence, cores + * cannot be offlined until this finishes. + */ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + /* + * cpuhp_state_add_instance() will not cleanup on failure since + * we don't register a hotunplug callback. + */ if (ret) - goto error; + goto cpuhp_add_fail; /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool @@ -292,6 +328,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor) ref_fail: cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + +cpuhp_add_fail: + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -322,9 +362,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { + int cpu; + zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); + free_percpu(pool->acomp_ctx); zs_destroy_pool(pool->zs_pool); @@ -664,8 +710,10 @@ void zswap_folio_swapin(struct folio *folio) struct lruvec *lruvec; if (folio) { + rcu_read_lock(); lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); + rcu_read_unlock(); } } @@ -736,39 +784,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp = NULL; - struct acomp_req *req = NULL; - u8 *buffer = NULL; - int ret; - - buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); - if (!buffer) { - ret = -ENOMEM; - goto fail; - } - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %pe\n", - pool->tfm_name, acomp); - ret = PTR_ERR(acomp); - goto fail; - } - - req = acomp_request_alloc(acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto fail; - } + int ret = -ENOMEM; /* - * Only hold the mutex after completing allocations, otherwise we may - * recurse into zswap through reclaim and attempt to hold the mutex - * again resulting in a deadlock. + * To handle cases where the CPU goes through online-offline-online + * transitions, we return if the acomp_ctx has already been initialized. */ - mutex_lock(&acomp_ctx->mutex); + if (acomp_ctx->acomp) { + WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp)); + return 0; + } + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return ret; + + /* + * In case of an error, crypto_alloc_acomp_node() returns an + * error pointer, never NULL. + */ + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp_ctx->acomp)) { + pr_err("could not alloc crypto acomp %s : %pe\n", + pool->tfm_name, acomp_ctx->acomp); + ret = PTR_ERR(acomp_ctx->acomp); + goto fail; + } + + /* acomp_request_alloc() returns NULL in case of an error. 
*/ + acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp); + if (!acomp_ctx->req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + goto fail; + } + crypto_init_wait(&acomp_ctx->wait); /* @@ -776,80 +826,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->buffer = buffer; - acomp_ctx->acomp = acomp; - acomp_ctx->req = req; - mutex_unlock(&acomp_ctx->mutex); + mutex_init(&acomp_ctx->mutex); return 0; fail: - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); + acomp_ctx_free(acomp_ctx); return ret; } -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct acomp_req *req; - struct crypto_acomp *acomp; - u8 *buffer; - - if (IS_ERR_OR_NULL(acomp_ctx)) - return 0; - - mutex_lock(&acomp_ctx->mutex); - req = acomp_ctx->req; - acomp = acomp_ctx->acomp; - buffer = acomp_ctx->buffer; - acomp_ctx->req = NULL; - acomp_ctx->acomp = NULL; - acomp_ctx->buffer = NULL; - mutex_unlock(&acomp_ctx->mutex); - - /* - * Do the actual freeing after releasing the mutex to avoid subtle - * locking dependencies causing deadlocks. - */ - if (!IS_ERR_OR_NULL(req)) - acomp_request_free(req); - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); - - return 0; -} - -static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) -{ - struct crypto_acomp_ctx *acomp_ctx; - - for (;;) { - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - if (likely(acomp_ctx->req)) - return acomp_ctx; - /* - * It is possible that we were migrated to a different CPU after - * getting the per-CPU ctx but before the mutex was acquired. If - * the old CPU got offlined, zswap_cpu_comp_dead() could have - * already freed ctx->req (among other things) and set it to - * NULL. Just try again on the new CPU that we ended up on. - */ - mutex_unlock(&acomp_ctx->mutex); - } -} - -static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) -{ - mutex_unlock(&acomp_ctx->mutex); -} - static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -862,7 +849,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, u8 *dst; bool mapped = false; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); @@ -893,11 +882,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * to the active LRU list in the case. */ if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled( folio_memcg(page_folio(page)))) { + rcu_read_unlock(); comp_ret = comp_ret ? 
comp_ret : -EINVAL; goto unlock; } + rcu_read_unlock(); comp_ret = 0; dlen = PAGE_SIZE; dst = kmap_local_page(page); @@ -925,7 +917,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, else if (alloc_ret) zswap_reject_alloc_fail++; - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); return comp_ret == 0 && alloc_ret == 0; } @@ -937,7 +929,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; int ret = 0, dlen; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length); /* zswap entries of length PAGE_SIZE are not compressed. */ @@ -962,7 +955,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) } zs_obj_read_sg_end(pool->zs_pool, entry->handle); - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); if (!ret && dlen == PAGE_SIZE) return true; @@ -1782,7 +1775,7 @@ static int zswap_setup(void) ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, - zswap_cpu_comp_dead); + NULL); if (ret) goto hp_fail; diff --git a/tools/testing/selftests/liveupdate/liveupdate.c b/tools/testing/selftests/liveupdate/liveupdate.c index c2878e3d5ef9..37c808fbe1e9 100644 --- a/tools/testing/selftests/liveupdate/liveupdate.c +++ b/tools/testing/selftests/liveupdate/liveupdate.c @@ -345,4 +345,45 @@ TEST_F(liveupdate_device, preserve_unsupported_fd) ASSERT_EQ(close(session_fd), 0); } +/* + * Test Case: Prevent Double Preservation + * + * Verifies that a file (memfd) can only be preserved once across all active + * sessions. Attempting to preserve it a second time, whether in the same or + * a different session, should fail with EBUSY. + */ +TEST_F(liveupdate_device, prevent_double_preservation) +{ + int session_fd1, session_fd2, mem_fd; + int ret; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd1 = create_session(self->fd1, "double-preserve-session-1"); + ASSERT_GE(session_fd1, 0); + session_fd2 = create_session(self->fd1, "double-preserve-session-2"); + ASSERT_GE(session_fd2, 0); + + mem_fd = memfd_create("test-memfd", 0); + ASSERT_GE(mem_fd, 0); + + /* First preservation should succeed */ + ASSERT_EQ(preserve_fd(session_fd1, mem_fd, 0x1111), 0); + + /* Second preservation in a different session should fail with EBUSY */ + ret = preserve_fd(session_fd2, mem_fd, 0x2222); + EXPECT_EQ(ret, -EBUSY); + + /* Second preservation in the same session (different token) should fail with EBUSY */ + ret = preserve_fd(session_fd1, mem_fd, 0x3333); + EXPECT_EQ(ret, -EBUSY); + + ASSERT_EQ(close(mem_fd), 0); + ASSERT_EQ(close(session_fd1), 0); + ASSERT_EQ(close(session_fd2), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 447769657634..44f4e703deb9 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -11,6 +11,11 @@ if [[ $(id -u) -ne 0 ]]; then exit $ksft_skip fi +if ! command -v killall >/dev/null 2>&1; then + echo "killall not available. Skipping..." 
+ exit $ksft_skip +fi + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) fault_limit_file=limit_in_bytes diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index dbd21d66d383..48e8b1539be3 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -21,6 +21,7 @@ #include #include #include "vm_util.h" +#include "thp_settings.h" #include "../pidfd/pidfd.h" @@ -2195,6 +2196,9 @@ TEST_F(guard_regions, collapse) char *ptr; int i; + if (!thp_available()) + SKIP(return, "Transparent Hugepages not available\n"); + /* Need file to be correct size for tests for non-anon. */ if (variant->backing != ANON_BACKED) ASSERT_EQ(ftruncate(self->fd, size), 0); diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index e8328c89d855..788689497e92 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -34,6 +34,7 @@ */ #include #include +#include struct hmm_buffer { void *ptr; @@ -548,7 +549,7 @@ TEST_F(hmm, anon_write_child) for (migrate = 0; migrate < 2; ++migrate) { for (use_thp = 0; use_thp < 2; ++use_thp) { - npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE, + npages = ALIGN(use_thp ? read_pmd_pagesize() : HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; ASSERT_NE(npages, 0); size = npages << self->page_shift; @@ -728,7 +729,7 @@ TEST_F(hmm, anon_write_huge) int *ptr; int ret; - size = 2 * TWOMEG; + size = 2 * read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -744,7 +745,7 @@ TEST_F(hmm, anon_write_huge) buffer->fd, 0); ASSERT_NE(buffer->ptr, MAP_FAILED); - size = TWOMEG; + size /= 2; npages = size >> self->page_shift; map = (void *)ALIGN((uintptr_t)buffer->ptr, size); ret = madvise(map, size, MADV_HUGEPAGE); @@ -770,54 +771,6 @@ TEST_F(hmm, anon_write_huge) hmm_buffer_free(buffer); } -/* - * Read numeric data from raw and tagged kernel status files. Used to read - * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). - */ -static long file_read_ulong(char *file, const char *tag) -{ - int fd; - char buf[2048]; - int len; - char *p, *q; - long val; - - fd = open(file, O_RDONLY); - if (fd < 0) { - /* Error opening the file */ - return -1; - } - - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) { - /* Error in reading the file */ - return -1; - } - if (len == sizeof(buf)) { - /* Error file is too large */ - return -1; - } - buf[len] = '\0'; - - /* Search for a tag if provided */ - if (tag) { - p = strstr(buf, tag); - if (!p) - return -1; /* looks like the line we want isn't there */ - p += strlen(tag); - } else - p = buf; - - val = strtol(p, &q, 0); - if (*q != ' ') { - /* Error parsing the file */ - return -1; - } - - return val; -} - /* * Write huge TLBFS page. 
*/ @@ -826,15 +779,13 @@ TEST_F(hmm, anon_write_hugetlbfs) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; - unsigned long default_hsize; + unsigned long default_hsize = default_huge_page_size(); unsigned long i; int *ptr; int ret; - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (!default_hsize) SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; @@ -1606,7 +1557,7 @@ TEST_F(hmm, compound) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; - unsigned long default_hsize; + unsigned long default_hsize = default_huge_page_size(); int *ptr; unsigned char *m; int ret; @@ -1614,10 +1565,8 @@ TEST_F(hmm, compound) /* Skip test if we can't allocate a hugetlbfs page. */ - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) + if (!default_hsize) SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; @@ -2106,7 +2055,7 @@ TEST_F(hmm, migrate_anon_huge_empty) int *ptr; int ret; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -2158,7 +2107,7 @@ TEST_F(hmm, migrate_anon_huge_zero) int ret; int val; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -2221,7 +2170,7 @@ TEST_F(hmm, migrate_anon_huge_free) int *ptr; int ret; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -2280,7 +2229,7 @@ TEST_F(hmm, migrate_anon_huge_fault) int *ptr; int ret; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -2332,7 +2281,7 @@ TEST_F(hmm, migrate_partial_unmap_fault) { struct hmm_buffer *buffer; unsigned long npages; - unsigned long size = TWOMEG; + unsigned long size = read_pmd_pagesize(); unsigned long i; void *old_ptr; void *map; @@ -2398,7 +2347,7 @@ TEST_F(hmm, migrate_remap_fault) { struct hmm_buffer *buffer; unsigned long npages; - unsigned long size = TWOMEG; + unsigned long size = read_pmd_pagesize(); unsigned long i; void *old_ptr, *new_ptr = NULL; void *map; @@ -2498,7 +2447,7 @@ TEST_F(hmm, migrate_anon_huge_err) int *ptr; int ret; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); @@ -2593,7 +2542,7 @@ TEST_F(hmm, migrate_anon_huge_zero_err) int *ptr; int ret; - size = TWOMEG; + size = read_pmd_pagesize(); buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index 9ac62eb4c97d..31a054fa8134 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -17,12 +17,57 @@ #include #include #include +#include #include "vm_util.h" #include "kselftest.h" -void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) +#ifndef STATX_DIOALIGN +#define STATX_DIOALIGN 0x00002000U +#endif + +static int get_dio_alignment(int fd) +{ + struct statx stx; + int ret; + + ret = syscall(__NR_statx, fd, "", AT_EMPTY_PATH, STATX_DIOALIGN, &stx); + if (ret < 0) + return -1; + + /* + * If STATX_DIOALIGN is unsupported, assume no alignment + 
* constraint and let the test proceed. + */ + if (!(stx.stx_mask & STATX_DIOALIGN) || !stx.stx_dio_offset_align) + return 1; + + return stx.stx_dio_offset_align; +} + +static bool check_dio_alignment(unsigned int start_off, + unsigned int end_off, unsigned int align) +{ + unsigned int writesize = end_off - start_off; + + /* + * The kernel's DIO path checks that file offset, length, and + * buffer address are all multiples of dio_offset_align. When + * this test case's parameters don't satisfy that, the write + * would fail with -EINVAL before exercising the hugetlb unpin + * path, so skip. + */ + if (start_off % align != 0 || writesize % align != 0) { + ksft_test_result_skip("DIO align=%u incompatible with offset %u writesize %u\n", + align, start_off, writesize); + return false; + } + + return true; +} + +static void run_dio_using_hugetlb(int fd, unsigned int start_off, + unsigned int end_off, unsigned int align) { - int fd; char *buffer = NULL; char *orig_buffer = NULL; size_t h_pagesize = 0; @@ -32,6 +77,9 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; const int mmap_prot = PROT_READ | PROT_WRITE; + if (!check_dio_alignment(start_off, end_off, align)) + return; + writesize = end_off - start_off; /* Get the default huge page size */ @@ -39,10 +87,9 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) if (!h_pagesize) ksft_exit_fail_msg("Unable to determine huge page size\n"); - /* Open the file to DIO */ - fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); - if (fd < 0) - ksft_exit_fail_perror("Error opening file\n"); + /* Reset file position since fd is shared across tests */ + if (lseek(fd, 0, SEEK_SET) < 0) + ksft_exit_fail_perror("lseek failed\n"); /* Get the free huge pages before allocation */ free_hpage_b = get_free_hugepages(); @@ -71,7 +118,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) /* unmap the huge page */ munmap(orig_buffer, h_pagesize); - close(fd); /* Get the free huge pages after unmap*/ free_hpage_a = get_free_hugepages(); @@ -89,37 +135,38 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) int main(void) { - size_t pagesize = 0; - int fd; + int fd, align; + const size_t pagesize = psize(); ksft_print_header(); - /* Open the file to DIO */ - fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); - if (fd < 0) - ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno)); - close(fd); - /* Check if huge pages are free */ if (!get_free_hugepages()) ksft_exit_skip("No free hugepage, exiting\n"); + fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); + if (fd < 0) + ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno)); + + align = get_dio_alignment(fd); + if (align < 0) + ksft_exit_skip("Unable to obtain DIO alignment: %s\n", + strerror(errno)); ksft_set_plan(4); - /* Get base page size */ - pagesize = psize(); - /* start and end is aligned to pagesize */ - run_dio_using_hugetlb(0, (pagesize * 3)); + run_dio_using_hugetlb(fd, 0, (pagesize * 3), align); /* start is aligned but end is not aligned */ - run_dio_using_hugetlb(0, (pagesize * 3) - (pagesize / 2)); + run_dio_using_hugetlb(fd, 0, (pagesize * 3) - (pagesize / 2), align); /* start is unaligned and end is aligned */ - run_dio_using_hugetlb(pagesize / 2, (pagesize * 3)); + run_dio_using_hugetlb(fd, pagesize / 2, (pagesize * 3), align); /* both start and end are unaligned */ - run_dio_using_hugetlb(pagesize / 2, (pagesize 
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index 10b686102b79..519e5ac02db7 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -48,6 +48,19 @@ static pid_t do_fork(struct procmap_fd *procmap)
 	return 0;
 }
 
+#ifdef __NR_mseal
+static int sys_mseal(void *ptr, size_t len, unsigned long flags)
+{
+	return syscall(__NR_mseal, (unsigned long)ptr, len, flags);
+}
+#else
+static int sys_mseal(void *ptr, size_t len, unsigned long flags)
+{
+	errno = ENOSYS;
+	return -1;
+}
+#endif
+
 FIXTURE_SETUP(merge)
 {
 	self->page_size = psize();
@@ -1217,6 +1230,81 @@ TEST_F(merge, mremap_correct_placed_faulted)
 	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size);
 }
 
+TEST_F(merge, merge_vmas_with_mseal)
+{
+	unsigned int page_size = self->page_size;
+	struct procmap_fd *procmap = &self->procmap;
+	char *ptr, *ptr2, *ptr3;
+	/* We need our own as cannot munmap() once sealed. */
+	char *carveout;
+
+	/* Invalid mseal() call to see if implemented. */
+	ASSERT_EQ(sys_mseal(NULL, 0, ~0UL), -1);
+	if (errno == ENOSYS)
+		SKIP(return, "mseal not supported, skipping.");
+
+	/* Map carveout. */
+	carveout = mmap(NULL, 5 * page_size, PROT_NONE,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+	ASSERT_NE(carveout, MAP_FAILED);
+
+	/*
+	 * Map 3 separate VMAs:
+	 *
+	 * |-----------|-----------|-----------|
+	 * |    RW     |    RWE    |    RO     |
+	 * |-----------|-----------|-----------|
+	 * ptr         ptr2        ptr3
+	 */
+	ptr = mmap(&carveout[page_size], page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+	ptr2 = mmap(&carveout[2 * page_size], page_size,
+		    PROT_READ | PROT_WRITE | PROT_EXEC,
+		    MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr2, MAP_FAILED);
+	ptr3 = mmap(&carveout[3 * page_size], page_size, PROT_READ,
+		    MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr3, MAP_FAILED);
+
+	/*
+	 * mseal the second VMA:
+	 *
+	 * |-----------|-----------|-----------|
+	 * |    RW     |   RWES    |    RO     |
+	 * |-----------|-----------|-----------|
+	 * ptr         ptr2        ptr3
+	 */
+	ASSERT_EQ(sys_mseal(ptr2, page_size, 0), 0);
+
+	/* Make first VMA mergeable upon mseal. */
+	ASSERT_EQ(mprotect(ptr, page_size,
+			   PROT_READ | PROT_WRITE | PROT_EXEC), 0);
+	/*
+	 * At this point we have:
+	 *
+	 * |-----------|-----------|-----------|
+	 * |    RWE    |   RWES    |    RO     |
+	 * |-----------|-----------|-----------|
+	 * ptr         ptr2        ptr3
+	 *
+	 * Now mseal all of the VMAs.
+	 */
+	ASSERT_EQ(sys_mseal(ptr, 3 * page_size, 0), 0);
+
+	/*
+	 * We should end up with:
+	 *
+	 * |-----------------------|-----------|
+	 * |         RWES          |    ROS    |
+	 * |-----------------------|-----------|
+	 * ptr                     ptr3
+	 */
+	ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 2 * page_size);
+}
+
 TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev)
 {
 	struct procmap_fd *procmap = &self->procmap;
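For context on what the new merge_vmas_with_mseal test relies on: mseal(2) (Linux 6.10+) makes a mapping's layout immutable, so later munmap(), mremap(), or mprotect() calls on the sealed range fail with EPERM, while properties such as VMA mergeability are unaffected. A minimal standalone sketch, not part of this patch; the __NR_mseal fallback number is an assumption, check your headers.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_mseal
    #define __NR_mseal 462	/* assumption: common value on recent kernels */
    #endif

    int main(void)
    {
    	size_t len = sysconf(_SC_PAGESIZE);
    	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
    		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    	int rc, err;

    	if (p == MAP_FAILED)
    		return 1;

    	if (syscall(__NR_mseal, p, len, 0) != 0) {
    		perror("mseal");	/* ENOSYS on kernels without mseal */
    		return 1;
    	}

    	/* Both operations are expected to fail with EPERM on a sealed VMA. */
    	rc = mprotect(p, len, PROT_READ);
    	err = errno;
    	printf("mprotect on sealed VMA: rc=%d errno=%d (EPERM=%d)\n", rc, err, EPERM);

    	rc = munmap(p, len);
    	err = errno;
    	printf("munmap on sealed VMA:   rc=%d errno=%d (EPERM=%d)\n", rc, err, EPERM);
    	return 0;
    }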
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 59c0dbe99a9b..bcfcac99b436 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -82,7 +82,9 @@ static void test_hugepage(int pagemap_fd, int pagesize)
 	int i, ret;
 
 	if (!thp_is_enabled()) {
-		ksft_test_result_skip("Transparent Hugepages not available\n");
+		ksft_print_msg("Transparent Hugepages not available\n");
+		ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+		ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
 		return;
 	}
 
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index e0167111bdd1..500d07c4938b 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -21,6 +21,7 @@
 #include
 #include "vm_util.h"
 #include "kselftest.h"
+#include "thp_settings.h"
 
 uint64_t pagesize;
 unsigned int pageshift;
@@ -255,21 +256,6 @@ static int check_after_split_folio_orders(char *vaddr_start, size_t len,
 	return status;
 }
 
-static void write_file(const char *path, const char *buf, size_t buflen)
-{
-	int fd;
-	ssize_t numwritten;
-
-	fd = open(path, O_WRONLY);
-	if (fd == -1)
-		ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
-
-	numwritten = write(fd, buf, buflen - 1);
-	close(fd);
-	if (numwritten < 1)
-		ksft_exit_fail_msg("Write failed\n");
-}
-
 static void write_debugfs(const char *fmt, ...)
 {
 	char input[INPUT_MAX];
@@ -772,6 +758,9 @@ int main(int argc, char **argv)
 		ksft_finished();
 	}
 
+	if (!thp_is_enabled())
+		ksft_exit_skip("Transparent Hugepages not available\n");
+
 	if (argc > 1)
 		optional_xfs_path = argv[1];
 
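Several of the tests above now gate on thp_is_enabled() from the shared vm_util code. As a rough sketch of what such a gate can look like (an assumption about the helper's shape, not code taken from this patch), it boils down to reading the THP sysfs knob and treating a bracketed "[never]" selection, or a missing file, as "THP unavailable":

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative only; the real thp_is_enabled() may differ in detail. */
    static bool thp_available(void)
    {
    	char buf[128] = {};
    	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");

    	if (!f)
    		return false;
    	if (!fgets(buf, sizeof(buf), f)) {
    		fclose(f);
    		return false;
    	}
    	fclose(f);

    	/* File contents look like "always [madvise] never". */
    	return strstr(buf, "[never]") == NULL;
    }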
diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c
index 574bd0f8ae48..e748ebfb3d4e 100644
--- a/tools/testing/selftests/mm/thp_settings.c
+++ b/tools/testing/selftests/mm/thp_settings.c
@@ -6,6 +6,7 @@
 #include
 #include
 
+#include "vm_util.h"
 #include "thp_settings.h"
 
 #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
@@ -64,29 +65,6 @@ int read_file(const char *path, char *buf, size_t buflen)
 	return (unsigned int) numread;
 }
 
-int write_file(const char *path, const char *buf, size_t buflen)
-{
-	int fd;
-	ssize_t numwritten;
-
-	fd = open(path, O_WRONLY);
-	if (fd == -1) {
-		printf("open(%s)\n", path);
-		exit(EXIT_FAILURE);
-		return 0;
-	}
-
-	numwritten = write(fd, buf, buflen - 1);
-	close(fd);
-	if (numwritten < 1) {
-		printf("write(%s)\n", buf);
-		exit(EXIT_FAILURE);
-		return 0;
-	}
-
-	return (unsigned int) numwritten;
-}
-
 unsigned long read_num(const char *path)
 {
 	char buf[21];
@@ -104,10 +82,7 @@ void write_num(const char *path, unsigned long num)
 	char buf[21];
 
 	sprintf(buf, "%ld", num);
-	if (!write_file(path, buf, strlen(buf) + 1)) {
-		perror(path);
-		exit(EXIT_FAILURE);
-	}
+	write_file(path, buf, strlen(buf) + 1);
 }
 
 int thp_read_string(const char *name, const char * const strings[])
@@ -165,11 +140,7 @@ void thp_write_string(const char *name, const char *val)
 		printf("%s: Pathname is too long\n", __func__);
 		exit(EXIT_FAILURE);
 	}
-
-	if (!write_file(path, val, strlen(val) + 1)) {
-		perror(path);
-		exit(EXIT_FAILURE);
-	}
+	write_file(path, val, strlen(val) + 1);
 }
 
 unsigned long thp_read_num(const char *name)
diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h
index 76eeb712e5f1..7748a9009191 100644
--- a/tools/testing/selftests/mm/thp_settings.h
+++ b/tools/testing/selftests/mm/thp_settings.h
@@ -63,7 +63,6 @@ struct thp_settings {
 };
 
 int read_file(const char *path, char *buf, size_t buflen);
-int write_file(const char *path, const char *buf, size_t buflen);
 unsigned long read_num(const char *path);
 void write_num(const char *path, unsigned long num);
 
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index bcad47c09518..7a9f1035099b 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -17,6 +17,7 @@
 #include
 #include "vm_util.h"
 #include "kselftest.h"
+#include "thp_settings.h"
 
 int backing_fd = -1;
 int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
@@ -37,6 +38,9 @@ int main(int argc, char **argv)
 
 	ksft_print_header();
 
+	if (!thp_is_enabled())
+		ksft_exit_skip("Transparent Hugepages not available\n");
+
 	ram = sysconf(_SC_PHYS_PAGES);
 	if (ram > SIZE_MAX / psize() / 4)
 		ram = SIZE_MAX / 4;
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index a6d4ff7dfdc0..db94564f4431 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -764,3 +764,27 @@ int unpoison_memory(unsigned long pfn)
 
 	return ret > 0 ? 0 : -errno;
 }
+
+void write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd, saved_errno;
+	ssize_t numwritten;
+
+	if (buflen < 2)
+		ksft_exit_fail_msg("Incorrect buffer len: %zu\n", buflen);
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
+
+	numwritten = write(fd, buf, buflen - 1);
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	if (numwritten < 0)
+		ksft_exit_fail_msg("%s write(%.*s) failed: %s\n", path, (int)(buflen - 1),
+				   buf, strerror(errno));
+	if (numwritten != buflen - 1)
+		ksft_exit_fail_msg("%s write(%.*s) is truncated, expected %zu bytes, got %zd bytes\n",
+				   path, (int)(buflen - 1), buf, buflen - 1, numwritten);
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index e9c4e24769c1..1a07305ceff4 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -166,3 +166,5 @@ int unpoison_memory(unsigned long pfn);
 
 #define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)
 #define PAGEMAP_PFN(ent)	((ent) & ((1ull << 55) - 1))
+
+void write_file(const char *path, const char *buf, size_t buflen);
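Usage note for the consolidated write_file() helper: callers pass the buffer length including the trailing NUL and the helper writes buflen - 1 bytes, so string writers pass strlen(str) + 1, exactly as write_num() and thp_write_string() do above. A hypothetical caller sketch (the function name, path, and value below are examples, not part of the patch):

    #include <string.h>
    #include "vm_util.h"

    /* Disable THP for the duration of a test via the shared helper. */
    static void set_thp_enabled_to_never(void)
    {
    	const char *val = "never";

    	/* strlen(val) + 1 => write_file() writes exactly strlen(val) bytes. */
    	write_file("/sys/kernel/mm/transparent_hugepage/enabled",
    		   val, strlen(val) + 1);
    }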