linux/drivers/android/binder/page_range.rs
Linus Torvalds 334fbe734e mm.git review status for linus..mm-stable
Everything:
 
 Total patches:       368
 Reviews/patch:       1.56
 Reviewed rate:       74%
 
 Excluding DAMON:
 
 Total patches:       316
 Reviews/patch:       1.77
 Reviewed rate:       81%
 
 Excluding DAMON and zram:
 
 Total patches:       306
 Reviews/patch:       1.81
 Reviewed rate:       82%
 
 Excluding DAMON, zram and maple_tree:
 
 Total patches:       276
 Reviews/patch:       2.01
 Reviewed rate:       91%
 

Merge tag 'mm-stable-2026-04-13-21-45' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:

 - "maple_tree: Replace big node with maple copy" (Liam Howlett)

   Mainly preparatory work for ongoing development but it does reduce
   stack usage and is an improvement.

 - "mm, swap: swap table phase III: remove swap_map" (Kairui Song)

   Offers memory savings by removing the static swap_map. It also yields
   some CPU savings and implements several cleanups.

 - "mm: memfd_luo: preserve file seals" (Pratyush Yadav)

   File seal preservation to LUO's memfd code

 - "mm: zswap: add per-memcg stat for incompressible pages" (Jiayuan
   Chen)

   Additional userspace stats reporting to zswap

 - "arch, mm: consolidate empty_zero_page" (Mike Rapoport)

   Some cleanups for our handling of ZERO_PAGE() and zero_pfn

 - "mm/kmemleak: Improve scan_should_stop() implementation" (Zhongqiu
   Han)

   A robustness improvement and some cleanups in the kmemleak code

 - "Improve khugepaged scan logic" (Vernon Yang)

   Improve khugepaged scan logic and reduce CPU consumption by
   prioritizing scanning tasks that access memory frequently

 - "Make KHO Stateless" (Jason Miu)

   Simplify Kexec Handover by transitioning KHO from an xarray-based
   metadata tracking system with serialization to a radix tree data
   structure that can be passed directly to the next kernel

 - "mm: vmscan: add PID and cgroup ID to vmscan tracepoints" (Thomas
   Ballasi and Steven Rostedt)

   Enhance vmscan's tracepointing

 - "mm: arch/shstk: Common shadow stack mapping helper and
   VM_NOHUGEPAGE" (Catalin Marinas)

   Cleanup for the shadow stack code: remove per-arch code in favour of
   a generic implementation

 - "Fix KASAN support for KHO restored vmalloc regions" (Pasha Tatashin)

   Fix a WARN() which can be emitted when KHO restores a vmalloc area

 - "mm: Remove stray references to pagevec" (Tal Zussman)

   Several cleanups, mainly updating references to "struct pagevec",
   which became folio_batch three years ago

 - "mm: Eliminate fake head pages from vmemmap optimization" (Kiryl
   Shutsemau)

   Simplify the HugeTLB vmemmap optimization (HVO) by changing how tail
   pages encode their relationship to the head page

 - "mm/damon/core: improve DAMOS quota efficiency for core layer
   filters" (SeongJae Park)

   Improve two problematic behaviors of DAMOS that make it less
   efficient when core layer filters are used

 - "mm/damon: strictly respect min_nr_regions" (SeongJae Park)

   Improve DAMON usability by extending the treatment of the
   min_nr_regions user-settable parameter

 - "mm/page_alloc: pcp locking cleanup" (Vlastimil Babka)

   The proper fix for a previously hotfixed SMP=n issue. Code
   simplifications and cleanups ensued

 - "mm: cleanups around unmapping / zapping" (David Hildenbrand)

   A bunch of cleanups around unmapping and zapping. Mostly
   simplifications, code movements, documentation and renaming of
   zapping functions

 - "support batched checking of the young flag for MGLRU" (Baolin Wang)

   Batched checking of the young flag for MGLRU. Partly cleanups; one
   benchmark shows large performance benefits for arm64

 - "memcg: obj stock and slab stat caching cleanups" (Johannes Weiner)

   memcg cleanup and robustness improvements

 - "Allow order zero pages in page reporting" (Yuvraj Sakshith)

   Enhance free page reporting - it presently and undesirably skips
   order-0 pages when reporting free memory.

 - "mm: vma flag tweaks" (Lorenzo Stoakes)

   Cleanup work following from the recent conversion of the VMA flags to
   a bitmap

 - "mm/damon: add optional debugging-purpose sanity checks" (SeongJae
   Park)

   Add some more developer-facing debug checks into DAMON core

 - "mm/damon: test and document power-of-2 min_region_sz requirement"
   (SeongJae Park)

   An additional DAMON kunit test, plus some adjustments to the
   addr_unit parameter handling

 - "mm/damon/core: make passed_sample_intervals comparisons
   overflow-safe" (SeongJae Park)

   Fix a hard-to-hit time overflow issue in DAMON core

 - "mm/damon: improve/fixup/update ratio calculation, test and
   documentation" (SeongJae Park)

   A batch of misc/minor improvements and fixups for DAMON

 - "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c" (David
   Hildenbrand)

   Fix a possible issue with dax-device when CONFIG_HUGETLB=n. Some code
   movement was required.

 - "zram: recompression cleanups and tweaks" (Sergey Senozhatsky)

   A somewhat random mix of fixups, recompression cleanups and
   improvements in the zram code

 - "mm/damon: support multiple goal-based quota tuning algorithms"
   (SeongJae Park)

   Extend DAMOS quotas goal auto-tuning to support multiple tuning
   algorithms that users can select

 - "mm: thp: reduce unnecessary start_stop_khugepaged()" (Breno Leitao)

   Fix the khugepaged sysfs handling so we no longer spam the logs with
   reams of junk when starting/stopping khugepaged

 - "mm: improve map count checks" (Lorenzo Stoakes)

   Provide some cleanups and slight fixes in the mremap, mmap and vma
   code

 - "mm/damon: support addr_unit on default monitoring targets for
   modules" (SeongJae Park)

   Extend the use of DAMON core's addr_unit tunable

 - "mm: khugepaged cleanups and mTHP prerequisites" (Nico Pache)

   Cleanups to khugepaged, serving as a base for Nico's planned
   khugepaged mTHP support

 - "mm: memory hot(un)plug and SPARSEMEM cleanups" (David Hildenbrand)

   Code movement and cleanups in the memhotplug and sparsemem code

 - "mm: remove CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE and cleanup
   CONFIG_MIGRATION" (David Hildenbrand)

   Rationalize some memhotplug Kconfig support

 - "change young flag check functions to return bool" (Baolin Wang)

   Cleanups to change all young flag check functions to return bool

 - "mm/damon/sysfs: fix memory leak and NULL dereference issues" (Josh
   Law and SeongJae Park)

   Fix a few potential DAMON bugs

 - "mm/vma: convert vm_flags_t to vma_flags_t in vma code" (Lorenzo
   Stoakes)

   Convert a lot of the existing use of the legacy vm_flags_t data type
   to the new vma_flags_t type which replaces it. Mainly in the vma
   code.

 - "mm: expand mmap_prepare functionality and usage" (Lorenzo Stoakes)

   Expand the mmap_prepare functionality, which is intended to replace
   the deprecated f_op->mmap hook which has been the source of bugs and
   security issues for some time. Cleanups, documentation, extension of
   mmap_prepare into filesystem drivers

 - "mm/huge_memory: refactor zap_huge_pmd()" (Lorenzo Stoakes)

   Simplify and clean up zap_huge_pmd(). Additional cleanups around
   vm_normal_folio_pmd() and the softleaf functionality are performed.

* tag 'mm-stable-2026-04-13-21-45' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm: fix deferred split queue races during migration
  mm/khugepaged: fix issue with tracking lock
  mm/huge_memory: add and use has_deposited_pgtable()
  mm/huge_memory: add and use normal_or_softleaf_folio_pmd()
  mm: add softleaf_is_valid_pmd_entry(), pmd_to_softleaf_folio()
  mm/huge_memory: separate out the folio part of zap_huge_pmd()
  mm/huge_memory: use mm instead of tlb->mm
  mm/huge_memory: remove unnecessary sanity checks
  mm/huge_memory: deduplicate zap deposited table call
  mm/huge_memory: remove unnecessary VM_BUG_ON_PAGE()
  mm/huge_memory: add a common exit path to zap_huge_pmd()
  mm/huge_memory: handle buggy PMD entry in zap_huge_pmd()
  mm/huge_memory: have zap_huge_pmd return a boolean, add kdoc
  mm/huge: avoid big else branch in zap_huge_pmd()
  mm/huge_memory: simplify vma_is_special_huge()
  mm: on remap assert that input range within the proposed VMA
  mm: add mmap_action_map_kernel_pages[_full]()
  uio: replace deprecated mmap hook with mmap_prepare in uio_info
  drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare
  mm: allow handling of stacked mmap_prepare hooks in more drivers
  ...

// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2025 Google LLC.
//! This module has utilities for managing a page range where unused pages may be reclaimed by a
//! vma shrinker.
// To avoid deadlocks, locks are taken in the order:
//
// 1. mmap lock
// 2. spinlock
// 3. lru spinlock
//
// The shrinker will use trylock methods because it locks them in a different order.
use crate::AssertSync;
use core::{
marker::PhantomPinned,
mem::{size_of, size_of_val, MaybeUninit},
ptr,
};
use kernel::{
bindings,
error::Result,
ffi::{c_ulong, c_void},
mm::{virt, Mm, MmWithUser},
new_mutex, new_spinlock,
page::{Page, PAGE_SHIFT, PAGE_SIZE},
prelude::*,
str::CStr,
sync::{aref::ARef, Mutex, SpinLock},
task::Pid,
transmute::FromBytes,
types::Opaque,
uaccess::UserSliceReader,
};
/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
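///
/// # Example
///
/// A minimal usage sketch (the static's name and the shrinker label are illustrative, not part
/// of this module): declare the shrinker as a `static`, then call [`Shrinker::register`] exactly
/// once during module initialization, before any `ShrinkablePageRange` uses it.
///
/// ```ignore
/// // SAFETY: `register` is called exactly once, below, before the shrinker is used.
/// static PAGE_RANGE_SHRINKER: Shrinker = unsafe { Shrinker::new() };
///
/// fn init_shrinker() -> Result<()> {
///     // Registers the shrinker and its list_lru with the kernel.
///     PAGE_RANGE_SHRINKER.register(kernel::c_str!("binder-page-range"))?;
///     Ok(())
/// }
/// ```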
#[repr(C)]
pub(crate) struct Shrinker {
inner: Opaque<*mut bindings::shrinker>,
list_lru: Opaque<bindings::list_lru>,
}
// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Send for Shrinker {}
// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Sync for Shrinker {}
impl Shrinker {
/// Create a new shrinker.
///
/// # Safety
///
/// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
/// been called exactly once, and it must not have returned an error.
pub(crate) const unsafe fn new() -> Self {
Self {
inner: Opaque::uninit(),
list_lru: Opaque::uninit(),
}
}
/// Register this shrinker with the kernel.
pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
// SAFETY: These fields are not yet used, so it's okay to zero them.
unsafe {
self.inner.get().write(ptr::null_mut());
self.list_lru.get().write_bytes(0, 1);
}
// SAFETY: The field is not yet used, so we can initialize it.
let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) };
if ret != 0 {
return Err(Error::from_errno(ret));
}
// SAFETY: The `name` points at a valid c string.
let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
if shrinker.is_null() {
// SAFETY: We initialized it, so it's okay to destroy it.
unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
return Err(ENOMEM);
}
// SAFETY: We're about to register the shrinker, and these are the fields we need to
// initialize. (All other fields are already zeroed.)
unsafe {
(&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count));
(&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan));
(&raw mut (*shrinker).private_data).write(self.list_lru.get().cast());
}
// SAFETY: The new shrinker has been fully initialized, so we can register it.
unsafe { bindings::shrinker_register(shrinker) };
// SAFETY: This initializes the pointer to the shrinker so that we can use it.
unsafe { self.inner.get().write(shrinker) };
Ok(())
}
}
/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans of whether the pages are usable. The
/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
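///
/// # Example
///
/// A rough sketch of the intended call sequence, assuming a registered static shrinker named
/// `PAGE_RANGE_SHRINKER`; the surrounding driver plumbing (where `vma` comes from, when this
/// runs) is illustrative, not part of this module.
///
/// ```ignore
/// // Created once per process, e.g. at open() time.
/// let range = KBox::pin_init(
///     ShrinkablePageRange::new(&PAGE_RANGE_SHRINKER),
///     GFP_KERNEL,
/// )?;
///
/// // In the mmap handler, register the vma and remember the region size.
/// let num_pages = range.register_with_vma(vma)?;
///
/// // Mark pages [0, 4) as in use, write into them, then hand them back to the shrinker.
/// range.use_range(0, 4)?;
/// // SAFETY: pages 0..4 were marked in use above and stay in use for the duration of the write.
/// unsafe { range.write(0, &0u32)? };
/// range.stop_using_range(0, 4);
/// ```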
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
/// Shrinker object registered with the kernel.
shrinker: &'static Shrinker,
/// Pid using this page range. Only used as debugging information.
pid: Pid,
/// The mm for the relevant process.
mm: ARef<Mm>,
/// Used to synchronize calls to `vm_insert_page` and `zap_vma_range`.
#[pin]
mm_lock: Mutex<()>,
/// Spinlock protecting changes to pages.
#[pin]
lock: SpinLock<Inner>,
/// Must not move, since page info has pointers back.
#[pin]
_pin: PhantomPinned,
}
// We do not define any ops. For now, used only to check identity of vmas.
static BINDER_VM_OPS: AssertSync<bindings::vm_operations_struct> = AssertSync(pin_init::zeroed());
// To ensure that we do not accidentally install pages into or zap pages from the wrong vma, we
// check its vm_ops and private data before using it.
fn check_vma(vma: &virt::VmaRef, owner: *const ShrinkablePageRange) -> Option<&virt::VmaMixedMap> {
// SAFETY: Just reading the vm_ops pointer of any active vma is safe.
let vm_ops = unsafe { (*vma.as_ptr()).vm_ops };
if !ptr::eq(vm_ops, &BINDER_VM_OPS.0) {
return None;
}
// SAFETY: Reading the vm_private_data pointer of a binder-owned vma is safe.
let vm_private_data = unsafe { (*vma.as_ptr()).vm_private_data };
// The ShrinkablePageRange is only dropped when the Process is dropped, which only happens once
// the file's ->release handler is invoked, which means the ShrinkablePageRange outlives any
// VMA associated with it, so there can't be any false positives due to pointer reuse here.
if !ptr::eq(vm_private_data, owner.cast()) {
return None;
}
vma.as_mixedmap_vma()
}
struct Inner {
/// Array of pages.
///
/// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
/// ownership. To deal with that, we manage it using raw pointers.
pages: *mut PageInfo,
/// Length of the `pages` array.
size: usize,
/// The address of the vma to insert the pages into.
vma_addr: usize,
}
// SAFETY: proper locking is in place for `Inner`
unsafe impl Send for Inner {}
type StableMmGuard =
kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;
/// An array element that describes the current state of a page.
///
/// There are three states:
///
/// * Free. The page is None. The `lru` element is not queued.
/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
/// * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
lru: bindings::list_head,
page: Option<Page>,
range: *const ShrinkablePageRange,
}
impl PageInfo {
/// # Safety
///
/// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
unsafe fn set_page(me: *mut PageInfo, page: Page) {
// SAFETY: This pointer offset is in bounds.
let ptr = unsafe { &raw mut (*me).page };
// SAFETY: The pointer is valid for writing, so also valid for reading.
if unsafe { (*ptr).is_some() } {
pr_err!("set_page called when there is already a page");
// SAFETY: We will initialize the page again below.
unsafe { ptr::drop_in_place(ptr) };
}
// SAFETY: The pointer is valid for writing.
unsafe { ptr::write(ptr, Some(page)) };
}
/// # Safety
///
/// The caller ensures that reading from `me.page` is ok for the duration of 'a.
unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
// SAFETY: This pointer offset is in bounds.
let ptr = unsafe { &raw const (*me).page };
// SAFETY: The pointer is valid for reading.
unsafe { (*ptr).as_ref() }
}
/// # Safety
///
/// The caller ensures that writing to `me.page` is ok for the duration of 'a.
unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
// SAFETY: This pointer offset is in bounds.
let ptr = unsafe { &raw mut (*me).page };
// SAFETY: The pointer is valid for reading.
unsafe { (*ptr).take() }
}
/// Add this page to the lru list, if not already in the list.
///
/// # Safety
///
/// The pointer must be valid, and it must be the right shrinker and nid.
unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
// SAFETY: This pointer offset is in bounds.
let lru_ptr = unsafe { &raw mut (*me).lru };
// SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
}
/// Remove this page from the lru list, if it is in the list.
///
/// # Safety
///
/// The pointer must be valid, and it must be the right shrinker and nid.
unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
// SAFETY: This pointer offset is in bounds.
let lru_ptr = unsafe { &raw mut (*me).lru };
// SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
}
}
impl ShrinkablePageRange {
/// Create a new `ShrinkablePageRange` using the given shrinker.
pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
try_pin_init!(Self {
shrinker,
pid: kernel::current!().pid(),
mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
lock <- new_spinlock!(Inner {
pages: ptr::null_mut(),
size: 0,
vma_addr: 0,
}, "ShrinkablePageRange"),
_pin: PhantomPinned,
})
}
pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
// SAFETY: This extends the duration of the reference. Since this call happens before
// `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
// until the returned guard is dropped. This ensures that the guard is valid until dropped.
let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };
mm_lock.try_lock()
}
/// Register a vma with this page range. Returns the size of the region.
pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result<usize> {
let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
let num_pages = num_bytes >> PAGE_SHIFT;
if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
pr_debug!("Failed to register with vma: invalid vma->vm_mm");
return Err(EINVAL);
}
if num_pages == 0 {
pr_debug!("Failed to register with vma: size zero");
return Err(EINVAL);
}
let mut pages = KVVec::<PageInfo>::with_capacity(num_pages, GFP_KERNEL)?;
// SAFETY: This just initializes the pages array.
unsafe {
let self_ptr = self as *const ShrinkablePageRange;
for i in 0..num_pages {
let info = pages.as_mut_ptr().add(i);
(&raw mut (*info).range).write(self_ptr);
(&raw mut (*info).page).write(None);
let lru = &raw mut (*info).lru;
(&raw mut (*lru).next).write(lru);
(&raw mut (*lru).prev).write(lru);
}
}
let mut inner = self.lock.lock();
if inner.size > 0 {
pr_debug!("Failed to register with vma: already registered");
drop(inner);
return Err(EBUSY);
}
inner.pages = pages.into_raw_parts().0;
inner.size = num_pages;
inner.vma_addr = vma.start();
// This pointer is only used for comparison - it's not dereferenced.
//
// SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
// `vm_private_data`.
unsafe {
(*vma.as_ptr()).vm_private_data = ptr::from_ref(self).cast_mut().cast::<c_void>()
};
// SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
// `vm_ops`.
unsafe { (*vma.as_ptr()).vm_ops = &BINDER_VM_OPS.0 };
Ok(num_pages)
}
/// Make sure that the given pages are allocated and mapped.
///
/// Must not be called from an atomic context.
pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
if start >= end {
return Ok(());
}
let mut inner = self.lock.lock();
assert!(end <= inner.size);
for i in start..end {
// SAFETY: This pointer offset is in bounds.
let page_info = unsafe { inner.pages.add(i) };
// SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
// Since we're going to use the page, we should remove it from the lru list so that
// the shrinker will not free it.
//
// SAFETY: The pointer is valid, and this is the right shrinker.
//
// The shrinker can't free the page between the check and this call to
// `list_lru_del` because we hold the lock.
unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
} else {
// We have to allocate a new page. Use the slow path.
drop(inner);
// SAFETY: `i < end <= inner.size` so `i` is in bounds.
match unsafe { self.use_page_slow(i) } {
Ok(()) => {}
Err(err) => {
pr_warn!("Error in use_page_slow: {:?}", err);
return Err(err);
}
}
inner = self.lock.lock();
}
}
Ok(())
}
/// Mark the given page as in use, slow path.
///
/// Must not be called from an atomic context.
///
/// # Safety
///
/// Assumes that `i` is in bounds.
#[cold]
unsafe fn use_page_slow(&self, i: usize) -> Result<()> {
let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;
let mm_mutex = self.mm_lock.lock();
let inner = self.lock.lock();
// SAFETY: This pointer offset is in bounds.
let page_info = unsafe { inner.pages.add(i) };
// SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
// The page was already there, or someone else added the page while we didn't hold the
// spinlock.
//
// SAFETY: The pointer is valid, and this is the right shrinker.
//
// The shrinker can't free the page between the check and this call to
// `list_lru_del` because we hold the lock.
unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
return Ok(());
}
let vma_addr = inner.vma_addr;
// Release the spinlock while we insert the page into the vma.
drop(inner);
// No overflow since we stay in bounds of the vma.
let user_page_addr = vma_addr + (i << PAGE_SHIFT);
// We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
// a remote process. If the call to `mmput` races with the process shutting down, then the
// caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
// happen until it returns to userspace. However, the caller might instead go to sleep and
// wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
// middle of a shutdown process that won't complete until the `mm` is dropped. This can
// amount to a deadlock.
//
// Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
// workqueue.
let mm = MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?);
{
let vma_read;
let mmap_read;
let vma = if let Some(ret) = mm.lock_vma_under_rcu(vma_addr) {
vma_read = ret;
check_vma(&vma_read, self)
} else {
mmap_read = mm.mmap_read_lock();
mmap_read
.vma_lookup(vma_addr)
.and_then(|vma| check_vma(vma, self))
};
match vma {
Some(vma) => vma.vm_insert_page(user_page_addr, &new_page)?,
None => return Err(ESRCH),
}
}
let inner = self.lock.lock();
// SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
// can be written to since we hold the lock.
//
// We released and reacquired the spinlock since we checked that the page is null, but we
// always hold the mm_lock mutex when setting the page to a non-null value, so it's not
// possible for someone else to have changed it since our check.
unsafe { PageInfo::set_page(page_info, new_page) };
drop(inner);
drop(mm_mutex);
Ok(())
}
/// If the given page is in use, then mark it as available so that the shrinker can free it.
///
/// May be called from an atomic context.
pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
if start >= end {
return;
}
let inner = self.lock.lock();
assert!(end <= inner.size);
for i in (start..end).rev() {
// SAFETY: The pointer is in bounds.
let page_info = unsafe { inner.pages.add(i) };
// SAFETY: Okay for reading since we have the lock.
if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
// SAFETY: The pointer is valid, and it's the right shrinker.
unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) };
}
}
}
/// Helper for reading or writing to a range of bytes that may overlap with several pages.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
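///
/// The callback receives `(page, offset_within_page, length)` for each chunk of the byte range
/// that falls on a single page. As an illustrative sketch, this mirrors how `fill_zero` below
/// drives it:
///
/// ```ignore
/// // SAFETY: the caller guarantees that the touched pages are in use for the whole call.
/// unsafe {
///     self.iterate(offset, size, |page, page_off, len| {
///         // Zero `len` bytes of `page` starting at `page_off`.
///         page.fill_zero_raw(page_off, len)
///     })
/// }
/// ```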
unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
where
T: FnMut(&Page, usize, usize) -> Result,
{
if size == 0 {
return Ok(());
}
let (pages, num_pages) = {
let inner = self.lock.lock();
(inner.pages, inner.size)
};
let num_bytes = num_pages << PAGE_SHIFT;
// Check that the request is within the buffer.
if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
return Err(EFAULT);
}
let mut page_index = offset >> PAGE_SHIFT;
offset &= PAGE_SIZE - 1;
while size > 0 {
let available = usize::min(size, PAGE_SIZE - offset);
// SAFETY: The pointer is in bounds.
let page_info = unsafe { pages.add(page_index) };
// SAFETY: The caller guarantees that this page is in the "in use" state for the
// duration of this call to `iterate`, so nobody will change the page.
let page = unsafe { PageInfo::get_page(page_info) };
if page.is_none() {
pr_warn!("Page is null!");
}
let page = page.ok_or(EFAULT)?;
cb(page, offset, available)?;
size -= available;
page_index += 1;
offset = 0;
}
Ok(())
}
/// Copy from userspace into this page range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn copy_from_user_slice(
&self,
reader: &mut UserSliceReader,
offset: usize,
size: usize,
) -> Result {
// SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
unsafe {
self.iterate(offset, size, |page, offset, to_copy| {
page.copy_from_user_slice_raw(reader, offset, to_copy)
})
}
}
/// Copy from this page range into kernel space.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
let mut out = MaybeUninit::<T>::uninit();
let mut out_offset = 0;
// SAFETY: `self.iterate` has the same safety requirements as `read`.
unsafe {
self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
// SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
// SAFETY: The pointer is in bounds of the `out` variable, so it is valid.
page.read_raw(obj_ptr, offset, to_copy)?;
out_offset += to_copy;
Ok(())
})?;
}
// SAFETY: We just initialised the data.
Ok(unsafe { out.assume_init() })
}
/// Copy from kernel space into this page range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
let mut obj_offset = 0;
// SAFETY: `self.iterate` has the same safety requirements as `write`.
unsafe {
self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
// SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
// SAFETY: We have a reference to the object, so the pointer is valid.
page.write_raw(obj_ptr, offset, to_copy)?;
obj_offset += to_copy;
Ok(())
})
}
}
/// Write zeroes to the given range.
///
/// # Safety
///
/// All pages touched by this operation must be in use for the duration of this call.
pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
// SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
unsafe {
self.iterate(offset, size, |page, offset, len| {
page.fill_zero_raw(offset, len)
})
}
}
}
#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
fn drop(self: Pin<&mut Self>) {
let (pages, size) = {
let lock = self.lock.lock();
(lock.pages, lock.size)
};
if size == 0 {
return;
}
// Note: This call is also necessary for the safety of `stable_trylock_mm`.
let mm_lock = self.mm_lock.lock();
// This is the destructor, so unlike the other methods, we only need to worry about races
// with the shrinker here. Since we hold the `mm_lock`, we also can't race with the
// shrinker, and after this loop, the shrinker will not access any of our pages since we
// removed them from the lru list.
for i in 0..size {
// SAFETY: Loop is in-bounds of the size.
let p_ptr = unsafe { pages.add(i) };
// SAFETY: No other readers, so we can read.
if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } {
// SAFETY: The pointer is valid and it's the right shrinker.
unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) };
}
}
drop(mm_lock);
// SAFETY: `pages` was allocated as an `KVVec<PageInfo>` with capacity `size`. Furthermore,
// all `size` elements are initialized. Also, the array is no longer shared with the
// shrinker due to the above loop.
drop(unsafe { KVVec::from_raw_parts(pages, size, size) });
}
}
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_count(
shrink: *mut bindings::shrinker,
_sc: *mut bindings::shrink_control,
) -> c_ulong {
// SAFETY: We can access our own private data.
let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
// SAFETY: Accessing the lru list is okay. Just an FFI call.
unsafe { bindings::list_lru_count(list_lru) }
}
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_scan(
shrink: *mut bindings::shrinker,
sc: *mut bindings::shrink_control,
) -> c_ulong {
// SAFETY: We can access our own private data.
let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
// SAFETY: Caller guarantees that it is safe to read this field.
let nr_to_scan = unsafe { (*sc).nr_to_scan };
// SAFETY: Accessing the lru list is okay. Just an FFI call.
unsafe {
bindings::list_lru_walk(
list_lru,
Some(rust_shrink_free_page),
ptr::null_mut(),
nr_to_scan,
)
}
}
const LRU_SKIP: bindings::lru_status = bindings::lru_status::LRU_SKIP;
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status::LRU_REMOVED_RETRY;
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
item: *mut bindings::list_head,
lru: *mut bindings::list_lru_one,
_cb_arg: *mut c_void,
) -> bindings::lru_status {
// Fields that should survive after unlocking the lru lock.
let page;
let page_index;
let mm;
let mmap_read;
let mm_mutex;
let vma_addr;
let range_ptr;
{
// CAST: The `list_head` field is first in `PageInfo`.
let info = item as *mut PageInfo;
// SAFETY: The `range` field of `PageInfo` is immutable.
range_ptr = unsafe { (*info).range };
// SAFETY: The `range` outlives its `PageInfo` values.
let range = unsafe { &*range_ptr };
mm = match range.mm.mmget_not_zero() {
Some(mm) => MmWithUser::into_mmput_async(mm),
None => return LRU_SKIP,
};
mm_mutex = match range.stable_trylock_mm() {
Some(guard) => guard,
None => return LRU_SKIP,
};
mmap_read = match mm.mmap_read_trylock() {
Some(guard) => guard,
None => return LRU_SKIP,
};
// We can't lock it normally here, since we hold the lru lock.
let inner = match range.lock.try_lock() {
Some(inner) => inner,
None => return LRU_SKIP,
};
// SAFETY: The item is in this lru list, so it's okay to remove it.
unsafe { bindings::list_lru_isolate(lru, item) };
// SAFETY: Both pointers are in bounds of the same allocation.
page_index = unsafe { info.offset_from(inner.pages) } as usize;
// SAFETY: We hold the spinlock, so we can take the page.
//
// This sets the page pointer to zero before we unmap it from the vma. However, we call
// `zap_vma_range` before we release the mmap lock, so `use_page_slow` will not be able to
// insert a new page until after our call to `zap_vma_range`.
page = unsafe { PageInfo::take_page(info) };
vma_addr = inner.vma_addr;
// From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
// they can be freed at any point after we unlock `lru_lock`. This is with the exception of
// `mm_mutex` which is kept alive by holding the lock.
}
// SAFETY: The lru lock is locked when this method is called.
unsafe { bindings::spin_unlock(&raw mut (*lru).lock) };
if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) {
if let Some(vma) = check_vma(unchecked_vma, range_ptr) {
let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
vma.zap_vma_range(user_page_addr, PAGE_SIZE);
}
}
drop(mmap_read);
drop(mm_mutex);
drop(mm);
drop(page);
LRU_REMOVED_ENTRY
}