mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
Patch series "mm: Optimize mseal checks", v3. Optimize mseal checks by removing the separate can_modify_mm() step, and just doing checks on the individual vmas, when various operations are themselves iterating through the tree. This provides a nice speedup and restores performance parity with pre-mseal[3]. will-it-scale mmap1_process[1] -t 1 results: commit 3450fe2b574b4345e4296ccae395149e1a357fee: min:277605 max:277605 total:277605 min:281784 max:281784 total:281784 min:277238 max:277238 total:277238 min:281761 max:281761 total:281761 min:274279 max:274279 total:274279 min:254854 max:254854 total:254854 measurement min:269143 max:269143 total:269143 min:270454 max:270454 total:270454 min:243523 max:243523 total:243523 min:251148 max:251148 total:251148 min:209669 max:209669 total:209669 min:190426 max:190426 total:190426 min:231219 max:231219 total:231219 min:275364 max:275364 total:275364 min:266540 max:266540 total:266540 min:242572 max:242572 total:242572 min:284469 max:284469 total:284469 min:278882 max:278882 total:278882 min:283269 max:283269 total:283269 min:281204 max:281204 total:281204 After this patch set: min:280580 max:280580 total:280580 min:290514 max:290514 total:290514 min:291006 max:291006 total:291006 min:290352 max:290352 total:290352 min:294582 max:294582 total:294582 min:293075 max:293075 total:293075 measurement min:295613 max:295613 total:295613 min:294070 max:294070 total:294070 min:293193 max:293193 total:293193 min:291631 max:291631 total:291631 min:295278 max:295278 total:295278 min:293782 max:293782 total:293782 min:290361 max:290361 total:290361 min:294517 max:294517 total:294517 min:293750 max:293750 total:293750 min:293572 max:293572 total:293572 min:295239 max:295239 total:295239 min:292932 max:292932 total:292932 min:293319 max:293319 total:293319 min:294954 max:294954 total:294954 This was a Completely Unscientific test but seems to show there were around 5-10% gains on ops per second. 
Oliver performed his own tests and showed[3] a similar ~5% gain in them. [1]: mmap1_process does mmap and munmap in a loop. I didn't bother testing multithreading cases. [2]: https://lore.kernel.org/all/20240807124103.85644-1-mpe@ellerman.id.au/ [3]: https://lore.kernel.org/all/ZrMMJfe9aXSWxJz6@xsang-OptiPlex-9020/ Link: https://lore.kernel.org/all/202408041602.caa0372-oliver.sang@intel.com/ This patch (of 7): Move can_modify_vma to vma.h so it can be inlined properly (with the intent to remove can_modify_mm callsites). Link: https://lkml.kernel.org/r/20240817-mseal-depessimize-v3-1-d8d2e037df30@gmail.com Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Jeff Xu <jeffxu@chromium.org> Cc: Kees Cook <kees@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Pedro Falcato <pedro.falcato@gmail.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
299 lines
6.8 KiB
C
299 lines
6.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Implement mseal() syscall.
|
|
*
|
|
* Copyright (c) 2023,2024 Google, Inc.
|
|
*
|
|
* Author: Jeff Xu <jeffxu@chromium.org>
|
|
*/
|
|
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mm_inline.h>
|
|
#include <linux/mmu_context.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/sched.h>
|
|
#include "internal.h"
|
|
|
|
static inline void set_vma_sealed(struct vm_area_struct *vma)
|
|
{
|
|
vm_flags_set(vma, VM_SEALED);
|
|
}
|
|
|
|
/*
 * Tell whether a madvise() @behavior is one of the values this file
 * subjects to the seal check in can_modify_mm_madv().
 *
 * Return: true for MADV_FREE, MADV_DONTNEED, MADV_DONTNEED_LOCKED,
 * MADV_REMOVE, MADV_DONTFORK and MADV_WIPEONFORK; false for anything else.
 */
static bool is_madv_discard(int behavior)
{
	return behavior == MADV_FREE ||
	       behavior == MADV_DONTNEED ||
	       behavior == MADV_DONTNEED_LOCKED ||
	       behavior == MADV_REMOVE ||
	       behavior == MADV_DONTFORK ||
	       behavior == MADV_WIPEONFORK;
}
|
|
|
|
static bool is_ro_anon(struct vm_area_struct *vma)
|
|
{
|
|
/* check anonymous mapping. */
|
|
if (vma->vm_file || vma->vm_flags & VM_SHARED)
|
|
return false;
|
|
|
|
/*
|
|
* check for non-writable:
|
|
* PROT=RO or PKRU is not writeable.
|
|
*/
|
|
if (!(vma->vm_flags & VM_WRITE) ||
|
|
!arch_vma_access_permitted(vma, true, false, false))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Check if the vmas of a memory range are allowed to be modified.
|
|
* the memory ranger can have a gap (unallocated memory).
|
|
* return true, if it is allowed.
|
|
*/
|
|
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, mm, start);
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (unlikely(!can_modify_vma(vma)))
|
|
return false;
|
|
}
|
|
|
|
/* Allow by default. */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Check if the vmas of a memory range are allowed to be modified by madvise.
|
|
* the memory ranger can have a gap (unallocated memory).
|
|
* return true, if it is allowed.
|
|
*/
|
|
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
|
|
int behavior)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, mm, start);
|
|
|
|
if (!is_madv_discard(behavior))
|
|
return true;
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end)
|
|
if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
|
|
return false;
|
|
|
|
/* Allow by default. */
|
|
return true;
|
|
}
|
|
|
|
/*
 * Apply @newflags to the [start, end) portion of @vma and seal it.
 *
 * @vmi:      VMA iterator handed through to vma_modify_flags().
 * @vma:      the VMA being processed.
 * @prev:     in: the VMA preceding @vma as tracked by the caller;
 *            out (success only): the VMA that now covers the range.
 * @start:    start of the range to change.
 * @end:      end of the range to change.
 * @newflags: desired vm_flags for the range.
 *
 * If @vma already has exactly @newflags nothing is modified and *@prev is
 * simply advanced to @vma.
 *
 * Return: 0 on success, or the negative errno carried by the ERR_PTR that
 * vma_modify_flags() returned.  On failure *@prev is left untouched so
 * that an error pointer is never handed back to the caller as a VMA.
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	if (newflags != vma->vm_flags) {
		/* The returned VMA replaces @vma; it may be a different object. */
		vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		set_vma_sealed(vma);
	}

	*prev = vma;
	return 0;
}
|
|
|
|
/*
|
|
* Check for do_mseal:
|
|
* 1> start is part of a valid vma.
|
|
* 2> end is part of a valid vma.
|
|
* 3> No gap (unallocated address) between start and end.
|
|
* 4> map is sealable.
|
|
*/
|
|
static int check_mm_seal(unsigned long start, unsigned long end)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
unsigned long nstart = start;
|
|
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (vma->vm_start > nstart)
|
|
/* unallocated memory found. */
|
|
return -ENOMEM;
|
|
|
|
if (vma->vm_end >= end)
|
|
return 0;
|
|
|
|
nstart = vma->vm_end;
|
|
}
|
|
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/*
|
|
* Apply sealing.
|
|
*/
|
|
static int apply_mm_seal(unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long nstart;
|
|
struct vm_area_struct *vma, *prev;
|
|
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
vma = vma_iter_load(&vmi);
|
|
/*
|
|
* Note: check_mm_seal should already checked ENOMEM case.
|
|
* so vma should not be null, same for the other ENOMEM cases.
|
|
*/
|
|
prev = vma_prev(&vmi);
|
|
if (start > vma->vm_start)
|
|
prev = vma;
|
|
|
|
nstart = start;
|
|
for_each_vma_range(vmi, vma, end) {
|
|
int error;
|
|
unsigned long tmp;
|
|
vm_flags_t newflags;
|
|
|
|
newflags = vma->vm_flags | VM_SEALED;
|
|
tmp = vma->vm_end;
|
|
if (tmp > end)
|
|
tmp = end;
|
|
error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
|
|
if (error)
|
|
return error;
|
|
nstart = vma_iter_end(&vmi);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* mseal(2) seals the VM's meta data from
|
|
* selected syscalls.
|
|
*
|
|
* addr/len: VM address range.
|
|
*
|
|
* The address range by addr/len must meet:
|
|
* start (addr) must be in a valid VMA.
|
|
* end (addr + len) must be in a valid VMA.
|
|
* no gap (unallocated memory) between start and end.
|
|
* start (addr) must be page aligned.
|
|
*
|
|
* len: len will be page aligned implicitly.
|
|
*
|
|
* Below VMA operations are blocked after sealing.
|
|
* 1> Unmapping, moving to another location, and shrinking
|
|
* the size, via munmap() and mremap(), can leave an empty
|
|
* space, therefore can be replaced with a VMA with a new
|
|
* set of attributes.
|
|
* 2> Moving or expanding a different vma into the current location,
|
|
* via mremap().
|
|
* 3> Modifying a VMA via mmap(MAP_FIXED).
|
|
* 4> Size expansion, via mremap(), does not appear to pose any
|
|
* specific risks to sealed VMAs. It is included anyway because
|
|
* the use case is unclear. In any case, users can rely on
|
|
* merging to expand a sealed VMA.
|
|
* 5> mprotect and pkey_mprotect.
|
|
* 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
|
|
* for anonymous memory, when users don't have write permission to the
|
|
* memory. Those behaviors can alter region contents by discarding pages,
|
|
* effectively a memset(0) for anonymous memory.
|
|
*
|
|
* flags: reserved.
|
|
*
|
|
* return values:
|
|
* zero: success.
|
|
* -EINVAL:
|
|
* invalid input flags.
|
|
* start address is not page aligned.
|
|
* Address arange (start + len) overflow.
|
|
* -ENOMEM:
|
|
* addr is not a valid address (not allocated).
|
|
* end (start + len) is not a valid address.
|
|
* a gap (unallocated memory) between start and end.
|
|
* -EPERM:
|
|
* - In 32 bit architecture, sealing is not supported.
|
|
* Note:
|
|
* user can call mseal(2) multiple times, adding a seal on an
|
|
* already sealed memory is a no-action (no error).
|
|
*
|
|
* unseal() is not supported.
|
|
*/
|
|
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
|
|
{
|
|
size_t len;
|
|
int ret = 0;
|
|
unsigned long end;
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
ret = can_do_mseal(flags);
|
|
if (ret)
|
|
return ret;
|
|
|
|
start = untagged_addr(start);
|
|
if (!PAGE_ALIGNED(start))
|
|
return -EINVAL;
|
|
|
|
len = PAGE_ALIGN(len_in);
|
|
/* Check to see whether len was rounded up from small -ve to zero. */
|
|
if (len_in && !len)
|
|
return -EINVAL;
|
|
|
|
end = start + len;
|
|
if (end < start)
|
|
return -EINVAL;
|
|
|
|
if (end == start)
|
|
return 0;
|
|
|
|
if (mmap_write_lock_killable(mm))
|
|
return -EINTR;
|
|
|
|
/*
|
|
* First pass, this helps to avoid
|
|
* partial sealing in case of error in input address range,
|
|
* e.g. ENOMEM error.
|
|
*/
|
|
ret = check_mm_seal(start, end);
|
|
if (ret)
|
|
goto out;
|
|
|
|
/*
|
|
* Second pass, this should success, unless there are errors
|
|
* from vma_modify_flags, e.g. merge/split error, or process
|
|
* reaching the max supported VMAs, however, those cases shall
|
|
* be rare.
|
|
*/
|
|
ret = apply_mm_seal(start, end);
|
|
|
|
out:
|
|
mmap_write_unlock(current->mm);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * mseal(2) system call entry point: forwards its arguments unchanged to
 * do_mseal() and returns its result.
 */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len,
		unsigned long, flags)
{
	return do_mseal(start, len, flags);
}
|