linux/mm/mseal.c
Pedro Falcato 4d1b341665 mm: move can_modify_vma to mm/vma.h
Patch series "mm: Optimize mseal checks", v3.

Optimize mseal checks by removing the separate can_modify_mm() step, and
just doing checks on the individual vmas, when various operations are
themselves iterating through the tree.  This provides a nice speedup and
restores performance parity with pre-mseal[3].

will-it-scale mmap1_process[1] -t 1 results:

commit 3450fe2b574b4345e4296ccae395149e1a357fee:

min:277605 max:277605 total:277605
min:281784 max:281784 total:281784
min:277238 max:277238 total:277238
min:281761 max:281761 total:281761
min:274279 max:274279 total:274279
min:254854 max:254854 total:254854
measurement
min:269143 max:269143 total:269143
min:270454 max:270454 total:270454
min:243523 max:243523 total:243523
min:251148 max:251148 total:251148
min:209669 max:209669 total:209669
min:190426 max:190426 total:190426
min:231219 max:231219 total:231219
min:275364 max:275364 total:275364
min:266540 max:266540 total:266540
min:242572 max:242572 total:242572
min:284469 max:284469 total:284469
min:278882 max:278882 total:278882
min:283269 max:283269 total:283269
min:281204 max:281204 total:281204

After this patch set:

min:280580 max:280580 total:280580
min:290514 max:290514 total:290514
min:291006 max:291006 total:291006
min:290352 max:290352 total:290352
min:294582 max:294582 total:294582
min:293075 max:293075 total:293075
measurement
min:295613 max:295613 total:295613
min:294070 max:294070 total:294070
min:293193 max:293193 total:293193
min:291631 max:291631 total:291631
min:295278 max:295278 total:295278
min:293782 max:293782 total:293782
min:290361 max:290361 total:290361
min:294517 max:294517 total:294517
min:293750 max:293750 total:293750
min:293572 max:293572 total:293572
min:295239 max:295239 total:295239
min:292932 max:292932 total:292932
min:293319 max:293319 total:293319
min:294954 max:294954 total:294954

This was a completely unscientific test, but it appears to show gains of
around 5-10% in operations per second.

Oliver performed his own tests and showed[3] a similar ~5% gain in them.

[1]: mmap1_process does mmap and munmap in a loop. I didn't bother testing multithreading cases.
[2]: https://lore.kernel.org/all/20240807124103.85644-1-mpe@ellerman.id.au/
[3]: https://lore.kernel.org/all/ZrMMJfe9aXSWxJz6@xsang-OptiPlex-9020/
Link: https://lore.kernel.org/all/202408041602.caa0372-oliver.sang@intel.com/


This patch (of 7):

Move can_modify_vma to vma.h so it can be inlined properly (with the
intent to remove can_modify_mm callsites).

Link: https://lkml.kernel.org/r/20240817-mseal-depessimize-v3-1-d8d2e037df30@gmail.com
Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-09-03 21:15:40 -07:00

299 lines
6.8 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Implement mseal() syscall.
*
* Copyright (c) 2023,2024 Google, Inc.
*
* Author: Jeff Xu <jeffxu@chromium.org>
*/
#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
/* Mark @vma as sealed by setting VM_SEALED in its vm_flags. */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}
/*
 * Return true for the madvise() behaviors that can discard page
 * contents (effectively a memset(0) of the region); these are the
 * behaviors sealing must guard against.
 */
static bool is_madv_discard(int behavior)
{
	return behavior == MADV_FREE ||
	       behavior == MADV_DONTNEED ||
	       behavior == MADV_DONTNEED_LOCKED ||
	       behavior == MADV_REMOVE ||
	       behavior == MADV_DONTFORK ||
	       behavior == MADV_WIPEONFORK;
}
/*
 * Return true if @vma is a private anonymous mapping that is not
 * writable — either PROT_WRITE is absent or the architecture (e.g.
 * pkey state) denies write access.
 */
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* File-backed or shared mappings are not anonymous. */
	if (vma->vm_file)
		return false;
	if (vma->vm_flags & VM_SHARED)
		return false;

	/* Writable by both the protection bits and the architecture? */
	if ((vma->vm_flags & VM_WRITE) &&
	    arch_vma_access_permitted(vma, true, false, false))
		return false;

	return true;
}
/*
* Check if the vmas of a memory range are allowed to be modified.
* the memory ranger can have a gap (unallocated memory).
* return true, if it is allowed.
*/
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, start);
/* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
if (unlikely(!can_modify_vma(vma)))
return false;
}
/* Allow by default. */
return true;
}
/*
* Check if the vmas of a memory range are allowed to be modified by madvise.
* the memory ranger can have a gap (unallocated memory).
* return true, if it is allowed.
*/
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
int behavior)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, start);
if (!is_madv_discard(behavior))
return true;
/* going through each vma to check. */
for_each_vma_range(vmi, vma, end)
if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
return false;
/* Allow by default. */
return true;
}
/*
 * Split/merge @vma as needed for [start, end) and set VM_SEALED on the
 * resulting VMA.  A no-op when the flags already match.
 * Returns 0 on success or a negative error from vma_modify_flags().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	vm_flags_t curflags = vma->vm_flags;
	int err = 0;

	if (newflags != curflags) {
		vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
		if (IS_ERR(vma))
			err = PTR_ERR(vma);
		else
			set_vma_sealed(vma);
	}

	/* NOTE: on vma_modify_flags() failure, *prev receives the ERR_PTR. */
	*prev = vma;
	return err;
}
/*
* Check for do_mseal:
* 1> start is part of a valid vma.
* 2> end is part of a valid vma.
* 3> No gap (unallocated address) between start and end.
* 4> map is sealable.
*/
static int check_mm_seal(unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
unsigned long nstart = start;
VMA_ITERATOR(vmi, current->mm, start);
/* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
if (vma->vm_start > nstart)
/* unallocated memory found. */
return -ENOMEM;
if (vma->vm_end >= end)
return 0;
nstart = vma->vm_end;
}
return -ENOMEM;
}
/*
 * Apply sealing: walk every VMA overlapping [start, end) and set
 * VM_SEALED on each, splitting VMAs at the range boundaries as needed.
 * Returns 0 on success or a negative error from mseal_fixup().
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already rejected the ENOMEM cases,
	 * so vma is not NULL here and the range contains no gaps.
	 */
	prev = vma_prev(&vmi);
	/* If start falls inside vma, vma itself is the predecessor to merge with. */
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		/* Clamp the sealed span to the requested range end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}
	return 0;
}
/*
 * mseal(2) seals the VM's meta data from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range by addr/len must meet:
 * start (addr) must be in a valid VMA.
 * end (addr + len) must be in a valid VMA.
 * no gap (unallocated memory) between start and end.
 * start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing.
 * 1> Unmapping, moving to another location, and shrinking
 * the size, via munmap() and mremap(), can leave an empty
 * space, therefore can be replaced with a VMA with a new
 * set of attributes.
 * 2> Moving or expanding a different vma into the current location,
 * via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion, via mremap(), does not appear to pose any
 * specific risks to sealed VMAs. It is included anyway because
 * the use case is unclear. In any case, users can rely on
 * merging to expand a sealed VMA.
 * 5> mprotect and pkey_mprotect.
 * 6> Some destructive madvise() behavior (e.g. MADV_DONTNEED)
 * for anonymous memory, when users don't have write permission to the
 * memory. Those behaviors can alter region contents by discarding pages,
 * effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 * zero: success.
 * -EINVAL:
 * invalid input flags.
 * start address is not page aligned.
 * Address range (start + len) overflow.
 * -ENOMEM:
 * addr is not a valid address (not allocated).
 * end (start + len) is not a valid address.
 * a gap (unallocated memory) between start and end.
 * -EPERM:
 * - On 32-bit architectures, sealing is not supported.
 * Note:
 * user can call mseal(2) multiple times. Adding a seal on an
 * already sealed memory is a no-action (no error).
 *
 * unseal() is not supported.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass, this helps to avoid
	 * partial sealing in case of error in input address range,
	 * e.g. ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass, this should succeed, unless there are errors
	 * from vma_modify_flags, e.g. merge/split error, or process
	 * reaching the max supported VMAs, however, those cases shall
	 * be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	/* Unlock the same mm we locked above (mm == current->mm). */
	mmap_write_unlock(mm);
	return ret;
}
/* mseal(2) syscall entry point; see do_mseal() for full semantics. */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}