mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
Update the vma_modify_flags() and vma_modify_flags_uffd() functions to accept a vma_flags_t parameter rather than a vm_flags_t one, and propagate the changes as needed to implement this change. Also add vma_flags_reset_once() as a replacement for vm_flags_reset_once(). We still need to be careful here because we need to avoid tearing, so maintain the assumption that the first system word of flags is the only set that requires protection from tearing, and retain this functionality. We can copy the remainder of VMA flags above 64 bits normally. But hopefully by the time that happens, we will have replaced the logic that requires these WRITE_ONCE()'s with something else. We also replace instances of vm_flags_reset() with a simple write of VMA flags. We no longer perform a number of checks, most notably the VMA flags asserts, because: 1. We might be operating on a VMA that is not yet added to the tree. 2. We might be operating on a VMA that is now detached. 3. Really in all but core code, you should be using vma_desc_xxx(). 4. Other VMA fields are manipulated with no such checks. 5. It'd be egregious to have to add variants of flag functions just to account for cases such as the above, especially when we don't do so for other VMA fields. Drivers are the problematic cases and why it was especially important (and also for debug as VMA locks were introduced); the mmap_prepare work is solving this generally. Additionally, we can fairly safely assume by this point the soft-dirty flags are being set correctly, so it's reasonable to drop this also. Finally, update the VMA tests to reflect this.
Link: https://lkml.kernel.org/r/51afbb2b8c3681003cc7926647e37335d793836e.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org> Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org> Cc: Albert Ou <aou@eecs.berkeley.edu> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Alexandre Ghiti <alex@ghiti.fr> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com> Cc: "Borislav Petkov (AMD)" <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chengming Zhou <chengming.zhou@linux.dev> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Christian Brauner <brauner@kernel.org> Cc: David Hildenbrand <david@kernel.org> Cc: Dinh Nguyen <dinguyen@kernel.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Cc: Johannes Berg <johannes@sipsolutions.net> Cc: Kees Cook <kees@kernel.org> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Madhavan Srinivasan <maddy@linux.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Ondrej Mosnacek <omosnace@redhat.com> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Paul Moore <paul@paul-moore.com> Cc: Pedro Falcato <pfalcato@suse.de> Cc: Richard Weinberger <richard@nod.at> Cc: Russell King <linux@armlinux.org.uk> Cc: Stephen Smalley <stephen.smalley.work@gmail.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Vineet Gupta <vgupta@kernel.org> Cc: WANG Xuerui <kernel@xen0n.name> Cc: Will Deacon <will@kernel.org> Cc: xu xin <xu.xin16@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
195 lines
5.1 KiB
C
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement mseal() syscall.
 *
 * Copyright (c) 2023,2024 Google, Inc.
 *
 * Author: Jeff Xu <jeffxu@chromium.org>
 */
|
|
|
|
#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
|
|
|
|
/*
 * mseal() disallows an input range which contains unmapped ranges (VMA holes).
 *
 * It disallows unmapped regions from start to end whether they exist at the
 * start, in the middle, or at the end of the range, or any combination thereof.
 *
 * This is because after sealing a range, there's nothing to stop memory mapping
 * of ranges in the remaining gaps later, meaning that the user might then
 * wrongly consider the entirety of the mseal()'d range to be sealed when it
 * in fact isn't.
 */

/*
 * Does the [start, end) range contain any unmapped memory?
 *
 * We ensure that:
 * - start is part of a valid VMA.
 * - end is part of a valid VMA.
 * - no gap (unallocated memory) exists between start and end.
 */
|
|
static bool range_contains_unmapped(struct mm_struct *mm,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
unsigned long prev_end = start;
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (vma->vm_start > prev_end)
|
|
return true;
|
|
|
|
prev_end = vma->vm_end;
|
|
}
|
|
|
|
return prev_end < end;
|
|
}
|
|
|
|
/*
 * Mark every VMA intersecting [start, end) as sealed.
 *
 * The caller has already verified (under the mmap write lock) that the range
 * contains no gaps, and holds the mmap write lock across this call.
 *
 * VMAs that only partially intersect the range are split by
 * vma_modify_flags() so that sealing applies to exactly [start, end).
 * Already-sealed VMAs are skipped, making repeated mseal() calls idempotent.
 *
 * Returns 0 on success or a negative errno if a VMA split/merge fails
 * (e.g. -ENOMEM); in that case a prefix of the range may already be sealed.
 */
static int mseal_apply(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, start);

	/* We know there are no gaps so this will be non-NULL. */
	vma = vma_iter_load(&vmi);
	/*
	 * Track the VMA preceding the iteration point for merge decisions.
	 * If start lies strictly inside the first VMA, a split will occur
	 * there, so the first VMA itself acts as "prev".
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		/* Clamp to the requested range for partially covered VMAs. */
		const unsigned long curr_start = MAX(vma->vm_start, start);
		const unsigned long curr_end = MIN(vma->vm_end, end);

		if (!vma_test(vma, VMA_SEALED_BIT)) {
			/* Build the desired flag set with the seal bit added. */
			vma_flags_t vma_flags = vma->flags;

			vma_flags_set(&vma_flags, VMA_SEALED_BIT);

			/*
			 * Split/merge as needed so [curr_start, curr_end)
			 * is its own VMA (or merged with a compatible
			 * neighbour) carrying the new flags.
			 */
			vma = vma_modify_flags(&vmi, prev, vma, curr_start,
					curr_end, &vma_flags);
			if (IS_ERR(vma))
				return PTR_ERR(vma);
			/* Exclude concurrent per-VMA-lock readers before writing flags. */
			vma_start_write(vma);
			vma_set_flags(vma, VMA_SEALED_BIT);
		}

		prev = vma;
	}

	return 0;
}
|
|
|
|
/*
 * mseal(2) seals the VM's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range by addr/len must meet:
 *	start (addr) must be in a valid VMA.
 *	end (addr + len) must be in a valid VMA.
 *	no gap (unallocated memory) between start and end.
 *	start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing.
 *	1> Unmapping, moving to another location, and shrinking
 *	   the size, via munmap() and mremap(), can leave an empty
 *	   space, therefore can be replaced with a VMA with a new
 *	   set of attributes.
 *	2> Moving or expanding a different vma into the current location,
 *	   via mremap().
 *	3> Modifying a VMA via mmap(MAP_FIXED).
 *	4> Size expansion, via mremap(), does not appear to pose any
 *	   specific risks to sealed VMAs. It is included anyway because
 *	   the use case is unclear. In any case, users can rely on
 *	   merging to expand a sealed VMA.
 *	5> mprotect and pkey_mprotect.
 *	6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	   for anonymous memory, when users don't have write permission to the
 *	   memory. Those behaviors can alter region contents by discarding pages,
 *	   effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *	zero: success.
 *	-EINVAL:
 *		invalid input flags.
 *		start address is not page aligned.
 *		Address range (start + len) overflow.
 *	-ENOMEM:
 *		addr is not a valid address (not allocated).
 *		end (start + len) is not a valid address.
 *		a gap (unallocated memory) between start and end.
 *	-EPERM:
 *		- In 32 bit architecture, sealing is not supported.
 * Note:
 *	user can call mseal(2) multiple times; adding a seal on
 *	already-sealed memory is a no-op (no error).
 *
 *	unseal() is not supported.
 */
|
|
/*
 * Validate the user-supplied range and flags, then seal every VMA in
 * [start, start + PAGE_ALIGN(len_in)) under the mmap write lock.
 *
 * Returns 0 on success, -EINVAL for bad flags/alignment/overflow,
 * -ENOMEM if the range contains unmapped memory, -EINTR if the lock
 * acquisition was interrupted by a fatal signal.
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long end;
	size_t len;
	int ret;

	/* No flag bits are currently defined; reject anything non-zero. */
	if (flags)
		return -EINVAL;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	/* Rounding a small non-zero length up to page size may wrap to zero. */
	len = PAGE_ALIGN(len_in);
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	/* An empty range is trivially sealed. */
	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (range_contains_unmapped(mm, start, end)) {
		ret = -ENOMEM;
	} else {
		/*
		 * Second pass: expected to succeed, barring rare failures
		 * from vma_modify_flags() (merge/split error, or the
		 * process hitting its maximum supported VMA count).
		 */
		ret = mseal_apply(mm, start, end);
	}

	mmap_write_unlock(mm);
	return ret;
}
|
|
|
|
/*
 * mseal(2) syscall entry point: thin wrapper delegating all validation
 * and sealing work to do_mseal(). See the comment above do_mseal() for
 * the full contract and error semantics.
 */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
|