mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
In load_elf_binary as part of the execve(), when the current
task’s personality has MMAP_PAGE_ZERO set, the kernel allocates
one page at address 0. According to the comment:
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
and some applications "depend" upon this behavior.
Since we do not have the power to recompile these, we
emulate the SVr4 behavior. Sigh. */
At one point, Linus suggested removing this [1].
Code search in debian didn't see much use of MMAP_PAGE_ZERO [2],
it exists in util and test (rr).
Sealing this is probably safe, the comment doesn't say
the app ever wanting to change the mapping to rwx. Sealing
also ensures that never happens.
If there is a complaint, we can make this configurable.
Link: https://lore.kernel.org/lkml/CAHk-=whVa=nm_GW=NVfPHqcxDbWt4JjjK1YWb0cLjO4ZSGyiDA@mail.gmail.com/ [1]
Link: https://codesearch.debian.net/search?q=MMAP_PAGE_ZERO&literal=1&perpkg=1&page=1 [2]
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Link: https://lore.kernel.org/r/20240806214931.2198172-2-jeffxu@google.com
Signed-off-by: Kees Cook <kees@kernel.org>
308 lines
7.1 KiB
C
308 lines
7.1 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Implement mseal() syscall.
|
|
*
|
|
* Copyright (c) 2023,2024 Google, Inc.
|
|
*
|
|
* Author: Jeff Xu <jeffxu@chromium.org>
|
|
*/
|
|
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mm_inline.h>
|
|
#include <linux/mmu_context.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/sched.h>
|
|
#include "internal.h"
|
|
|
|
static inline bool vma_is_sealed(struct vm_area_struct *vma)
|
|
{
|
|
return (vma->vm_flags & VM_SEALED);
|
|
}
|
|
|
|
static inline void set_vma_sealed(struct vm_area_struct *vma)
|
|
{
|
|
vm_flags_set(vma, VM_SEALED);
|
|
}
|
|
|
|
/*
|
|
* check if a vma is sealed for modification.
|
|
* return true, if modification is allowed.
|
|
*/
|
|
static bool can_modify_vma(struct vm_area_struct *vma)
|
|
{
|
|
if (unlikely(vma_is_sealed(vma)))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool is_madv_discard(int behavior)
|
|
{
|
|
return behavior &
|
|
(MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED |
|
|
MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK);
|
|
}
|
|
|
|
static bool is_ro_anon(struct vm_area_struct *vma)
|
|
{
|
|
/* check anonymous mapping. */
|
|
if (vma->vm_file || vma->vm_flags & VM_SHARED)
|
|
return false;
|
|
|
|
/*
|
|
* check for non-writable:
|
|
* PROT=RO or PKRU is not writeable.
|
|
*/
|
|
if (!(vma->vm_flags & VM_WRITE) ||
|
|
!arch_vma_access_permitted(vma, true, false, false))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Check if the vmas of a memory range are allowed to be modified.
|
|
* the memory ranger can have a gap (unallocated memory).
|
|
* return true, if it is allowed.
|
|
*/
|
|
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, mm, start);
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (unlikely(!can_modify_vma(vma)))
|
|
return false;
|
|
}
|
|
|
|
/* Allow by default. */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Check if the vmas of a memory range are allowed to be modified by madvise.
|
|
* the memory ranger can have a gap (unallocated memory).
|
|
* return true, if it is allowed.
|
|
*/
|
|
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
|
|
int behavior)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, mm, start);
|
|
|
|
if (!is_madv_discard(behavior))
|
|
return true;
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end)
|
|
if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
|
|
return false;
|
|
|
|
/* Allow by default. */
|
|
return true;
|
|
}
|
|
|
|
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|
struct vm_area_struct **prev, unsigned long start,
|
|
unsigned long end, vm_flags_t newflags)
|
|
{
|
|
int ret = 0;
|
|
vm_flags_t oldflags = vma->vm_flags;
|
|
|
|
if (newflags == oldflags)
|
|
goto out;
|
|
|
|
vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
|
|
if (IS_ERR(vma)) {
|
|
ret = PTR_ERR(vma);
|
|
goto out;
|
|
}
|
|
|
|
set_vma_sealed(vma);
|
|
out:
|
|
*prev = vma;
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Check for do_mseal:
|
|
* 1> start is part of a valid vma.
|
|
* 2> end is part of a valid vma.
|
|
* 3> No gap (unallocated address) between start and end.
|
|
* 4> map is sealable.
|
|
*/
|
|
static int check_mm_seal(unsigned long start, unsigned long end)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
unsigned long nstart = start;
|
|
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
/* going through each vma to check. */
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (vma->vm_start > nstart)
|
|
/* unallocated memory found. */
|
|
return -ENOMEM;
|
|
|
|
if (vma->vm_end >= end)
|
|
return 0;
|
|
|
|
nstart = vma->vm_end;
|
|
}
|
|
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/*
|
|
* Apply sealing.
|
|
*/
|
|
static int apply_mm_seal(unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long nstart;
|
|
struct vm_area_struct *vma, *prev;
|
|
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
vma = vma_iter_load(&vmi);
|
|
/*
|
|
* Note: check_mm_seal should already checked ENOMEM case.
|
|
* so vma should not be null, same for the other ENOMEM cases.
|
|
*/
|
|
prev = vma_prev(&vmi);
|
|
if (start > vma->vm_start)
|
|
prev = vma;
|
|
|
|
nstart = start;
|
|
for_each_vma_range(vmi, vma, end) {
|
|
int error;
|
|
unsigned long tmp;
|
|
vm_flags_t newflags;
|
|
|
|
newflags = vma->vm_flags | VM_SEALED;
|
|
tmp = vma->vm_end;
|
|
if (tmp > end)
|
|
tmp = end;
|
|
error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
|
|
if (error)
|
|
return error;
|
|
nstart = vma_iter_end(&vmi);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* mseal(2) seals the VM's meta data from
|
|
* selected syscalls.
|
|
*
|
|
* addr/len: VM address range.
|
|
*
|
|
* The address range by addr/len must meet:
|
|
* start (addr) must be in a valid VMA.
|
|
* end (addr + len) must be in a valid VMA.
|
|
* no gap (unallocated memory) between start and end.
|
|
* start (addr) must be page aligned.
|
|
*
|
|
* len: len will be page aligned implicitly.
|
|
*
|
|
* Below VMA operations are blocked after sealing.
|
|
* 1> Unmapping, moving to another location, and shrinking
|
|
* the size, via munmap() and mremap(), can leave an empty
|
|
* space, therefore can be replaced with a VMA with a new
|
|
* set of attributes.
|
|
* 2> Moving or expanding a different vma into the current location,
|
|
* via mremap().
|
|
* 3> Modifying a VMA via mmap(MAP_FIXED).
|
|
* 4> Size expansion, via mremap(), does not appear to pose any
|
|
* specific risks to sealed VMAs. It is included anyway because
|
|
* the use case is unclear. In any case, users can rely on
|
|
* merging to expand a sealed VMA.
|
|
* 5> mprotect and pkey_mprotect.
|
|
* 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
|
|
* for anonymous memory, when users don't have write permission to the
|
|
* memory. Those behaviors can alter region contents by discarding pages,
|
|
* effectively a memset(0) for anonymous memory.
|
|
*
|
|
* flags: reserved.
|
|
*
|
|
* return values:
|
|
* zero: success.
|
|
* -EINVAL:
|
|
* invalid input flags.
|
|
* start address is not page aligned.
|
|
* Address arange (start + len) overflow.
|
|
* -ENOMEM:
|
|
* addr is not a valid address (not allocated).
|
|
* end (start + len) is not a valid address.
|
|
* a gap (unallocated memory) between start and end.
|
|
* -EPERM:
|
|
* - In 32 bit architecture, sealing is not supported.
|
|
* Note:
|
|
* user can call mseal(2) multiple times, adding a seal on an
|
|
* already sealed memory is a no-action (no error).
|
|
*
|
|
* unseal() is not supported.
|
|
*/
|
|
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
|
|
{
|
|
size_t len;
|
|
int ret = 0;
|
|
unsigned long end;
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
ret = can_do_mseal(flags);
|
|
if (ret)
|
|
return ret;
|
|
|
|
start = untagged_addr(start);
|
|
if (!PAGE_ALIGNED(start))
|
|
return -EINVAL;
|
|
|
|
len = PAGE_ALIGN(len_in);
|
|
/* Check to see whether len was rounded up from small -ve to zero. */
|
|
if (len_in && !len)
|
|
return -EINVAL;
|
|
|
|
end = start + len;
|
|
if (end < start)
|
|
return -EINVAL;
|
|
|
|
if (end == start)
|
|
return 0;
|
|
|
|
if (mmap_write_lock_killable(mm))
|
|
return -EINTR;
|
|
|
|
/*
|
|
* First pass, this helps to avoid
|
|
* partial sealing in case of error in input address range,
|
|
* e.g. ENOMEM error.
|
|
*/
|
|
ret = check_mm_seal(start, end);
|
|
if (ret)
|
|
goto out;
|
|
|
|
/*
|
|
* Second pass, this should success, unless there are errors
|
|
* from vma_modify_flags, e.g. merge/split error, or process
|
|
* reaching the max supported VMAs, however, those cases shall
|
|
* be rare.
|
|
*/
|
|
ret = apply_mm_seal(start, end);
|
|
|
|
out:
|
|
mmap_write_unlock(current->mm);
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
|
|
flags)
|
|
{
|
|
return do_mseal(start, len, flags);
|
|
}
|