diff --git a/arch/Kconfig b/arch/Kconfig index a4429bcd609e..84c94a89e75b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -331,6 +331,7 @@ config HAVE_ARCH_SECCOMP_FILTER - secure_computing is called from a ptrace_event()-safe context - secure_computing return value is checked and a return value of -1 results in the system call being skipped immediately. + - seccomp syscall wired up config SECCOMP_FILTER def_bool y diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h index 8dcd9c702d90..b00ef075bc2e 100644 --- a/arch/arm/include/asm/barrier.h +++ b/arch/arm/include/asm/barrier.h @@ -59,6 +59,21 @@ #define smp_wmb() dmb() #endif +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index f1d96d4e8092..6db3caacb31a 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -97,8 +97,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->ARM_r0 + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { /* ARM tasks don't change audit architectures on the fly. */ return AUDIT_ARCH_ARM; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2221396c2a6c..43f1a2ee307c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -20,6 +20,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK @@ -232,6 +233,20 @@ config ARMV7_COMPAT_CPUINFO source "mm/Kconfig" +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + ---help--- + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index d4a63338a53c..78e20ba8806b 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -35,10 +35,60 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #else + #define smp_mb() asm volatile("dmb ish" : : : "memory") #define smp_rmb() asm volatile("dmb ishld" : : : "memory") #define smp_wmb() asm volatile("dmb ishst" : : : "memory") + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + switch (sizeof(*p)) { \ + case 4: \ + asm volatile ("stlr %w1, %0" \ + : "=Q" (*p) : "r" (v) : "memory"); \ + break; \ + case 8: \ + asm volatile ("stlr %1, %0" \ + : "=Q" (*p) : "r" (v) : "memory"); \ + break; \ + } \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1; \ + compiletime_assert_atomic_type(*p); \ + switch (sizeof(*p)) { \ + case 4: \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + break; \ + case 8: \ + asm volatile ("ldar %0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + break; \ + } \ + ___p1; \ +}) + #endif #define read_barrier_depends() do { } while(0) diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index ae0004fe6c23..a79da08994f2 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -191,6 +191,13 @@ typedef struct compat_siginfo { compat_long_t _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; + + /* SIGSYS */ + struct { + compat_uptr_t _call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; } _sifields; } compat_siginfo_t; diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index dbe3b00d1eb7..58768774c7f4 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -60,6 +60,15 @@ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 + +/* + * used to skip a system call when tracer changes its number to -1 + * with ptrace(PTRACE_SET_SYSCALL) + */ +#define RET_SKIP_SYSCALL -1 +#define RET_SKIP_SYSCALL_TRACE -2 +#define IS_SKIP_SYSCALL(no) ((int)(no & 0xffffffff) == -1) + #ifndef __ASSEMBLY__ /* sizeof(struct user) for AArch32 */ diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h new file mode 100644 index 000000000000..bec3a43f7b17 --- /dev/null +++ b/arch/arm64/include/asm/seccomp.h @@ -0,0 +1,25 @@ +/* + * arch/arm64/include/asm/seccomp.h + * + * Copyright (C) 2014 Linaro Limited + * Author: AKASHI Takahiro linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include + +#ifdef CONFIG_COMPAT +#define __NR_seccomp_read_32 __NR_compat_read +#define __NR_seccomp_write_32 __NR_compat_write +#define __NR_seccomp_exit_32 __NR_compat_exit +#define __NR_seccomp_sigreturn_32 __NR_compat_rt_sigreturn +#endif /* CONFIG_COMPAT */ + +#include + +#endif /* _ASM_SECCOMP_H */ diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index a8f8f6992987..2a3957faa702 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -30,6 +30,9 @@ * Compat syscall numbers used by the AArch64 kernel. */ #define __NR_compat_restart_syscall 0 +#define __NR_compat_exit 1 +#define __NR_compat_read 3 +#define __NR_compat_write 4 #define __NR_compat_sigreturn 119 #define __NR_compat_rt_sigreturn 173 @@ -40,7 +43,7 @@ #define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5) -#define __NR_compat_syscalls 378 +#define __NR_compat_syscalls 384 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index dd336c150f3f..76d094565090 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -781,3 +781,11 @@ __SYSCALL(__NR_process_vm_writev, compat_sys_process_vm_writev) __SYSCALL(__NR_kcmp, sys_kcmp) #define __NR_finit_module 379 __SYSCALL(__NR_finit_module, sys_finit_module) +/* #define __NR_sched_setattr 380 */ +__SYSCALL(380, sys_ni_syscall) +/* #define __NR_sched_getattr 381 */ +__SYSCALL(381, sys_ni_syscall) +/* #define __NR_renameat2 382 */ +__SYSCALL(382, sys_ni_syscall) +#define __NR_seccomp 383 +__SYSCALL(__NR_seccomp, sys_seccomp) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 6913643bbe54..49c61746297d 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -23,6 +23,7 @@ #include +#define PTRACE_SET_SYSCALL 23 /* * PSR bits diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3c93e1a9ea8d..8927e52be32f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -662,6 +663,10 @@ __sys_trace: mov x0, sp bl syscall_trace_enter adr lr, __sys_trace_return // return address + cmp w0, #RET_SKIP_SYSCALL_TRACE // skip syscall and tracing? + b.eq ret_to_user + cmp w0, #RET_SKIP_SYSCALL // skip syscall? + b.eq __sys_trace_return_skipped uxtw scno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs cmp scno, sc_nr // check upper syscall limit @@ -675,6 +680,7 @@ __sys_trace: __sys_trace_return: str x0, [sp] // save returned x0 +__sys_trace_return_skipped: // x0 already in regs[0] mov x0, sp bl syscall_trace_exit b ret_to_user diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index ee856d9f6f64..f47e70fed6af 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1064,7 +1065,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { - return ptrace_request(child, request, addr, data); + int ret; + + switch (request) { + case PTRACE_SET_SYSCALL: + task_pt_regs(child)->syscallno = data; + ret = 0; + break; + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; } enum ptrace_syscall_dir { @@ -1096,9 +1109,33 @@ static void tracehook_report_syscall(struct pt_regs *regs, asmlinkage int syscall_trace_enter(struct pt_regs *regs) { + unsigned int saved_syscallno = regs->syscallno; + + /* Do the secure computing check first; failures should be fast. */ + if (secure_computing(regs->syscallno) == -1) + return RET_SKIP_SYSCALL_TRACE; + if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + if (IS_SKIP_SYSCALL(regs->syscallno)) { + /* + * RESTRICTION: we can't modify a return value of user + * issued syscall(-1) here. In order to ease this flavor, + * we need to treat whatever value in x0 as a return value, + * but this might result in a bogus value being returned. + */ + /* + * NOTE: syscallno may also be set to -1 if fatal signal is + * detected in tracehook_report_syscall_entry(), but since + * a value set to x0 here is not used in this case, we may + * neglect the case. + */ + if (!test_thread_flag(TIF_SYSCALL_TRACE) || + (IS_SKIP_SYSCALL(saved_syscallno))) + regs->regs[0] = -ENOSYS; + } + audit_syscall_entry(syscall_get_arch(), regs->syscallno, regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b4692837e7ab..e1b8b9fb274e 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -211,6 +211,14 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user((compat_uptr_t)(unsigned long)from->si_ptr, &to->si_ptr); break; +#ifdef __ARCH_SIGSYS + case __SI_SYS: + err |= __put_user((compat_uptr_t)(unsigned long) + from->si_call_addr, &to->si_call_addr); + err |= __put_user(from->si_syscall, &to->si_syscall); + err |= __put_user(from->si_arch, &to->si_arch); + break; +#endif default: /* this is just in case for now ... */ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h index 60576e06b6fb..d0a69aa35e27 100644 --- a/arch/ia64/include/asm/barrier.h +++ b/arch/ia64/include/asm/barrier.h @@ -45,13 +45,36 @@ # define smp_rmb() rmb() # define smp_wmb() wmb() # define smp_read_barrier_depends() read_barrier_depends() + #else + # define smp_mb() barrier() # define smp_rmb() barrier() # define smp_wmb() barrier() # define smp_read_barrier_depends() do { } while(0) + #endif +/* + * IA64 GCC turns volatile stores into st.rel and volatile loads into ld.acq no + * need for asm trickery! + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + /* * XXX check on this ---I suspect what Linus really wants here is * acquire vs release semantics but we can't discuss this stuff with diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h index c90bfc6bf648..5d6b4b407dda 100644 --- a/arch/metag/include/asm/barrier.h +++ b/arch/metag/include/asm/barrier.h @@ -82,4 +82,19 @@ static inline void fence(void) #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* _ASM_METAG_BARRIER_H */ diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h index 314ab5532019..52c5b61d7aba 100644 --- a/arch/mips/include/asm/barrier.h +++ b/arch/mips/include/asm/barrier.h @@ -180,4 +180,19 @@ #define nudge_writes() mb() #endif +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* __ASM_BARRIER_H */ diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index ae782254e731..f89da808ce31 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -45,11 +45,15 @@ # define SMPWMB eieio #endif +#define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory") + #define smp_mb() mb() -#define smp_rmb() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory") +#define smp_rmb() __lwsync() #define smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory") #define smp_read_barrier_depends() read_barrier_depends() #else +#define __lwsync() barrier() + #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() @@ -65,4 +69,19 @@ #define data_barrier(x) \ asm volatile("twi 0,%0,0; isync" : : "r" (x) : "memory"); +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + __lwsync(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + __lwsync(); \ + ___p1; \ +}) + #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 16760eeb79b0..578680f6207a 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -32,4 +32,19 @@ #define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* __ASM_BARRIER_H */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index cd29d2f4e4f3..bebc0bd8abc2 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -89,11 +89,10 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr2 = args[0]; } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) + if (test_tsk_thread_flag(current, TIF_31BIT)) return AUDIT_ARCH_S390; #endif return sizeof(long) == 8 ? AUDIT_ARCH_S390X : AUDIT_ARCH_S390; diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h index 95d45986f908..b5aad964558e 100644 --- a/arch/sparc/include/asm/barrier_64.h +++ b/arch/sparc/include/asm/barrier_64.h @@ -53,4 +53,19 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define smp_read_barrier_depends() do { } while(0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* !(__SPARC64_BARRIER_H) */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index c6cd358a1eec..04a48903b2eb 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -92,12 +92,53 @@ #endif #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else +#else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif /* SMP */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +/* + * For either of these options x86 doesn't have a strong TSO memory + * model and we should fall back to full barriers. + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + +#else /* regular x86 TSO memory ordering */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2e188d68397c..f106908a12ec 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -90,8 +90,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { return AUDIT_ARCH_I386; } @@ -220,8 +219,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_IA32_EMULATION /* @@ -233,7 +231,7 @@ static inline int syscall_get_arch(struct task_struct *task, * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(current)->status & TS_COMPAT) return AUDIT_ARCH_I386; #endif /* Both x32 and x86_64 are considered "64-bit". */ diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index aabfb8380a1c..01ed50255473 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -357,3 +357,7 @@ 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module +# 351 i386 sched_setattr sys_sched_setattr +# 352 i386 sched_getattr sys_sched_getattr +# 353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65dfd14f..a3c38bbd6f01 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -320,6 +320,10 @@ 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp 313 common finit_module sys_finit_module +# 314 common sched_setattr sys_sched_setattr +# 315 common sched_getattr sys_sched_getattr +# 316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/exec.c b/fs/exec.c index ffd7a813ad3d..c568bdce6413 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1220,7 +1220,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against - * PTRACE_ATTACH + * PTRACE_ATTACH or seccomp thread-sync */ static int check_unsafe_exec(struct linux_binprm *bprm) { @@ -1239,7 +1239,7 @@ static int check_unsafe_exec(struct linux_binprm *bprm) * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; n_fs = 1; @@ -1286,7 +1286,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && + !task_no_new_privs(current) && kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 639d7a4d033b..01613b382b0e 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -46,5 +46,20 @@ #define read_barrier_depends() do {} while (0) #define smp_read_barrier_depends() do {} while (0) +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h new file mode 100644 index 000000000000..663ac3dc02ea --- /dev/null +++ b/include/asm-generic/seccomp.h @@ -0,0 +1,29 @@ +/* + * include/asm-generic/seccomp.h + * + * Copyright (C) 2014 Linaro Limited + * Author: AKASHI Takahiro linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _ASM_GENERIC_SECCOMP_H +#define _ASM_GENERIC_SECCOMP_H + +#include + +#if defined(CONFIG_COMPAT) && !defined(__NR_seccomp_read_32) +#define __NR_seccomp_read_32 __NR_read +#define __NR_seccomp_write_32 __NR_write +#define __NR_seccomp_exit_32 __NR_exit +#define __NR_seccomp_sigreturn_32 __NR_rt_sigreturn +#endif /* CONFIG_COMPAT && ! already defined */ + +#define __NR_seccomp_read __NR_read +#define __NR_seccomp_write __NR_write +#define __NR_seccomp_exit __NR_exit +#define __NR_seccomp_sigreturn __NR_rt_sigreturn + +#endif /* _ASM_GENERIC_SECCOMP_H */ + diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 5b09392db673..d401e5463fb0 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -144,8 +144,6 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, /** * syscall_get_arch - return the AUDIT_ARCH for the current system call - * @task: task of interest, must be in system call entry tracing - * @regs: task_pt_regs() of @task * * Returns the AUDIT_ARCH_* based on the system call convention in use. * @@ -155,5 +153,5 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. */ -int syscall_get_arch(struct task_struct *task, struct pt_regs *regs); +int syscall_get_arch(void); #endif /* _ASM_SYSCALL_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 92669cd182a6..fe7a686dfd8d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -298,6 +298,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #endif +/* Is this type a native word size -- useful for atomic operations */ +#ifndef __native_word +# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +#endif + /* Compile time object size, -1 for unknown */ #ifndef __compiletime_object_size # define __compiletime_object_size(obj) -1 @@ -337,6 +342,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define compiletime_assert(condition, msg) \ _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) +#define compiletime_assert_atomic_type(t) \ + compiletime_assert(__native_word(t), \ + "Need native word sized stores/loads for atomicity.") + /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5cd0f0949927..998f4dfedecf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -40,6 +40,7 @@ extern struct fs_struct init_fs; #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ + .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ @@ -213,6 +214,7 @@ extern struct task_group root_task_group; [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7ccfeb1e4067..da5fe76a069b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -476,6 +476,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -1117,13 +1118,12 @@ struct task_struct { * execve */ unsigned in_iowait:1; - /* task may not gain privileges */ - unsigned no_new_privs:1; - /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; + unsigned long atomic_flags; /* Flags needing atomic access. */ + pid_t pid; pid_t tgid; @@ -1156,6 +1156,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -1687,6 +1688,19 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0x00000001 /* May not gain new privileges. */ + +static inline bool task_no_new_privs(struct task_struct *p) +{ + return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + +static inline void task_set_no_new_privs(struct task_struct *p) +{ + set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags); +} + /* * task->jobctl flags */ @@ -2166,6 +2180,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 6f19cfd1840e..9687691799ff 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,6 +3,8 @@ #include +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) + #ifdef CONFIG_SECCOMP #include @@ -14,11 +16,11 @@ struct seccomp_filter; * * @mode: indicates one of the valid values above for controlled * system calls available to a process. - * @filter: The metadata and ruleset for determining what system calls - * are allowed for a task. + * @filter: must always point to a valid seccomp-filter or NULL as it is + * accessed without locking during system call entry. * * @filter must only be accessed from the context of current as there - * is no locking. + * is no read locking. */ struct seccomp { int mode; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4147d700a293..2a955dcc863c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -841,4 +841,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2); asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags); +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 0cc74c4403e4..b422ad5d238b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -692,9 +692,19 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ __SYSCALL(__NR_kcmp, sys_kcmp) #define __NR_finit_module 273 __SYSCALL(__NR_finit_module, sys_finit_module) +/* Backporting seccomp, skip a few ... + * #define __NR_sched_setattr 274 +__SYSCALL(__NR_sched_setattr, sys_sched_setattr) + * #define __NR_sched_getattr 275 +__SYSCALL(__NR_sched_getattr, sys_sched_getattr) + * #define __NR_renameat2 276 +__SYSCALL(__NR_renameat2, sys_renameat2) + */ +#define __NR_seccomp 277 +__SYSCALL(__NR_seccomp, sys_seccomp) #undef __NR_syscalls -#define __NR_syscalls 274 +#define __NR_syscalls 278 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index ac2dc9f72973..0f238a43ff1e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -10,6 +10,13 @@ #define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */ #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ +/* Valid operations for seccomp syscall. */ +#define SECCOMP_SET_MODE_STRICT 0 +#define SECCOMP_SET_MODE_FILTER 1 + +/* Valid flags for SECCOMP_SET_MODE_FILTER */ +#define SECCOMP_FILTER_FLAG_TSYNC 1 + /* * All BPF programs must return a 32-bit value. * The bottom 16-bits are for optional return data. diff --git a/kernel/exit.c b/kernel/exit.c index 6a057750ebbb..8e2166363b4b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 41671a5d637d..0ff07d94e07c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -327,6 +327,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. + */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); @@ -1061,6 +1070,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1102,6 +1116,39 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) p->flags = new_flags; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + assert_spin_locked(¤t->sighand->siglock); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and between before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1205,7 +1252,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1447,6 +1493,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(¤t->sighand->siglock); + /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the @@ -1487,6 +1539,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __this_cpu_inc(process_counts); + } else { + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..1fbb1a2bc459 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,15 +18,17 @@ #include #include #include +#include +#include /* #define SECCOMP_DEBUG 1 */ #ifdef CONFIG_SECCOMP_FILTER #include #include +#include #include #include -#include #include #include @@ -95,7 +97,7 @@ u32 seccomp_bpf_load(int off) if (off == BPF_DATA(nr)) return syscall_get_nr(current, regs); if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); + return syscall_get_arch(); if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { unsigned long value; int arg = (off - BPF_DATA(args[0])) / sizeof(u64); @@ -201,32 +203,170 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(int syscall) { - struct seccomp_filter *f; + struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ - if (WARN_ON(current->seccomp.filter == NULL)) + if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; + /* Make sure cross-thread synced filter points somewhere sane. */ + smp_read_barrier_depends(); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - for (f = current->seccomp.filter; f; f = f->prev) { + for (; f; f = f->prev) { u32 cur_ret = sk_run_filter(NULL, f->insns); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } return ret; } +#endif /* CONFIG_SECCOMP_FILTER */ + +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ + assert_spin_locked(¤t->sighand->siglock); + + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) + return false; + + return true; +} + +static inline void seccomp_assign_mode(struct task_struct *task, + unsigned long seccomp_mode) +{ + assert_spin_locked(&task->sighand->siglock); + + task->seccomp.mode = seccomp_mode; + /* + * Make sure TIF_SECCOMP cannot be set before the mode (and + * filter) is set. + */ + smp_mb(); + set_tsk_thread_flag(task, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, + struct seccomp_filter *child) +{ + /* NULL is the root ancestor. */ + if (parent == NULL) + return 1; + for (; child; child = child->prev) + if (child == parent) + return 1; + return 0; +} /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held. + * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. + */ +static inline pid_t seccomp_can_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + assert_spin_locked(¤t->sighand->siglock); + + /* Validate all threads being eligible for synchronization. */ + caller = current; + for_each_thread(caller, thread) { + pid_t failed; + + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || + (thread->seccomp.mode == SECCOMP_MODE_FILTER && + is_ancestor(thread->seccomp.filter, + caller->seccomp.filter))) + continue; + + /* Return the first thread that cannot be synchronized. */ + failed = task_pid_vnr(thread); + /* If the pid cannot be resolved, then return -ESRCH */ + if (unlikely(WARN_ON(failed == 0))) + failed = -ESRCH; + return failed; + } + + return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + assert_spin_locked(¤t->sighand->siglock); + + /* Synchronize all threads. */ + caller = current; + for_each_thread(caller, thread) { + /* Skip current, since it needs no changes. */ + if (thread == caller) + continue; + + /* Get a task reference for the new leaf node. */ + get_seccomp_filter(caller); + /* + * Drop the task reference to the shared ancestor since + * current's path will hold a reference. (This also + * allows a put before the assignment.) + */ + put_seccomp_filter(thread); + smp_store_release(&thread->seccomp.filter, + caller->seccomp.filter); + /* + * Opt the other thread into seccomp if needed. + * As threads are considered to be trust-realm + * equivalent (see ptrace_may_access), it is safe to + * allow one thread to transition the other. + */ + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + } + } +} + +/** + * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure. */ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); @@ -234,12 +374,13 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) - return -EINVAL; + return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); for (filter = current->seccomp.filter; filter; filter = filter->prev) total_insns += filter->len + 4; /* include a 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* * Installing a seccomp filter requires that the task have @@ -247,16 +388,16 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ - if (!current->no_new_privs && + if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) - return -EACCES; + return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, GFP_KERNEL|__GFP_NOWARN); if (!filter) - return -ENOMEM; + return ERR_PTR(-ENOMEM);; atomic_set(&filter->usage, 1); filter->len = fprog->len; @@ -275,28 +416,24 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto fail; - /* - * If there is an existing filter, make it the prev and don't drop its - * task reference. - */ - filter->prev = current->seccomp.filter; - current->seccomp.filter = filter; - return 0; + return filter; + fail: kfree(filter); - return ret; + return ERR_PTR(ret); } /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ -long seccomp_attach_user_filter(char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; - long ret = -EFAULT; + struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (is_compat_task()) { @@ -309,9 +446,56 @@ long seccomp_attach_user_filter(char __user *user_filter) #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; - ret = seccomp_attach_filter(&fprog); + filter = seccomp_prepare_filter(&fprog); out: - return ret; + return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags: flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Caller must be holding current->sighand->siglock lock. + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, + struct seccomp_filter *filter) +{ + unsigned long total_insns; + struct seccomp_filter *walker; + + assert_spin_locked(¤t->sighand->siglock); + + /* Validate resulting filter length. */ + total_insns = filter->len; + for (walker = current->seccomp.filter; walker; walker = walker->prev) + total_insns += walker->len + 4; /* 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* If thread sync has been requested, check that it is possible. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + int ret; + + ret = seccomp_can_sync_threads(); + if (ret) + return ret; + } + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + + /* Now that the new filter is in place, synchronize to all threads. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + seccomp_sync_threads(); + + return 0; } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -324,6 +508,13 @@ void get_seccomp_filter(struct task_struct *tsk) atomic_inc(&orig->usage); } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + kfree(filter); + } +} + /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ void put_seccomp_filter(struct task_struct *tsk) { @@ -332,7 +523,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - kfree(freeme); + seccomp_filter_free(freeme); } } @@ -351,7 +542,7 @@ static void seccomp_send_sigsys(int syscall, int reason) info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; - info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_arch = syscall_get_arch(); info.si_syscall = syscall; force_sig_info(SIGSYS, &info, current); } @@ -376,12 +567,17 @@ static int mode1_syscalls_32[] = { int __secure_computing(int this_syscall) { - int mode = current->seccomp.mode; int exit_sig = 0; int *syscall; u32 ret; - switch (mode) { + /* + * Make sure that any changes to mode from another thread have + * been seen after TIF_SECCOMP was seen. + */ + rmb(); + + switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT @@ -467,47 +663,152 @@ long prctl_get_seccomp(void) } /** - * prctl_set_seccomp: configures current->seccomp.mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER - * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters. Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * seccomp_set_mode_strict: internal function for setting strict seccomp * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_strict(void) { + const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; long ret = -EINVAL; - if (current->seccomp.mode && - current->seccomp.mode != seccomp_mode) + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) goto out; +#ifdef TIF_NOTSC + disable_TSC(); +#endif + seccomp_assign_mode(current, seccomp_mode); + ret = 0; + +out: + spin_unlock_irq(¤t->sighand->siglock); + + return ret; +} + +#ifdef CONFIG_SECCOMP_FILTER +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags: flags to change filter behavior + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes. + * + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) +{ + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + struct seccomp_filter *prepared = NULL; + long ret = -EINVAL; + + /* Validate flags. */ + if (flags & ~SECCOMP_FILTER_FLAG_MASK) + return -EINVAL; + + /* Prepare the new filter before holding any locks. */ + prepared = seccomp_prepare_user_filter(filter); + if (IS_ERR(prepared)) + return PTR_ERR(prepared); + + /* + * Make sure we cannot change seccomp or nnp state via TSYNC + * while another thread is in the middle of calling exec. + */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC && + mutex_lock_killable(¤t->signal->cred_guard_mutex)) + goto out_free; + + spin_lock_irq(¤t->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + + ret = seccomp_attach_filter(flags, prepared); + if (ret) + goto out; + /* Do not free the successfully attached filter. */ + prepared = NULL; + + seccomp_assign_mode(current, seccomp_mode); +out: + spin_unlock_irq(¤t->sighand->siglock); + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + mutex_unlock(¤t->signal->cred_guard_mutex); +out_free: + seccomp_filter_free(prepared); + return ret; +} +#else +static inline long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) +{ + return -EINVAL; +} +#endif + +/* Common entry point for both prctl and syscall. */ +static long do_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs) +{ + switch (op) { + case SECCOMP_SET_MODE_STRICT: + if (flags != 0 || uargs != NULL) + return -EINVAL; + return seccomp_set_mode_strict(); + case SECCOMP_SET_MODE_FILTER: + return seccomp_set_mode_filter(flags, uargs); + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, + const char __user *, uargs) +{ + return do_seccomp(op, flags, uargs); +} + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ + unsigned int op; + char __user *uargs; + switch (seccomp_mode) { case SECCOMP_MODE_STRICT: - ret = 0; -#ifdef TIF_NOTSC - disable_TSC(); -#endif + op = SECCOMP_SET_MODE_STRICT; + /* + * Setting strict mode through prctl always ignored filter, + * so make sure it is always NULL here to pass the internal + * check in do_seccomp(). + */ + uargs = NULL; break; -#ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - ret = seccomp_attach_user_filter(filter); - if (ret) - goto out; + op = SECCOMP_SET_MODE_FILTER; + uargs = filter; break; -#endif default: - goto out; + return -EINVAL; } - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -out: - return ret; + /* prctl interface doesn't have flags, so they are always zero. */ + return do_seccomp(op, 0, uargs); } diff --git a/kernel/sys.c b/kernel/sys.c index 65d3e55bd282..0b08c9f000f3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2427,12 +2427,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 1 : 0; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..7e7fc0a082c4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -209,3 +209,6 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 859abdaac1ea..9aaa4e72cc1f 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -629,7 +629,7 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest) * There is no exception for unconfined as change_hat is not * available. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) return -EPERM; /* released below */ @@ -780,7 +780,7 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec, * no_new_privs is set because this aways results in a reduction * of permissions. */ - if (current->no_new_privs && !unconfined(profile)) { + if (task_no_new_privs(current) && !unconfined(profile)) { put_cred(cred); return -EPERM; }