linux/kernel/rseq.c

// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field match
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/hrtimer.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/rseq.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
		if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
			stats.s_granted	+= data_race(per_cpu(rseq_stats.s_granted, cpu));
			stats.s_expired	+= data_race(per_cpu(rseq_stats.s_expired, cpu));
			stats.s_revoked	+= data_race(per_cpu(rseq_stats.s_revoked, cpu));
			stats.s_yielded	+= data_race(per_cpu(rseq_stats.s_yielded, cpu));
			stats.s_aborted	+= data_race(per_cpu(rseq_stats.s_aborted, cpu));
		}
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
		seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
		seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
		seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
		seq_printf(m, "syield: %16lu\n", stats.s_yielded);
		seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
	}
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
			    size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir);

static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
		rseq_slice_ext_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve has_rseq and user_irq state. The generic entry code clears
	 * user_irq on the way out, the non-generic entry architectures are not
	 * setting user_irq.
	 */
	const struct rseq_event evt_mask = {
		.has_rseq	= RSEQ_HAS_RSEQ_VERSION_MASK,
		.user_irq	= true,
	};
	struct task_struct *t = current;
	struct rseq_ids ids;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated or a signal is on the way,
	 * there is no point in doing any of the heavy lifting here
	 * on production kernels. In that case TIF_NOTIFY_RESUME
	 * was raised by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	ids.node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);

	/*
	 * Don't update IDs yet, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}

	/*
	 * In legacy mode, force the update of IDs before returning to user
	 * space to stay compatible.
	 */
	if (!rseq_v2(current))
		rseq_force_update();
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq __user *rseq = current->rseq.usrptr;

	/*
	 * If this fails, terminate it because this leaves the kernel in
	 * stupid state as exit to user space will try to fixup the ids
	 * again.
	 */
	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(0, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0, &rseq->node_id, efault);
		unsafe_put_user(0, &rseq->mm_cid, efault);
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
{
	u32 rseqfl = 0;
	u8 version = 1;

	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	/*
	 * Architectures, which use the generic IRQ entry code (at least) enable
	 * registrations with a size greater than the original v1 fixed sized
	 * @rseq_len, which has been validated already to utilize the optimized
	 * v2 ABI mode which also enables extended RSEQ features beyond MMCID.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE)
		version = 2;

	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) {
		if (rseq_slice_extension_enabled()) {
			rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
			if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)
				rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		}
	}

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		unsafe_put_user(rseqfl, &rseq->flags, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);

		/*
		 * All fields past mm_cid are only valid for non-legacy v2
		 * registrations.
		 */
		if (version > 1) {
			if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
	current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
#endif

	/*
	 * Ensure the cpu_id_start and cpu_id fields are updated before
	 * returning to user-space.
	 */
	current->rseq.event.has_rseq = version;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}

static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig)
{
	if (flags & ~RSEQ_FLAG_UNREGISTER)
		return -EINVAL;
	if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
		return -EINVAL;
	if (rseq_len != current->rseq.len)
		return -EINVAL;
	if (current->rseq.sig != sig)
		return -EPERM;
	if (!rseq_reset_ids())
		return -EFAULT;
	rseq_reset(current);
	return 0;
}

static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig)
{
	/*
	 * If rseq is already registered, check whether the provided address
	 * differs from the prior one.
	 */
	if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
		return -EINVAL;
	if (current->rseq.sig != sig)
		return -EPERM;
	/* Already registered. */
	return -EBUSY;
}

static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len)
{
	/*
	 * Ensure the provided rseq is properly aligned, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If
	 * rseq_len is the original rseq size, the required alignment is the
	 * original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE)
		return false;

	if (rseq_len == ORIG_RSEQ_SIZE)
		return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE);

	return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) &&
		rseq_len >= offsetof(struct rseq, end);
}

#define RSEQ_FLAGS_SUPPORTED	(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)

/*
 * sys_rseq - Register or unregister restartable sequences for the caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	if (flags & RSEQ_FLAG_UNREGISTER)
		return rseq_unregister(rseq, rseq_len, flags, sig);

	if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED))
		return -EINVAL;

	if (current->rseq.usrptr)
		return rseq_reregister(rseq, rseq_len, sig);

	if (!rseq_length_valid(rseq, rseq_len))
		return -EINVAL;

	return rseq_register(rseq, rseq_len, flags, sig);
}

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
struct slice_timer {
	struct hrtimer	timer;
	void		*cookie;
};

static const unsigned int rseq_slice_ext_nsecs_min =  5 * NSEC_PER_USEC;
static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
static DEFINE_PER_CPU(struct slice_timer, slice_timer);
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);

/*
 * When the timer expires and the task is still in user space, the return
 * from interrupt will revoke the grant and schedule. If the task already
 * entered the kernel via a syscall and the timer fires before the syscall
 * work was able to cancel it, then depending on the preemption model this
 * will either reschedule on return from interrupt or in the syscall work
 * below.
 */
static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
{
	struct slice_timer *st = container_of(tmr, struct slice_timer, timer);

	/*
	 * Validate that the task which armed the timer is still on the
	 * CPU. It could have been scheduled out without canceling the
	 * timer.
	 */
	if (st->cookie == current && current->rseq.slice.state.granted) {
		rseq_stat_inc(rseq_stats.s_expired);
		set_need_resched_current();
	}
	return HRTIMER_NORESTART;
}

bool __rseq_arm_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);
	struct task_struct *curr = current;

	lockdep_assert_irqs_disabled();

	/*
	 * This check prevents a task, which got a time slice extension
	 * granted, from exceeding the maximum scheduling latency when the
	 * grant expired before going out to user space. Don't bother to
	 * clear the grant here, it will be cleaned up automatically before
	 * going out to user space after being scheduled back in.
	 */
	if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
		set_need_resched_current();
		return true;
	}

	/*
	 * Store the task pointer as a cookie for comparison in the timer
	 * function. This is safe as the timer is CPU local and cannot be
	 * in the expiry function at this point.
	 */
	st->cookie = curr;
	hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
	/* Arm the syscall entry work */
	set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
	return false;
}

static void rseq_cancel_slice_extension_timer(void)
{
	struct slice_timer *st = this_cpu_ptr(&slice_timer);

	/*
	 * st->cookie can be safely read as preemption is disabled and the
	 * timer is CPU local.
	 *
	 * As this is most probably the first expiring timer, the cancel is
	 * expensive as it has to reprogram the hardware, but that's less
	 * expensive than going through a full hrtimer_interrupt() cycle
	 * for nothing.
	 *
	 * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
	 * local and once the hrtimer code disabled interrupts the timer
	 * callback cannot be running.
	 */
	if (st->cookie == current)
		hrtimer_try_to_cancel(&st->timer);
}

static inline void rseq_slice_set_need_resched(struct task_struct *curr)
{
	/*
	 * The interrupt guard is required to prevent inconsistent state in
	 * this case:
	 *
	 * set_tsk_need_resched()
	 * --> Interrupt
	 *       wakeup()
	 *        set_tsk_need_resched()
	 *	  set_preempt_need_resched()
	 *     schedule_on_return()
	 *        clear_tsk_need_resched()
	 *	  clear_preempt_need_resched()
	 * set_preempt_need_resched()		<- Inconsistent state
	 *
	 * This is safe vs. a remote set of TIF_NEED_RESCHED because that
	 * only sets the already set bit and does not create inconsistent
	 * state.
	 */
	scoped_guard(irq)
		set_need_resched_current();
}

static void rseq_slice_validate_ctrl(u32 expected)
{
	u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
	u32 uval;

	if (get_user(uval, sctrl) || uval != expected)
		force_sig(SIGSEGV);
}

/*
 * Invoked from syscall entry if a time slice extension was granted and the
 * kernel did not clear it before user space left the critical section.
 *
 * While the recommended way to relinquish the CPU side effect free is
 * rseq_slice_yield(2), any syscall within a granted slice terminates the
 * grant and immediately reschedules if required. This supports onion layer
 * applications, where the code requesting the grant cannot control the
 * code within the critical section.
 */
void rseq_syscall_enter_work(long syscall)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };

	clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);

	if (static_branch_unlikely(&rseq_debug_enabled))
		rseq_slice_validate_ctrl(ctrl.all);

	/*
	 * The kernel might have raced, revoked the grant and updated
	 * userspace, but kept the SLICE work set.
	 */
	if (!ctrl.granted)
		return;

	/*
	 * Required to stabilize the per CPU timer pointer and to make
	 * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
	 *
	 * Leaving the scope will reschedule on preemption models FULL,
	 * LAZY and RT if necessary.
	 */
	scoped_guard(preempt) {
		rseq_cancel_slice_extension_timer();
		/*
		 * Now that preemption is disabled, quickly check whether
		 * the task was already rescheduled before arriving here.
		 */
		if (!curr->rseq.event.sched_switch) {
			rseq_slice_set_need_resched(curr);

			if (syscall == __NR_rseq_slice_yield) {
				rseq_stat_inc(rseq_stats.s_yielded);
				/* Update the yielded state for syscall return */
				curr->rseq.slice.yielded = 1;
			} else {
				rseq_stat_inc(rseq_stats.s_aborted);
			}
		}
	}
	/* Reschedule on NONE/VOLUNTARY preemption models */
	cond_resched();

	/* Clear the grant in kernel state and user space */
	curr->rseq.slice.state.granted = false;
	if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
		force_sig(SIGSEGV);
}

int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
	switch (arg2) {
	case PR_RSEQ_SLICE_EXTENSION_GET:
		if (arg3)
			return -EINVAL;
		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;

	case PR_RSEQ_SLICE_EXTENSION_SET: {
		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);

		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
			return -EINVAL;
		if (!rseq_slice_extension_enabled())
			return -ENOTSUPP;
		if (!current->rseq.usrptr)
			return -ENXIO;
		if (!rseq_v2(current))
			return -ENOTSUPP;

		/* No change? */
		if (enable == !!current->rseq.slice.state.enabled)
			return 0;

		if (get_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		if (current->rseq.slice.state.enabled)
			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if ((rflags & valid) != valid)
			goto die;

		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
		if (enable)
			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;

		if (put_user(rflags, &current->rseq.usrptr->flags))
			goto die;

		current->rseq.slice.state.enabled = enable;
		return 0;
	}
	default:
		return -EINVAL;
	}
die:
	force_sig(SIGSEGV);
	return -EFAULT;
}

/**
 * sys_rseq_slice_yield - yield the current processor side effect free if a
 *			  task granted with a time slice extension is done with
 *			  the critical work before being forced out.
 *
 * Return: 1 if the task successfully yielded the CPU within the granted slice.
 *         0 if the slice extension was either never granted or was revoked by
 *	     going over the granted extension, using a syscall other than this one
 *	     or being scheduled out earlier due to a subsequent interrupt.
 *
 * The syscall does not schedule because the syscall entry work immediately
 * relinquishes the CPU and schedules if required.
 */
SYSCALL_DEFINE0(rseq_slice_yield)
{
	int yielded = !!current->rseq.slice.yielded;

	current->rseq.slice.yielded = 0;
	return yielded;
}

static int rseq_slice_ext_show(struct seq_file *m, void *p)
{
	seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
	return 0;
}

static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
				    size_t count, loff_t *ppos)
{
	unsigned int nsecs;

	if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
		return -EINVAL;

	if (nsecs < rseq_slice_ext_nsecs_min)
		return -ERANGE;

	if (nsecs > rseq_slice_ext_nsecs_max)
		return -ERANGE;

	rseq_slice_ext_nsecs = nsecs;

	return count;
}

static int rseq_slice_ext_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_slice_ext_show, inode->i_private);
}

static const struct file_operations slice_ext_ops = {
	.open		= rseq_slice_ext_open,
	.read		= seq_read,
	.write		= rseq_slice_ext_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void rseq_slice_ext_init(struct dentry *root_dir)
{
	debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
}

static int __init rseq_slice_cmdline(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return 0;

	if (!on)
		static_branch_disable(&rseq_slice_extension_key);
	return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);

static int __init rseq_slice_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
			      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
	}
	return 0;
}
device_initcall(rseq_slice_init);
#else
static void rseq_slice_ext_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */