Files
linux/include/linux/rseq.h
Thomas Gleixner 3db6b38dfe rseq: Switch to fast path processing on exit to user
Now that all bits and pieces are in place, hook the RSEQ handling fast path
function into exit_to_user_mode_prepare() after the TIF work bits have been
handled. In case of a fast path failure, TIF_NOTIFY_RESUME has been raised
and the caller needs to take another turn through the TIF handling slow
path.
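
To make that flow concrete, here is a minimal user-space model of the loop
(illustration only, not the kernel implementation; the function names are
stand-ins for the description above and the single boolean merely models
TIF_NOTIFY_RESUME):

  #include <stdbool.h>
  #include <stdio.h>

  static bool tif_notify_resume;          /* models TIF_NOTIFY_RESUME */
  static int fastpath_failures_left = 1;  /* force one failure to show the fallback */

  /* Stand-in for the rseq slow path invoked from the TIF work handling */
  static void rseq_slowpath(void)
  {
          printf("slow path: full rseq update\n");
  }

  /*
   * Stand-in for the rseq fast path which runs after the TIF work bits
   * have been handled. On failure it re-raises the modeled
   * TIF_NOTIFY_RESUME so the caller takes another turn through the TIF
   * handling slow path.
   */
  static bool rseq_fastpath(void)
  {
          if (fastpath_failures_left-- > 0) {
                  tif_notify_resume = true;
                  return false;
          }
          printf("fast path: rseq handled\n");
          return true;
  }

  /* Models exit_to_user_mode_prepare(): TIF work first, fast path last */
  static void exit_to_user_loop(void)
  {
          do {
                  if (tif_notify_resume) {
                          tif_notify_resume = false;
                          rseq_slowpath();
                  }
          } while (!rseq_fastpath());
  }

  int main(void)
  {
          tif_notify_resume = true;       /* as if a context switch raised it */
          exit_to_user_loop();
          return 0;
  }

Running the model prints two slow path passes and one fast path pass, i.e.
the forced failure costs exactly one extra turn through the TIF loop.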

This only works for architectures which use the generic entry code.
Architectures which still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

  Kernel build         Before              After              Reduction

  exit to user:      80692981            80514451
  signal checks:        32581                 121                  99%
  slowpath runs:      1201408   1.49%         198   0.00%         100%
  fastpath runs:                           675941   0.84%          N/A
  id updates:         1233989   1.53%       50541   0.06%          96%
  cs checks:          1125366   1.39%           0   0.00%         100%
    cs cleared:       1125366    100%           0                 100%
    cs fixup:               0      0%           0

  RSEQ selftests       Before              After              Reduction

  exit to user:     386281778           387373750
  signal checks:     35661203                   0                 100%
  slowpath runs:    140542396  36.38%          100   0.00%        100%
  fastpath runs:                           9509789   2.51%         N/A
  id updates:       176203599  45.62%      9087994   2.35%         95%
  cs checks:        175587856  45.46%      4728394   1.22%         98%
    cs cleared:     172359544  98.16%      1319307  27.90%         99%
    cs fixup:         3228312   1.84%      3409087  72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations; they are relative to the actual 'cs check' invocations.

While some of this could have been avoided in the original code, such as the
obvious clearing of CS when it is already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved that way. In some workloads the
RSEQ notify handler is invoked more than once before going out to user
space. Doing the work once, when everything has stabilized, is the only way
to avoid that.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads which do a lot of quick and short system
calls. Even though the fast path decision is only 4 instructions (including
a conditional branch), it adds up quickly and becomes measurable when the
rate of actually having to handle rseq is in the low single-digit percentage
range of user/kernel transitions.
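
For reference, the kind of check this refers to is roughly the following
sketch (field names are illustrative only, not the actual struct rseq_event
layout); it compiles down to little more than two loads, an '&' and one
conditional branch, yet as noted above even that shows up when executed on
every user/kernel transition:

  #include <stdbool.h>
  #include <stdio.h>

  /* Illustrative event bits; not the kernel's struct rseq_event layout */
  struct rseq_event_model {
          bool sched_switch;
          bool has_rseq;
  };

  /* The whole fast path decision: load the bits, '&' them, branch once */
  static bool rseq_needs_handling(const struct rseq_event_model *ev)
  {
          /* '&' instead of '&&' spares the second conditional branch */
          return ev->sched_switch & ev->has_rseq;
  }

  int main(void)
  {
          struct rseq_event_model ev = { .sched_switch = true, .has_rseq = true };

          printf("needs handling: %d\n", rseq_needs_handling(&ev));
          return 0;
  }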

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de
2025-11-04 08:34:39 +01:00

/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ

#include <linux/sched.h>
#include <uapi/linux/rseq.h>

void __rseq_handle_slowpath(struct pt_regs *regs);

/* Invoked from resume_user_mode_work() */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
        if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
                if (current->rseq.event.slowpath)
                        __rseq_handle_slowpath(regs);
        } else {
                /* '&' is intentional to spare one conditional branch */
                if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
                        __rseq_handle_slowpath(regs);
        }
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs);

/*
 * Invoked from signal delivery to fixup based on the register context before
 * switching to the signal delivery context.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                /* '&' is intentional to spare one conditional branch */
                if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
                        __rseq_signal_deliver(ksig->sig, regs);
        } else {
                if (current->rseq.event.has_rseq)
                        __rseq_signal_deliver(ksig->sig, regs);
        }
}

static inline void rseq_raise_notify_resume(struct task_struct *t)
{
        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

/* Invoked from context switch to force evaluation on exit to user */
static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
        struct rseq_event *ev = &t->rseq.event;

        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                /*
                 * Avoid a boat load of conditionals by using simple logic
                 * to determine whether NOTIFY_RESUME needs to be raised.
                 *
                 * It's required when the CPU or MM CID has changed or
                 * the entry was from user space.
                 */
                bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;

                if (raise) {
                        ev->sched_switch = true;
                        rseq_raise_notify_resume(t);
                }
        } else {
                if (ev->has_rseq) {
                        t->rseq.event.sched_switch = true;
                        rseq_raise_notify_resume(t);
                }
        }
}

/*
 * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
 * update.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
{
        t->rseq.event.ids_changed = true;
}

/*
 * Invoked from switch_mm_cid() in context switch when the task gets a MM
 * CID assigned.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
{
        /*
         * Requires a comparison as the switch_mm_cid() code does not
         * provide a conditional for it readily. So avoid excessive updates
         * when nothing changes.
         */
        if (t->rseq.ids.mm_cid != cid)
                t->rseq.event.ids_changed = true;
}

/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
        if (current->rseq.event.has_rseq) {
                current->rseq.event.ids_changed = true;
                current->rseq.event.sched_switch = true;
                rseq_raise_notify_resume(current);
        }
}

/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
 * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
 * that case just to do it eventually again before returning to user space,
 * the entry resume_user_mode_work() invocation is ignored as the register
 * argument is NULL.
 *
 * After returning from guest mode, they have to invoke this function to
 * re-raise TIF_NOTIFY_RESUME if necessary.
 */
static inline void rseq_virt_userspace_exit(void)
{
        if (current->rseq.event.sched_switch)
                rseq_raise_notify_resume(current);
}

static inline void rseq_reset(struct task_struct *t)
{
        memset(&t->rseq, 0, sizeof(t->rseq));
        t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}

static inline void rseq_execve(struct task_struct *t)
{
        rseq_reset(t);
}

/*
 * If parent process has a registered restartable sequences area, the
 * child inherits. Unregister rseq for a clone with CLONE_VM set.
 *
 * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
 * on the COW page on exit to user space, when the child stays on the same
 * CPU as the parent. That's obviously not guaranteed, but in overcommit
 * scenarios it is more likely and optimizes for the fork/exec case without
 * taking the fault.
 */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
        if (clone_flags & CLONE_VM)
                rseq_reset(t);
        else
                t->rseq = current->rseq;
}

#else /* CONFIG_RSEQ */
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
#endif /* !CONFIG_RSEQ */

#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else /* CONFIG_DEBUG_RSEQ */
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */

#endif /* _LINUX_RSEQ_H */