linux/tools/testing/selftests/bpf/progs/verifier_private_stack.c
Puranjay Mohan b1d6bd5462 bpf, arm64: Use ORR-based MOV for general-purpose registers
The A64_MOV macro unconditionally uses ADD Rd, Rn, #0 to implement
register moves. While functionally correct, this is not the canonical
encoding when both operands are general-purpose registers.

On AArch64, MOV has two aliases depending on the operand registers:
  - MOV <Xd|SP>, <Xn|SP>  →  ADD <Xd|SP>, <Xn|SP>, #0
  - MOV <Xd>, <Xn>        →  ORR <Xd>, XZR, <Xn>

The ADD form is required when the stack pointer is involved (as ORR
does not accept SP), while the ORR form is the preferred encoding for
general-purpose registers.

The ORR encoding is also measurably faster on modern microarchitectures.
A microbenchmark [1] comparing dependent chains of MOV (ORR) vs ADD #0
on an ARM Neoverse-V2 (72-core, 3.4 GHz) shows:

  === mov (ORR Xd, XZR, Xn) ===
  run1 cycles/op=0.749859456
  run2 cycles/op=0.749991250
  run3 cycles/op=0.749601847
  avg cycles/op=0.749817518

  === add0 (ADD Xd, Xn, #0) ===
  run1 cycles/op=1.004777689
  run2 cycles/op=1.004558266
  run3 cycles/op=1.004806559
  avg cycles/op=1.004714171

The ORR form completes in ~0.75 cycles/op vs ~1.00 cycles/op for ADD #0,
a ~25% improvement. This is likely because the CPU's register renaming
hardware can eliminate ORR-based moves, while ADD #0 must go through the
ALU pipeline.

Update A64_MOV to select the appropriate encoding at JIT time:
use ADD when either register is A64_SP, and ORR (via
aarch64_insn_gen_move_reg()) otherwise.

Update verifier_private_stack selftests to expect "mov x7, x0" instead
of "add x7, x0, #0x0" in the JITed instruction checks, matching the
new ORR-based encoding.

[1] https://github.com/puranjaymohan/scripts/blob/main/arm64/bench/run_mov_vs_add0.sh

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/r/20260225134339.2723288-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2026-03-03 08:43:10 -08:00

360 lines
7.8 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "bpf_experimental.h"
/*
 * From include/linux/filter.h.
 * Used below as the r10-relative byte offset of the store in
 * private_stack_nested_1().
 */
#define MAX_BPF_STACK 512
#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)
/*
 * Value type of the "array" map: a BPF timer followed by 256 bytes of
 * padding.  The timer is the first member; the async callback tests use
 * the pointer returned by bpf_map_lookup_elem() directly as a
 * struct bpf_timer pointer, so the member order must not change.
 */
struct elem {
struct bpf_timer t;
char pad[256];
};
/*
 * Single-entry array map (int key, struct elem value) that holds the
 * timer armed by the private_stack_async_callback_* tests.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct elem);
} array SEC(".maps");
/*
 * A kprobe program that stores 42 at r10 - 256 and returns 0.
 *
 * The __jited() patterns require the store to go through a separately
 * computed base register instead of the native stack pointer:
 *  - x86-64: r9 is loaded with an immediate and a %gs-relative value is
 *    added to it; the store lands at -0x100(%r9).
 *  - arm64: x27 is built from an immediate plus TPIDR_EL0/EL1 (read via
 *    x10), x25 is derived from x27, and the store lands at [x27].
 * NOTE(review): the %gs / TPIDR_ELx terms are presumably the per-CPU
 * offset -- confirm against the JIT implementations.
 */
SEC("kprobe")
__description("Private stack, single prog")
__success
__arch_x86_64
__jited(" movabsq $0x{{.*}}, %r9")
__jited(" addq %gs:{{.*}}, %r9")
__jited(" movl $0x2a, %edi")
__jited(" movq %rdi, -0x100(%r9)")
__arch_arm64
/* x25 and x27 are saved on entry and restored by the final ldp. */
__jited(" stp x25, x27, [sp, {{.*}}]!")
__jited(" mov x27, {{.*}}")
__jited(" movk x27, {{.*}}, lsl #16")
__jited(" movk x27, {{.*}}")
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited("...")
__jited(" ldp x25, x27, [sp], {{.*}}")
__naked void private_stack_single_prog(void)
{
asm volatile (" \
r1 = 42; \
*(u64 *)(r10 - 256) = r1; \
r0 = 0; \
exit; \
" ::: __clobber_all);
}
/*
 * A raw_tp program that stores 42 at r10 - 8 and returns 0.
 *
 * The expected JIT output adjusts the native stack pointer (rsp on
 * x86-64, sp on arm64), i.e. the ordinary stack is used rather than a
 * private one.
 * NOTE(review): presumably an 8-byte frame is too small to qualify for a
 * private stack -- confirm against the verifier's criteria.
 */
SEC("raw_tp")
__description("No private stack")
__success
__arch_x86_64
__jited(" subq $0x8, %rsp")
__arch_arm64
__jited(" mov x25, sp")
__jited(" sub sp, sp, #0x10")
__naked void no_private_stack_nested(void)
{
asm volatile (" \
r1 = 42; \
*(u64 *)(r10 - 8) = r1; \
r0 = 0; \
exit; \
" ::: __clobber_all);
}
/*
 * Subprogram shared by private_stack_nested_1() and loop_callback().
 * It stores 41 at r10 - 32 (giving it a 32-byte frame of its own), calls
 * bpf_get_smp_processor_id() and exits without touching r0 afterwards,
 * so its return value is whatever the helper left in r0.
 */
__used
__naked static void cumulative_stack_depth_subprog(void)
{
asm volatile (" \
r1 = 41; \
*(u64 *)(r10 - 32) = r1; \
call %[bpf_get_smp_processor_id]; \
exit; \
" :
: __imm(bpf_get_smp_processor_id)
: __clobber_all);
}
/*
 * A kprobe program that stores 42 at r10 - MAX_BPF_STACK (512) and then
 * calls cumulative_stack_depth_subprog(), which uses a further 32 bytes.
 * The combined depth of the call chain therefore exceeds MAX_BPF_STACK,
 * yet the program is expected to load (__success).
 *
 * Expected JIT output:
 *  - x86-64: the store is relative to r9 (-0x200(%r9)) and r9 is
 *    pushed/popped around the call to the subprogram.
 *  - arm64: the store is relative to x27 and is followed by a bl.
 */
SEC("kprobe")
__description("Private stack, subtree > MAX_BPF_STACK")
__success
__arch_x86_64
/* private stack fp for the main prog */
__jited(" movabsq $0x{{.*}}, %r9")
__jited(" addq %gs:{{.*}}, %r9")
__jited(" movl $0x2a, %edi")
__jited(" movq %rdi, -0x200(%r9)")
__jited(" pushq %r9")
__jited(" callq 0x{{.*}}")
__jited(" popq %r9")
__jited(" xorl %eax, %eax")
__arch_arm64
__jited(" stp x25, x27, [sp, {{.*}}]!")
__jited(" mov x27, {{.*}}")
__jited(" movk x27, {{.*}}, lsl #16")
__jited(" movk x27, {{.*}}")
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" bl {{.*}}")
__jited("...")
__jited(" ldp x25, x27, [sp], {{.*}}")
__naked void private_stack_nested_1(void)
{
asm volatile (" \
r1 = 42; \
*(u64 *)(r10 - %[max_bpf_stack]) = r1; \
call cumulative_stack_depth_subprog; \
r0 = 0; \
exit; \
" :
: __imm_const(max_bpf_stack, MAX_BPF_STACK)
: __clobber_all);
}
/*
 * Callback passed to bpf_loop() by private_stack_callback().
 * It calls bpf_get_prandom_u32(), stores 42 at r10 - 512, calls
 * cumulative_stack_depth_subprog() and returns 0.  The JITed form of
 * this function is what the "func #1" patterns of
 * private_stack_callback() are matched against.
 */
__naked __noinline __used
static unsigned long loop_callback(void)
{
asm volatile (" \
call %[bpf_get_prandom_u32]; \
r1 = 42; \
*(u64 *)(r10 - 512) = r1; \
call cumulative_stack_depth_subprog; \
r0 = 0; \
exit; \
" :
: __imm(bpf_get_prandom_u32)
: __clobber_common);
}
/*
 * A raw_tp program that runs loop_callback() through
 * bpf_loop(1, loop_callback, 0, 0) and returns 0.
 *
 * The JIT expectations are about the callback ("func #1"), not the main
 * program: its 512-byte frame must be addressed through r9 (x86-64) or
 * x27 (arm64), and on x86-64 r9 is pushed/popped around both calls made
 * by the callback.
 */
SEC("raw_tp")
__description("Private stack, callback")
__success
__arch_x86_64
/* for func loop_callback */
__jited("func #1")
__jited(" endbr64")
__jited(" nopl (%rax,%rax)")
__jited(" nopl (%rax)")
__jited(" pushq %rbp")
__jited(" movq %rsp, %rbp")
__jited(" endbr64")
__jited(" movabsq $0x{{.*}}, %r9")
__jited(" addq %gs:{{.*}}, %r9")
__jited(" pushq %r9")
__jited(" callq")
__jited(" popq %r9")
__jited(" movl $0x2a, %edi")
__jited(" movq %rdi, -0x200(%r9)")
__jited(" pushq %r9")
__jited(" callq")
__jited(" popq %r9")
__arch_arm64
__jited("func #1")
__jited("...")
__jited(" stp x25, x27, [sp, {{.*}}]!")
__jited(" mov x27, {{.*}}")
__jited(" movk x27, {{.*}}, lsl #16")
__jited(" movk x27, {{.*}}")
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" bl 0x{{.*}}")
/*
 * After each bl the result is copied from x0 to x7 with a plain
 * register move.  NOTE(review): x7 appears to hold BPF r0 (compare the
 * "mov x7, #0x0" below with "r0 = 0" in loop_callback) -- confirm.
 */
__jited(" mov x7, x0")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" bl 0x{{.*}}")
__jited(" mov x7, x0")
__jited(" mov x7, #0x0")
__jited(" ldp x25, x27, [sp], {{.*}}")
__naked void private_stack_callback(void)
{
asm volatile (" \
r1 = 1; \
r2 = %[loop_callback]; \
r3 = 0; \
r4 = 0; \
call %[bpf_loop]; \
r0 = 0; \
exit; \
" :
: __imm_ptr(loop_callback),
__imm(bpf_loop)
: __clobber_common);
}
/*
 * An fentry program that stores 42 at r10 - 512 and then calls
 * bpf_throw(0) directly from the main program.  The run is expected to
 * succeed with return value 0.
 *
 * Expected JIT output:
 *  - x86-64: r9 is pushed/popped around a call.
 *  - arm64: the prologue saves x29/x30 and x19-x28 pairwise before x27 is
 *    set up as the base for the store, and the call result is moved from
 *    x0 to x7 after the bl.
 */
SEC("fentry/bpf_fentry_test9")
__description("Private stack, exception in main prog")
__success __retval(0)
__arch_x86_64
__jited(" pushq %r9")
__jited(" callq")
__jited(" popq %r9")
__arch_arm64
__jited(" stp x29, x30, [sp, #-0x10]!")
__jited(" mov x29, sp")
__jited(" stp xzr, x26, [sp, #-0x10]!")
__jited(" mov x26, sp")
__jited(" stp x19, x20, [sp, #-0x10]!")
__jited(" stp x21, x22, [sp, #-0x10]!")
__jited(" stp x23, x24, [sp, #-0x10]!")
__jited(" stp x25, x26, [sp, #-0x10]!")
__jited(" stp x27, x28, [sp, #-0x10]!")
__jited(" mov x27, {{.*}}")
__jited(" movk x27, {{.*}}, lsl #16")
__jited(" movk x27, {{.*}}")
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" mov x0, #0x0")
__jited(" bl 0x{{.*}}")
__jited(" mov x7, x0")
__jited(" ldp x27, x28, [sp], #0x10")
int private_stack_exception_main_prog(void)
{
/* Touch the deepest slot so the program has a 512-byte frame. */
asm volatile (" \
r1 = 42; \
*(u64 *)(r10 - 512) = r1; \
" ::: __clobber_common);
/* NOTE(review): bpf_throw() presumably does not return here -- confirm. */
bpf_throw(0);
return 0;
}
/*
 * Subprogram that raises a BPF exception via bpf_throw(0).  It is called
 * by name from the inline asm in private_stack_exception_sub_prog(),
 * hence __used to keep it from being discarded.
 */
__used static int subprog_exception(void)
{
bpf_throw(0);
return 0;
}
/*
 * An fentry program that stores 42 at r10 - 512 and then calls
 * subprog_exception(), so the exception is thrown from a subprogram
 * rather than from the main program.  The run is expected to succeed
 * with return value 0.
 *
 * Expected JIT output:
 *  - x86-64: the store is relative to r9 (-0x200(%r9)) and r9 is
 *    pushed/popped around the call.
 *  - arm64: the store is relative to x27, followed by a bl whose result
 *    is moved from x0 to x7.
 */
SEC("fentry/bpf_fentry_test9")
__description("Private stack, exception in subprog")
__success __retval(0)
__arch_x86_64
__jited(" movq %rdi, -0x200(%r9)")
__jited(" pushq %r9")
__jited(" callq")
__jited(" popq %r9")
__arch_arm64
__jited(" stp x27, x28, [sp, #-0x10]!")
__jited(" mov x27, {{.*}}")
__jited(" movk x27, {{.*}}, lsl #16")
__jited(" movk x27, {{.*}}")
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" bl 0x{{.*}}")
__jited(" mov x7, x0")
__jited(" ldp x27, x28, [sp], #0x10")
int private_stack_exception_sub_prog(void)
{
asm volatile (" \
r1 = 42; \
*(u64 *)(r10 - 512) = r1; \
call subprog_exception; \
" ::: __clobber_common);
return 0;
}
/* Global accumulator updated by subprog2(). */
int glob;
/*
 * Add twice the first element of @val to the global "glob".
 * Kept out of line (__noinline) so it remains a separate subprogram.
 */
__noinline static void subprog2(int *val)
{
glob += val[0] * 2;
}
/*
 * Copy *@val into the first slot of a zero-initialised 64-int local
 * array and hand that array to subprog2().
 *
 * NOTE(review): the array is presumably here to give this subprogram a
 * 256-byte stack frame, matching the 0x100 stack adjustment expected by
 * private_stack_async_callback_2() -- confirm.
 */
__noinline static void subprog1(int *val)
{
int tmp[64] = {};
tmp[0] = *val;
subprog2(tmp);
}
/*
 * Timer callback used by private_stack_async_callback_2().
 * It calls subprog1() with @key, so subprog1() is reachable from the
 * timer callback as well as from the main program.  @map and @timer are
 * unused.  Always returns 0.
 */
__noinline static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
{
subprog1(key);
return 0;
}
/*
 * Timer callback used by private_stack_async_callback_1().
 * It does nothing and, unlike timer_cb1(), does not call subprog1().
 * All parameters are unused.  Always returns 0.
 */
__noinline static int timer_cb2(void *map, int *key, struct bpf_timer *timer)
{
return 0;
}
/*
 * An fentry program that arms a timer with timer_cb2() and then calls
 * subprog1() itself.  timer_cb2() does not call subprog1(), so
 * subprog1() is reached from the main program only ("not nested").
 *
 * The expected JIT output contains the separately computed stack base
 * (movabsq into r9 on x86-64; TPIDR_ELx added into x27 on arm64).
 * Returns 0, including when the map lookup fails.
 */
SEC("fentry/bpf_fentry_test9")
__description("Private stack, async callback, not nested")
__success __retval(0)
__arch_x86_64
__jited(" movabsq $0x{{.*}}, %r9")
__arch_arm64
__jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
int private_stack_async_callback_1(void)
{
struct bpf_timer *arr_timer;
int array_key = 0;
/*
 * The map value is a struct elem whose first member is the timer, so
 * the looked-up value pointer is used as the timer pointer.
 */
arr_timer = bpf_map_lookup_elem(&array, &array_key);
if (!arr_timer)
return 0;
/* NOTE(review): the flags value 1 is presumably CLOCK_MONOTONIC -- confirm. */
bpf_timer_init(arr_timer, &array, 1);
bpf_timer_set_callback(arr_timer, timer_cb2);
bpf_timer_start(arr_timer, 0, 0);
subprog1(&array_key);
return 0;
}
/*
 * An fentry program that arms a timer with timer_cb1() and then calls
 * subprog1() itself.  timer_cb1() also calls subprog1(), so subprog1()
 * is reachable from both the main program and the timer callback
 * ("potential nesting").
 *
 * The expected JIT output allocates 0x100 bytes on the native stack
 * (rsp on x86-64, sp on arm64).
 * NOTE(review): that is presumably subprog1()'s frame being kept on the
 * ordinary stack instead of a private one -- confirm.
 * Returns 0, including when the map lookup fails.
 */
SEC("fentry/bpf_fentry_test9")
__description("Private stack, async callback, potential nesting")
__success __retval(0)
__arch_x86_64
__jited(" subq $0x100, %rsp")
__arch_arm64
__jited(" sub sp, sp, #0x100")
int private_stack_async_callback_2(void)
{
struct bpf_timer *arr_timer;
int array_key = 0;
/*
 * The map value is a struct elem whose first member is the timer, so
 * the looked-up value pointer is used as the timer pointer.
 */
arr_timer = bpf_map_lookup_elem(&array, &array_key);
if (!arr_timer)
return 0;
/* NOTE(review): the flags value 1 is presumably CLOCK_MONOTONIC -- confirm. */
bpf_timer_init(arr_timer, &array, 1);
bpf_timer_set_callback(arr_timer, timer_cb1);
bpf_timer_start(arr_timer, 0, 0);
subprog1(&array_key);
return 0;
}
#else
/*
 * Placeholder program built when the target is neither x86 nor arm64
 * (the #else branch of the architecture check above), so that the
 * object still contains one trivially passing test.  Returns 0.
 */
SEC("kprobe")
__description("private stack is not supported, use a dummy test")
__success
int dummy_test(void)
{
return 0;
}
#endif
/* License string placed in the "license" section of the object. */
char _license[] SEC("license") = "GPL";