linux/arch/s390/include/asm/percpu.h
Heiko Carstens fa8be59ce2 s390/percpu: Provide arch_raw_cpu_ptr()
Provide an s390 specific arch_raw_cpu_ptr() implementation which avoids the
detour over get_lowcore() to get the lowcore pointer. The inline assembly
is implemented with an alternative so that relocated lowcore (percpu offset
is at a different address) is handled correctly.

This turns code like this

  102f78:       a7 39 00 00             lghi    %r3,0
  102f7c:       e3 20 33 b8 00 08       ag      %r2,952(%r3)

which adds the percpu offset to register r2 into a single instruction

  102f7c:       e3 20 33 b8 00 08       ag      %r2,952(%r0)

and also avoids the need of a base register, thus reducing register
pressure.

With defconfig bloat-o-meter -t provides this result:

add/remove: 12/26 grow/shrink: 183/3391 up/down: 14880/-41950 (-27070)

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
2026-03-24 21:00:41 +01:00

196 lines
6.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ARCH_S390_PERCPU__
#define __ARCH_S390_PERCPU__
#include <linux/preempt.h>
#include <asm/cmpxchg.h>
#include <asm/march.h>
/*
* s390 uses its own implementation for per cpu data, the offset of
* the cpu local data area is cached in the cpu's lowcore memory.
*/
#define __my_cpu_offset get_lowcore()->percpu_offset
#define arch_raw_cpu_ptr(_ptr) \
({ \
unsigned long lc_percpu, tcp_ptr__; \
\
tcp_ptr__ = (__force unsigned long)(_ptr); \
lc_percpu = offsetof(struct lowcore, percpu_offset); \
asm_inline volatile( \
ALTERNATIVE("ag %[__ptr__],%[offzero](%%r0)\n", \
"ag %[__ptr__],%[offalt](%%r0)\n", \
ALT_FEATURE(MFEATURE_LOWCORE)) \
: [__ptr__] "+d" (tcp_ptr__) \
: [offzero] "i" (lc_percpu), \
[offalt] "i" (lc_percpu + LOWCORE_ALT_ADDRESS), \
"m" (((struct lowcore *)0)->percpu_offset) \
: "cc"); \
(TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \
})
/*
* We use a compare-and-swap loop since that uses less cpu cycles than
* disabling and enabling interrupts like the generic variant would do.
*/
#define arch_this_cpu_to_op_simple(pcp, val, op) \
({ \
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ old__, new__, prev__; \
pcp_op_T__ *ptr__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
prev__ = READ_ONCE(*ptr__); \
do { \
old__ = prev__; \
new__ = old__ op (val); \
prev__ = cmpxchg(ptr__, old__, new__); \
} while (prev__ != old__); \
preempt_enable_notrace(); \
new__; \
})
#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &)
#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &)
#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |)
#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |)
#ifndef MARCH_HAS_Z196_FEATURES
#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +)
#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &)
#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &)
#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |)
#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |)
#else /* MARCH_HAS_Z196_FEATURES */
#define arch_this_cpu_add(pcp, val, op1, op2, szcast) \
{ \
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
if (__builtin_constant_p(val__) && \
((szcast)val__ > -129) && ((szcast)val__ < 128)) { \
asm volatile( \
op2 " %[ptr__],%[val__]" \
: [ptr__] "+Q" (*ptr__) \
: [val__] "i" ((szcast)val__) \
: "cc"); \
} else { \
asm volatile( \
op1 " %[old__],%[val__],%[ptr__]" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
} \
preempt_enable_notrace(); \
}
#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int)
#define this_cpu_add_8(pcp, val) arch_this_cpu_add(pcp, val, "laag", "agsi", long)
#define arch_this_cpu_add_return(pcp, val, op) \
({ \
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
preempt_enable_notrace(); \
old__ + val__; \
})
#define this_cpu_add_return_4(pcp, val) arch_this_cpu_add_return(pcp, val, "laa")
#define this_cpu_add_return_8(pcp, val) arch_this_cpu_add_return(pcp, val, "laag")
#define arch_this_cpu_to_op(pcp, val, op) \
{ \
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ val__ = (val); \
pcp_op_T__ old__, *ptr__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
asm volatile( \
op " %[old__],%[val__],%[ptr__]" \
: [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \
: [val__] "d" (val__) \
: "cc"); \
preempt_enable_notrace(); \
}
#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan")
#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, "lang")
#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lao")
#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, "laog")
#endif /* MARCH_HAS_Z196_FEATURES */
#define arch_this_cpu_cmpxchg(pcp, oval, nval) \
({ \
typedef typeof(pcp) pcp_op_T__; \
pcp_op_T__ ret__; \
pcp_op_T__ *ptr__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = cmpxchg(ptr__, oval, nval); \
preempt_enable_notrace(); \
ret__; \
})
#define this_cpu_cmpxchg_1(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg64(pcp, o, n) this_cpu_cmpxchg_8(pcp, o, n)
#define this_cpu_cmpxchg128(pcp, oval, nval) \
({ \
typedef typeof(pcp) pcp_op_T__; \
u128 old__, new__, ret__; \
pcp_op_T__ *ptr__; \
old__ = oval; \
new__ = nval; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = cmpxchg128((void *)ptr__, old__, new__); \
preempt_enable_notrace(); \
ret__; \
})
#define arch_this_cpu_xchg(pcp, nval) \
({ \
typeof(pcp) *ptr__; \
typeof(pcp) ret__; \
preempt_disable_notrace(); \
ptr__ = raw_cpu_ptr(&(pcp)); \
ret__ = xchg(ptr__, nval); \
preempt_enable_notrace(); \
ret__; \
})
#define this_cpu_xchg_1(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#define this_cpu_xchg_2(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#define this_cpu_xchg_4(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#define this_cpu_xchg_8(pcp, nval) arch_this_cpu_xchg(pcp, nval)
#include <asm-generic/percpu.h>
#endif /* __ARCH_S390_PERCPU__ */