x86: move the XOR code to lib/raid/

Move the optimized XOR code out of line into lib/raid.

Link: https://lkml.kernel.org/r/20260327061704.3707577-20-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Sterba <dsterba@suse.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason A. Donenfeld <jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Li Nan <linan122@huawei.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Christoph Hellwig 2026-03-27 07:16:51 +01:00 committed by Andrew Morton
parent 95c104cc55
commit 77fd47e57a
6 changed files with 522 additions and 580 deletions

View File

@ -2,498 +2,42 @@
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H
/*
* Optimized RAID-5 checksumming functions for SSE.
*/
#include <asm/cpufeature.h>
#include <asm-generic/xor.h>
extern struct xor_block_template xor_block_pII_mmx;
extern struct xor_block_template xor_block_p5_mmx;
extern struct xor_block_template xor_block_sse;
extern struct xor_block_template xor_block_sse_pf64;
extern struct xor_block_template xor_block_avx;
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
/*
* Based on
* High-speed RAID5 checksumming functions utilizing SSE instructions.
* Copyright (C) 1998 Ingo Molnar.
*/
/*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
* When SSE is available, use it as it can write around L2. We may also be able
* to load into the L1 only depending on how the cpu deals with a load to a line
* that is being prefetched.
*
* This hasn't been optimized for the hammer yet, but there are likely
* no advantages to be gotten from x86-64 here anyways.
* When AVX2 is available, force using it as it is better by all measures.
*
* 32-bit without MMX can fall back to the generic routines.
*/
#include <asm/fpu/api.h>
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)
#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
#define arch_xor_init arch_xor_init
static __always_inline void __init arch_xor_init(void)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
LD(i, 0) \
LD(i + 1, 1) \
PF1(i) \
PF1(i + 2) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_OSXSAVE)) {
xor_force(&xor_block_avx);
} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
xor_register(&xor_block_sse);
xor_register(&xor_block_sse_pf64);
} else if (boot_cpu_has(X86_FEATURE_MMX)) {
xor_register(&xor_block_pII_mmx);
xor_register(&xor_block_p5_mmx);
} else {
xor_register(&xor_block_8regs);
xor_register(&xor_block_8regs_p);
xor_register(&xor_block_32regs);
xor_register(&xor_block_32regs_p);
}
}
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
PF4(i) \
PF4(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
XO4(i, 0) \
XO4(i + 1, 1) \
XO4(i + 2, 2) \
XO4(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(PF4, XO4, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static struct xor_block_template xor_block_sse_pf64 = {
.name = "prefetch64-sse",
.do_2 = xor_sse_2_pf64,
.do_3 = xor_sse_3_pf64,
.do_4 = xor_sse_4_pf64,
.do_5 = xor_sse_5_pf64,
};
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK
#undef XOR_CONSTANT_CONSTRAINT
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif
#endif /* _ASM_X86_XOR_H */

View File

@ -1,32 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H
static struct xor_block_template xor_block_sse = {
.name = "generic_sse",
.do_2 = xor_sse_2,
.do_3 = xor_sse_3,
.do_4 = xor_sse_4,
.do_5 = xor_sse_5,
};
/* Also try the AVX routines */
#include <asm/xor_avx.h>
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define arch_xor_init arch_xor_init
static __always_inline void __init arch_xor_init(void)
{
if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_OSXSAVE)) {
xor_force(&xor_block_avx);
} else {
xor_register(&xor_block_sse_pf64);
xor_register(&xor_block_sse);
}
}
#endif /* _ASM_X86_XOR_64_H */

View File

@ -21,6 +21,8 @@ xor-$(CONFIG_RISCV_ISA_V) += riscv/xor.o riscv/xor-glue.o
xor-$(CONFIG_SPARC32) += sparc/xor-sparc32.o
xor-$(CONFIG_SPARC64) += sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
xor-$(CONFIG_S390) += s390/xor.o
xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o
CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU)

View File

@ -1,18 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H
// SPDX-License-Identifier: GPL-2.0-only
/*
* Optimized RAID-5 checksumming functions for AVX
* Optimized XOR parity functions for AVX
*
* Copyright (C) 2012 Intel Corporation
* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
*/
#include <linux/compiler.h>
#include <linux/raid/xor_impl.h>
#include <asm/fpu/api.h>
#include <asm/xor.h>
#define BLOCK4(i) \
BLOCK(32 * i, 0) \
@ -158,12 +156,10 @@ do { \
kernel_fpu_end();
}
static struct xor_block_template xor_block_avx = {
struct xor_block_template xor_block_avx = {
.name = "avx",
.do_2 = xor_avx_2,
.do_3 = xor_avx_3,
.do_4 = xor_avx_4,
.do_5 = xor_avx_5,
};
#endif

View File

@ -1,15 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Optimized RAID-5 checksumming functions for MMX.
*/
/*
* High-speed RAID5 checksumming functions utilizing MMX instructions.
* Optimized XOR parity functions for MMX.
*
* Copyright (C) 1998 Ingo Molnar.
*/
#include <linux/raid/xor_impl.h>
#include <asm/fpu/api.h>
#include <asm/xor.h>
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
@ -18,8 +15,6 @@
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
#include <asm/fpu/api.h>
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
@ -519,7 +514,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
kernel_fpu_end();
}
static struct xor_block_template xor_block_pII_mmx = {
struct xor_block_template xor_block_pII_mmx = {
.name = "pII_mmx",
.do_2 = xor_pII_mmx_2,
.do_3 = xor_pII_mmx_3,
@ -527,49 +522,10 @@ static struct xor_block_template xor_block_pII_mmx = {
.do_5 = xor_pII_mmx_5,
};
static struct xor_block_template xor_block_p5_mmx = {
struct xor_block_template xor_block_p5_mmx = {
.name = "p5_mmx",
.do_2 = xor_p5_mmx_2,
.do_3 = xor_p5_mmx_3,
.do_4 = xor_p5_mmx_4,
.do_5 = xor_p5_mmx_5,
};
static struct xor_block_template xor_block_pIII_sse = {
.name = "pIII_sse",
.do_2 = xor_sse_2,
.do_3 = xor_sse_3,
.do_4 = xor_sse_4,
.do_5 = xor_sse_5,
};
/* Also try the AVX routines */
#include <asm/xor_avx.h>
/* Also try the generic routines. */
#include <asm-generic/xor.h>
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define arch_xor_init arch_xor_init
static __always_inline void __init arch_xor_init(void)
{
if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_OSXSAVE)) {
xor_force(&xor_block_avx);
} else if (boot_cpu_has(X86_FEATURE_XMM)) {
xor_register(&xor_block_pIII_sse);
xor_register(&xor_block_sse_pf64);
} else if (boot_cpu_has(X86_FEATURE_MMX)) {
xor_register(&xor_block_pII_mmx);
xor_register(&xor_block_p5_mmx);
} else {
xor_register(&xor_block_8regs);
xor_register(&xor_block_8regs_p);
xor_register(&xor_block_32regs);
xor_register(&xor_block_32regs_p);
}
}
#endif /* _ASM_X86_XOR_32_H */

476
lib/raid/xor/x86/xor-sse.c Normal file
View File

@ -0,0 +1,476 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Optimized XOR parity functions for SSE.
*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*
* Based on
* High-speed RAID5 checksumming functions utilizing SSE instructions.
* Copyright (C) 1998 Ingo Molnar.
*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/raid/xor_impl.h>
#include <asm/fpu/api.h>
#include <asm/xor.h>
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)
#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
LD(i, 0) \
LD(i + 1, 1) \
PF1(i) \
PF1(i + 2) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
PF1(i + 2) \
LD(i, 0) \
LD(i + 1, 1) \
LD(i + 2, 2) \
LD(i + 3, 3) \
PF2(i) \
PF2(i + 2) \
XO1(i, 0) \
XO1(i + 1, 1) \
XO1(i + 2, 2) \
XO1(i + 3, 3) \
PF3(i) \
PF3(i + 2) \
XO2(i, 0) \
XO2(i + 1, 1) \
XO2(i + 2, 2) \
XO2(i + 3, 3) \
PF4(i) \
PF4(i + 2) \
PF0(i + 4) \
PF0(i + 6) \
XO3(i, 0) \
XO3(i + 1, 1) \
XO3(i + 2, 2) \
XO3(i + 3, 3) \
XO4(i, 0) \
XO4(i + 1, 1) \
XO4(i + 2, 2) \
XO4(i + 3, 3) \
ST(i, 0) \
ST(i + 1, 1) \
ST(i + 2, 2) \
ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p2,
const unsigned long * __restrict p3,
const unsigned long * __restrict p4,
const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 8;
kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(PF4, XO4, i) \
BLK64(NOP, ST, i) \
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");
kernel_fpu_end();
}
struct xor_block_template xor_block_sse = {
.name = "sse",
.do_2 = xor_sse_2,
.do_3 = xor_sse_3,
.do_4 = xor_sse_4,
.do_5 = xor_sse_5,
};
struct xor_block_template xor_block_sse_pf64 = {
.name = "prefetch64-sse",
.do_2 = xor_sse_2_pf64,
.do_3 = xor_sse_3_pf64,
.do_4 = xor_sse_4_pf64,
.do_5 = xor_sse_5_pf64,
};