crypto: x86/aes-xts - optimize _compute_first_set_of_tweaks for AVX-512

Optimize the AVX-512 version of _compute_first_set_of_tweaks by using
vectorized shifts to compute the first vector of tweak blocks, and by
using byte-aligned shifts when multiplying by x^8.

AES-XTS performance on AMD Ryzen 9 9950X (Zen 5) improves by about 2%
for 4096-byte messages or 6% for 512-byte messages.  AES-XTS performance
on Intel Sapphire Rapids improves by about 1% for 4096-byte messages or
3% for 512-byte messages.  Code size decreases by 75 bytes which
outweighs the increase in rodata size of 16 bytes.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Eric Biggers 2025-04-04 21:09:30 -07:00 committed by Herbert Xu
parent bc23fe6dc1
commit 570ef50a15

View File

@ -100,6 +100,17 @@
// exists when there's a carry out of the low 64 bits of the tweak.
.quad 0x87, 1
// These are the shift amounts that are needed when multiplying by [x^0,
// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
//
// Each table stores one byte per 64-bit element and is loaded with
// 'vpmovzxbq', which zero-extends every byte to a qword.  Consecutive
// byte pairs hold the same amount for the low and high 64-bit halves of
// one 128-bit tweak block, so block n (n = 0..3) is shifted left by n and
// the bits that fall out of each half are recovered by shifting right by
// 64 - n.
//
// The right shifts by 64 are expected to zeroize the destination.
// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
.Lrshift_amounts:
.byte 64, 64, 63, 63, 62, 62, 61, 61
.Llshift_amounts:
.byte 0, 0, 1, 1, 2, 2, 3, 3
// This table contains constants for vpshufb and vpblendvb, used to
// handle variable byte shifts and blending during ciphertext stealing
// on CPUs that don't support AVX512-style masking.
@ -294,52 +305,75 @@
// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
//
// In the vectorized paths, a multiplication of every 128-bit tweak block by
// x^n is assembled from three terms that are XOR'd together:
//   1. each 64-bit half of the block shifted left by n;
//   2. the n bits shifted out of the low half, moved up into the high half;
//   3. the n bits shifted out of the high half, carryless-multiplied by the
//      low qword of GF_POLY (the reduction step).
//
// NOTE(review): as shown here, the body contains lines that look like both
// the pre-change and the post-change version of this macro: TWEAK0 and
// GF_POLY are loaded before the first .if and again inside each branch,
// '.if VL == 32' is immediately followed by '.elseif VL == 32', and a
// second '.else' follows the '.endif' after the vpxor group.  Confirm the
// exact conditional structure against the real file before relying on it.
.macro _compute_first_set_of_tweaks
// Load the 16-byte initial tweak and the GF polynomial constant.
vmovdqu (TWEAK), TWEAK0_XMM
_vbroadcast128 .Lgf_poly(%rip), GF_POLY
.if VL == 16
// With VL=16, multiplying by x serially is fastest.
vmovdqu (TWEAK), TWEAK0_XMM
vmovdqu .Lgf_poly(%rip), GF_POLY
// Each step derives the next tweak from the previous one; %xmm0 is passed
// as the middle argument (presumably scratch -- see _next_tweak).
_next_tweak TWEAK0, %xmm0, TWEAK1
_next_tweak TWEAK1, %xmm0, TWEAK2
_next_tweak TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
// Compute the second block of TWEAK0.
.elseif VL == 32
vmovdqu (TWEAK), TWEAK0_XMM
vbroadcasti128 .Lgf_poly(%rip), GF_POLY
// Compute the first vector of tweaks.
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
// Place the second tweak block in the upper 128-bit lane of TWEAK0.
vinserti128 $1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
// Compute the remaining blocks of TWEAK0.
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
_next_tweak %xmm1, %xmm0, %xmm2
_next_tweak %xmm2, %xmm0, %xmm3
// Insert blocks 1-3 into 128-bit lanes 1-3 of TWEAK0 (lane 0 already
// holds the initial tweak).
vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
.endif
// Compute TWEAK[1-3] from TWEAK0.
// Shift counts are parameterized by VL: TWEAKk is TWEAK0 advanced by
// k*VL/16 powers of x.  These right shifts isolate, per 64-bit half, the
// bits that the matching left shift will push out.
vpsrlq $64 - 1*VL/16, TWEAK0, V0
vpsrlq $64 - 2*VL/16, TWEAK0, V2
vpsrlq $64 - 3*VL/16, TWEAK0, V4
// Compute the next three vectors of tweaks:
// TWEAK1 = TWEAK0 * [x^2, x^2]
// TWEAK2 = TWEAK0 * [x^4, x^4]
// TWEAK3 = TWEAK0 * [x^6, x^6]
// Per 64-bit half, isolate the bits that the left shifts by 2, 4 and 6
// will push out.
vpsrlq $64 - 2, TWEAK0, V0
vpsrlq $64 - 4, TWEAK0, V2
vpsrlq $64 - 6, TWEAK0, V4
// Reduction term: imm 0x01 selects the high qword of V0/V2/V4 (the bits
// carried out of each block's high half) and the low qword of GF_POLY,
// and carryless-multiplies them within each 128-bit lane.
vpclmulqdq $0x01, GF_POLY, V0, V1
vpclmulqdq $0x01, GF_POLY, V2, V3
vpclmulqdq $0x01, GF_POLY, V4, V5
// Move the bits carried out of each low half up into the high-half
// position (byte shift by 8 within each 128-bit lane; low half becomes 0).
vpslldq $8, V0, V0
vpslldq $8, V2, V2
vpslldq $8, V4, V4
// Shift each 64-bit half left; the carries computed above are XOR'd in
// below.
vpsllq $1*VL/16, TWEAK0, TWEAK1
vpsllq $2*VL/16, TWEAK0, TWEAK2
vpsllq $3*VL/16, TWEAK0, TWEAK3
.if USE_AVX512
// 0x96 is the truth table of a three-input XOR:
// TWEAK1 ^= V0 ^ V1, TWEAK2 ^= V2 ^ V3, TWEAK3 ^= V4 ^ V5.
vpternlogd $0x96, V0, V1, TWEAK1
vpternlogd $0x96, V2, V3, TWEAK2
vpternlogd $0x96, V4, V5, TWEAK3
.else
// Shift each 64-bit half left by 2, 4 and 6.
vpsllq $2, TWEAK0, TWEAK1
vpsllq $4, TWEAK0, TWEAK2
vpsllq $6, TWEAK0, TWEAK3
// Fold in both carry terms using two-input XORs.
vpxor V0, TWEAK1, TWEAK1
vpxor V2, TWEAK2, TWEAK2
vpxor V4, TWEAK3, TWEAK3
vpxor V1, TWEAK1, TWEAK1
vpxor V3, TWEAK2, TWEAK2
vpxor V5, TWEAK3, TWEAK3
.endif
.else
// Replicate the 16-byte initial tweak and the GF polynomial constant into
// every 128-bit lane of the destination register.
vbroadcasti32x4 (TWEAK), TWEAK0
vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY
// Compute the first vector of tweaks:
// TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
// V4 = per-qword right-shift counts, zero-extended from bytes.
vpmovzxbq .Lrshift_amounts(%rip), V4
// V0 = bits carried out of each 64-bit half (all zero in the x^0 lane,
// where the count is 64).
vpsrlvq V4, TWEAK0, V0
// V1 = reduction term: high-half carries times the low qword of GF_POLY.
vpclmulqdq $0x01, GF_POLY, V0, V1
// V4 is reused for the per-qword left-shift counts; the right-shift
// counts have already been consumed by vpsrlvq above.
vpmovzxbq .Llshift_amounts(%rip), V4
// Move the low-half carries up into the high-half position.
vpslldq $8, V0, V0
vpsllvq V4, TWEAK0, TWEAK0
// Three-input XOR (0x96): TWEAK0 ^= V0 ^ V1.
vpternlogd $0x96, V0, V1, TWEAK0
// Compute the next three vectors of tweaks:
// TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
// TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
// TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
// x^8 only needs byte-aligned shifts, so optimize accordingly.
// For x^4 and x^12 the shifts are per 64-bit half.  For x^8 the whole
// 128-bit lane is shifted right by 7 bytes, which leaves the top byte of
// each block's high half in the high qword of V2 -- exactly the operand
// that the 0x01 carryless multiply below selects.
vpsrlq $64 - 4, TWEAK0, V0
vpsrldq $(64 - 8) / 8, TWEAK0, V2
vpsrlq $64 - 12, TWEAK0, V4
vpclmulqdq $0x01, GF_POLY, V0, V1
vpclmulqdq $0x01, GF_POLY, V2, V3
vpclmulqdq $0x01, GF_POLY, V4, V5
// No vpslldq for V2: the lane-wide byte shift used for TWEAK2 below
// already carries bits from the low half into the high half.
vpslldq $8, V0, V0
vpslldq $8, V4, V4
vpsllq $4, TWEAK0, TWEAK1
// Lane-wide left shift by one byte, i.e. by 8 bits across both halves.
vpslldq $8 / 8, TWEAK0, TWEAK2
vpsllq $12, TWEAK0, TWEAK3
vpternlogd $0x96, V0, V1, TWEAK1
// Only the reduction term V3 remains to be folded in for x^8.
vpxord V3, TWEAK2, TWEAK2
vpternlogd $0x96, V4, V5, TWEAK3
.endif
.endm