mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 10:33:41 +02:00
crypto: x86/aes-xts - optimize _compute_first_set_of_tweaks for AVX-512
Optimize the AVX-512 version of _compute_first_set_of_tweaks by using vectorized shifts to compute the first vector of tweak blocks, and by using byte-aligned shifts when multiplying by x^8.

AES-XTS performance on AMD Ryzen 9 9950X (Zen 5) improves by about 2% for 4096-byte messages or 6% for 512-byte messages. AES-XTS performance on Intel Sapphire Rapids improves by about 1% for 4096-byte messages or 3% for 512-byte messages.

Code size decreases by 75 bytes, which outweighs the increase in rodata size of 16 bytes.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
bc23fe6dc1
commit
570ef50a15
|
|
@ -100,6 +100,17 @@
|
|||
// exists when there's a carry out of the low 64 bits of the tweak.
|
||||
.quad 0x87, 1
|
||||
|
||||
// These are the shift amounts that are needed when multiplying by [x^0,
|
||||
// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
|
||||
//
|
||||
// The right shifts by 64 are expected to zeroize the destination.
|
||||
// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
|
||||
// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
|
||||
// Right-shift counts, one byte per 64-bit lane (the users load this table
// with vpmovzxbq, which widens each byte to a qword count for vpsrlvq).
// Both lanes of 128-bit block i (i = 0..3, i.e. the x^i multiplier) get
// the count 64 - i.
.Lrshift_amounts:
|
||||
.byte 64, 64, 63, 63, 62, 62, 61, 61
|
||||
// Left-shift counts in the same one-byte-per-64-bit-lane layout, consumed
// by vpsllvq: both lanes of 128-bit block i get the count i.
.Llshift_amounts:
|
||||
.byte 0, 0, 1, 1, 2, 2, 3, 3
|
||||
|
||||
// This table contains constants for vpshufb and vpblendvb, used to
|
||||
// handle variable byte shifts and blending during ciphertext stealing
|
||||
// on CPUs that don't support AVX512-style masking.
|
||||
|
|
@ -294,52 +305,75 @@
|
|||
// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
|
||||
// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
|
||||
.macro _compute_first_set_of_tweaks
|
||||
vmovdqu (TWEAK), TWEAK0_XMM
|
||||
_vbroadcast128 .Lgf_poly(%rip), GF_POLY
|
||||
.if VL == 16
|
||||
// With VL=16, multiplying by x serially is fastest.
|
||||
vmovdqu (TWEAK), TWEAK0_XMM
|
||||
vmovdqu .Lgf_poly(%rip), GF_POLY
|
||||
_next_tweak TWEAK0, %xmm0, TWEAK1
|
||||
_next_tweak TWEAK1, %xmm0, TWEAK2
|
||||
_next_tweak TWEAK2, %xmm0, TWEAK3
|
||||
.else
|
||||
.if VL == 32
|
||||
// Compute the second block of TWEAK0.
|
||||
.elseif VL == 32
|
||||
vmovdqu (TWEAK), TWEAK0_XMM
|
||||
vbroadcasti128 .Lgf_poly(%rip), GF_POLY
|
||||
|
||||
// Compute the first vector of tweaks.
|
||||
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
|
||||
vinserti128 $1, %xmm1, TWEAK0, TWEAK0
|
||||
.elseif VL == 64
|
||||
// Compute the remaining blocks of TWEAK0.
|
||||
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
|
||||
_next_tweak %xmm1, %xmm0, %xmm2
|
||||
_next_tweak %xmm2, %xmm0, %xmm3
|
||||
vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
|
||||
vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
|
||||
vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
|
||||
.endif
|
||||
// Compute TWEAK[1-3] from TWEAK0.
|
||||
vpsrlq $64 - 1*VL/16, TWEAK0, V0
|
||||
vpsrlq $64 - 2*VL/16, TWEAK0, V2
|
||||
vpsrlq $64 - 3*VL/16, TWEAK0, V4
|
||||
|
||||
// Compute the next three vectors of tweaks:
|
||||
// TWEAK1 = TWEAK0 * [x^2, x^2]
|
||||
// TWEAK2 = TWEAK0 * [x^4, x^4]
|
||||
// TWEAK3 = TWEAK0 * [x^6, x^6]
|
||||
vpsrlq $64 - 2, TWEAK0, V0
|
||||
vpsrlq $64 - 4, TWEAK0, V2
|
||||
vpsrlq $64 - 6, TWEAK0, V4
|
||||
vpclmulqdq $0x01, GF_POLY, V0, V1
|
||||
vpclmulqdq $0x01, GF_POLY, V2, V3
|
||||
vpclmulqdq $0x01, GF_POLY, V4, V5
|
||||
vpslldq $8, V0, V0
|
||||
vpslldq $8, V2, V2
|
||||
vpslldq $8, V4, V4
|
||||
vpsllq $1*VL/16, TWEAK0, TWEAK1
|
||||
vpsllq $2*VL/16, TWEAK0, TWEAK2
|
||||
vpsllq $3*VL/16, TWEAK0, TWEAK3
|
||||
.if USE_AVX512
|
||||
vpternlogd $0x96, V0, V1, TWEAK1
|
||||
vpternlogd $0x96, V2, V3, TWEAK2
|
||||
vpternlogd $0x96, V4, V5, TWEAK3
|
||||
.else
|
||||
vpsllq $2, TWEAK0, TWEAK1
|
||||
vpsllq $4, TWEAK0, TWEAK2
|
||||
vpsllq $6, TWEAK0, TWEAK3
|
||||
vpxor V0, TWEAK1, TWEAK1
|
||||
vpxor V2, TWEAK2, TWEAK2
|
||||
vpxor V4, TWEAK3, TWEAK3
|
||||
vpxor V1, TWEAK1, TWEAK1
|
||||
vpxor V3, TWEAK2, TWEAK2
|
||||
vpxor V5, TWEAK3, TWEAK3
|
||||
.endif
|
||||
.else
|
||||
vbroadcasti32x4 (TWEAK), TWEAK0
|
||||
vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY
|
||||
|
||||
// Compute the first vector of tweaks:
|
||||
// TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
|
||||
vpmovzxbq .Lrshift_amounts(%rip), V4
|
||||
vpsrlvq V4, TWEAK0, V0
|
||||
vpclmulqdq $0x01, GF_POLY, V0, V1
|
||||
vpmovzxbq .Llshift_amounts(%rip), V4
|
||||
vpslldq $8, V0, V0
|
||||
vpsllvq V4, TWEAK0, TWEAK0
|
||||
vpternlogd $0x96, V0, V1, TWEAK0
|
||||
|
||||
// Compute the next three vectors of tweaks:
|
||||
// TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
|
||||
// TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
|
||||
// TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
|
||||
// x^8 only needs byte-aligned shifts, so optimize accordingly.
|
||||
vpsrlq $64 - 4, TWEAK0, V0
|
||||
vpsrldq $(64 - 8) / 8, TWEAK0, V2
|
||||
vpsrlq $64 - 12, TWEAK0, V4
|
||||
vpclmulqdq $0x01, GF_POLY, V0, V1
|
||||
vpclmulqdq $0x01, GF_POLY, V2, V3
|
||||
vpclmulqdq $0x01, GF_POLY, V4, V5
|
||||
vpslldq $8, V0, V0
|
||||
vpslldq $8, V4, V4
|
||||
vpsllq $4, TWEAK0, TWEAK1
|
||||
vpslldq $8 / 8, TWEAK0, TWEAK2
|
||||
vpsllq $12, TWEAK0, TWEAK3
|
||||
vpternlogd $0x96, V0, V1, TWEAK1
|
||||
vpxord V3, TWEAK2, TWEAK2
|
||||
vpternlogd $0x96, V4, V5, TWEAK3
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user