crypto: x86/aes-xts - optimize _compute_first_set_of_tweaks for AVX-512

Optimize the AVX-512 version of _compute_first_set_of_tweaks by using vectorized shifts to compute the first vector of tweak blocks, and by using byte-aligned shifts when multiplying by x^8. AES-XTS performance on AMD Ryzen 9 9950X (Zen 5) improves by about 2% for 4096-byte messages or 6% for 512-byte messages. AES-XTS performance on Intel Sapphire Rapids improves by about 1% for 4096-byte messages or 3% for 512-byte messages. Code size decreases by 75 bytes which outweighs the increase in rodata size of 16 bytes. Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2026-05-31 10:33:41 +02:00 · 2025-04-04 21:09:30 -07:00 · 2025-04-04 21:09:30 -07:00 · 570ef50a15
commit 570ef50a15
parent bc23fe6dc1
1 changed files with 62 additions and 28 deletions
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@ -100,6 +100,17 @@
 	// exists when there's a carry out of the low 64 bits of the tweak.
 	.quad	0x87, 1

+	// These are the shift amounts that are needed when multiplying by [x^0,
+	// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
+	//
+	// The right shifts by 64 are expected to zeroize the destination.
+	// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
+	// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
+.Lrshift_amounts:
+	.byte	64, 64, 63, 63, 62, 62, 61, 61
+.Llshift_amounts:
+	.byte	0, 0, 1, 1, 2, 2, 3, 3
+
 	// This table contains constants for vpshufb and vpblendvb, used to
 	// handle variable byte shifts and blending during ciphertext stealing
 	// on CPUs that don't support AVX512-style masking.
@ -294,52 +305,75 @@
 // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
 // store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
 .macro	_compute_first_set_of_tweaks
-	vmovdqu		(TWEAK), TWEAK0_XMM
-	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
 .if VL == 16
-	// With VL=16, multiplying by x serially is fastest.
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vmovdqu		.Lgf_poly(%rip), GF_POLY
 	_next_tweak	TWEAK0, %xmm0, TWEAK1
 	_next_tweak	TWEAK1, %xmm0, TWEAK2
 	_next_tweak	TWEAK2, %xmm0, TWEAK3
-.else
-.if VL == 32
-	// Compute the second block of TWEAK0.
+.elseif VL == 32
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vbroadcasti128	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks.
 	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
 	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
-.elseif VL == 64
-	// Compute the remaining blocks of TWEAK0.
-	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
-	_next_tweak	%xmm1, %xmm0, %xmm2
-	_next_tweak	%xmm2, %xmm0, %xmm3
-	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
-	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
-	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
-.endif
-	// Compute TWEAK[1-3] from TWEAK0.
-	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
-	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
-	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^2, x^2]
+	//	TWEAK2 = TWEAK0 * [x^4, x^4]
+	//	TWEAK3 = TWEAK0 * [x^6, x^6]
+	vpsrlq		$64 - 2, TWEAK0, V0
+	vpsrlq		$64 - 4, TWEAK0, V2
+	vpsrlq		$64 - 6, TWEAK0, V4
 	vpclmulqdq	$0x01, GF_POLY, V0, V1
 	vpclmulqdq	$0x01, GF_POLY, V2, V3
 	vpclmulqdq	$0x01, GF_POLY, V4, V5
 	vpslldq		$8, V0, V0
 	vpslldq		$8, V2, V2
 	vpslldq		$8, V4, V4
-	vpsllq		$1*VL/16, TWEAK0, TWEAK1
-	vpsllq		$2*VL/16, TWEAK0, TWEAK2
-	vpsllq		$3*VL/16, TWEAK0, TWEAK3
-.if USE_AVX512
-	vpternlogd	$0x96, V0, V1, TWEAK1
-	vpternlogd	$0x96, V2, V3, TWEAK2
-	vpternlogd	$0x96, V4, V5, TWEAK3
-.else
+	vpsllq		$2, TWEAK0, TWEAK1
+	vpsllq		$4, TWEAK0, TWEAK2
+	vpsllq		$6, TWEAK0, TWEAK3
 	vpxor		V0, TWEAK1, TWEAK1
 	vpxor		V2, TWEAK2, TWEAK2
 	vpxor		V4, TWEAK3, TWEAK3
 	vpxor		V1, TWEAK1, TWEAK1
 	vpxor		V3, TWEAK2, TWEAK2
 	vpxor		V5, TWEAK3, TWEAK3
-.endif
+.else
+	vbroadcasti32x4	(TWEAK), TWEAK0
+	vbroadcasti32x4	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks:
+	//	TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
+	vpmovzxbq	.Lrshift_amounts(%rip), V4
+	vpsrlvq		V4, TWEAK0, V0
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpmovzxbq	.Llshift_amounts(%rip), V4
+	vpslldq		$8, V0, V0
+	vpsllvq		V4, TWEAK0, TWEAK0
+	vpternlogd	$0x96, V0, V1, TWEAK0
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
+	//	TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
+	//	TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
+	// x^8 only needs byte-aligned shifts, so optimize accordingly.
+	vpsrlq		$64 - 4, TWEAK0, V0
+	vpsrldq		$(64 - 8) / 8, TWEAK0, V2
+	vpsrlq		$64 - 12, TWEAK0, V4
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpclmulqdq	$0x01, GF_POLY, V2, V3
+	vpclmulqdq	$0x01, GF_POLY, V4, V5
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V4, V4
+	vpsllq		$4, TWEAK0, TWEAK1
+	vpslldq		$8 / 8, TWEAK0, TWEAK2
+	vpsllq		$12, TWEAK0, TWEAK3
+	vpternlogd	$0x96, V0, V1, TWEAK1
+	vpxord		V3, TWEAK2, TWEAK2
+	vpternlogd	$0x96, V4, V5, TWEAK3
 .endif
 .endm