crypto: x86/aes-gcm - tune better for AMD CPUs

Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and use them for more cached round keys. This improves performance by about 2% on AMD Zen 4 and Zen 5. Intel performance remains about the same. Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2026-05-22 06:01:53 +02:00 · 2024-12-12 13:28:39 -08:00 · 2024-12-12 13:28:39 -08:00 · 95791ccd11
commit 95791ccd11
parent 3cae5a3c05
1 changed files with 38 additions and 61 deletions
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@ -88,7 +88,7 @@

 	// A shuffle mask that reflects the bytes of 16-byte blocks
 .Lbswap_mask:
-	.octa   0x000102030405060708090a0b0c0d0e0f
+	.octa	0x000102030405060708090a0b0c0d0e0f

 	// This is the GHASH reducing polynomial without its constant term, i.e.
 	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
@ -562,6 +562,32 @@
 	vpxord		RNDKEY0, V3, V3
 .endm

+// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+// data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
+.macro	_aesenclast_and_xor_4x
+	// XOR the source data with the last round key, saving the result in
+	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+	// Do the last AES round.  This handles the XOR with the source data
+	// too, as per the optimization described above.
+	vaesenclast	GHASHDATA0, V0, GHASHDATA0
+	vaesenclast	GHASHDATA1, V1, GHASHDATA1
+	vaesenclast	GHASHDATA2, V2, GHASHDATA2
+	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+
+	// Store the en/decrypted data to DST.
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+.endm
+
 // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
 //					  const u32 le_ctr[4], u8 ghash_acc[16],
 //					  const u8 *src, u8 *dst, int datalen);
@ -640,7 +666,7 @@
 	// LE_CTR contains the next set of little-endian counter blocks.
 	.set	LE_CTR,		V12

-	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
 	.set	RNDKEY0,	V13
@ -650,15 +676,10 @@
 	.set	RNDKEY_M7,	V17
 	.set	RNDKEY_M6,	V18
 	.set	RNDKEY_M5,	V19
-
-	// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
-	// the corresponding block of source data.  This is useful because
-	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
-	// be computed in parallel with the AES rounds.
-	.set	RNDKEYLAST0,	V20
-	.set	RNDKEYLAST1,	V21
-	.set	RNDKEYLAST2,	V22
-	.set	RNDKEYLAST3,	V23
+	.set	RNDKEY_M4,	V20
+	.set	RNDKEY_M3,	V21
+	.set	RNDKEY_M2,	V22
+	.set	RNDKEY_M1,	V23

 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
@ -748,18 +769,7 @@
 	add		$16, %rax
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
+	_aesenclast_and_xor_4x
 	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
 	sub		$-4*VL, DST
 	add		$-4*VL, DATALEN
@ -767,7 +777,7 @@
 .endif

 	// Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
 	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
 .endr

@ -799,47 +809,14 @@
 	_vaesenc_4x	RNDKEY
 128:

-	// XOR the source data with the last round key, saving the result in
-	// RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
-	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
-	vpxord		GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
-	vpxord		GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
-	vpxord		GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
-	vpxord		GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
 	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
 	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+	_ghash_step_4x  (9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
-	_ghash_step_4x	(9 - \i)
-.endr
-.irp i, 4,3,2,1
-	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
-	_vaesenc_4x	RNDKEY
-	_ghash_step_4x	(9 - \i)
 .endr
 	_ghash_step_4x	9
-
-	// Do the last AES round.  This handles the XOR with the source data
-	// too, as per the optimization described above.
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-
-	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-
+	_aesenclast_and_xor_4x
 	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
 	sub		$-4*VL, DST
 	add		$-4*VL, DATALEN
@ -940,7 +917,7 @@
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
-	vmovdqu8        (POWERS_PTR), H_POW1
+	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
 	vmovdqu8	V0, V1{%k1}{z}
 .endif