crypto: x86/aes-gcm - tune better for AMD CPUs

Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and
use them for more cached round keys.  This improves performance by about
2% on AMD Zen 4 and Zen 5.  Intel performance remains about the same.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Eric Biggers 2024-12-12 13:28:39 -08:00 committed by Herbert Xu
parent 3cae5a3c05
commit 95791ccd11

View File

@ -88,7 +88,7 @@
// A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
.octa 0x000102030405060708090a0b0c0d0e0f
.octa 0x000102030405060708090a0b0c0d0e0f
// This is the GHASH reducing polynomial without its constant term, i.e.
// x^128 + x^7 + x^2 + x, represented using the backwards mapping
@ -562,6 +562,32 @@
vpxord RNDKEY0, V3, V3
.endm
// Finish the AES encryption of the four vectors of counter blocks V0-V3 and
// apply the resulting keystream to four vectors of source data read from SRC.
// The en/decrypted data is written to DST and is also left in GHASHDATA[0-3].
// (The keystream is never materialized separately; the XOR with the source
// data is folded into the last AES round, which has the same effect.)
.macro _aesenclast_and_xor_4x
	// Pre-combine each vector of source data with the last round key,
	// leaving RNDKEYLAST ^ src in GHASHDATA[0-3].  This reduces latency
	// thanks to the identity
	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
.irp i, 0,1,2,3
	vpxord		\i*VL(SRC), RNDKEYLAST, GHASHDATA\i
.endr

	// Last AES round.  Using the "key ^ src" values computed above as the
	// round key means this also performs the XOR with the source data.
.irp i, 0,1,2,3
	vaesenclast	GHASHDATA\i, V\i, GHASHDATA\i
.endr

	// Write the four vectors of en/decrypted data to DST.
.irp i, 0,1,2,3
	vmovdqu8	GHASHDATA\i, \i*VL(DST)
.endr
.endm
// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
// const u32 le_ctr[4], u8 ghash_acc[16],
// const u8 *src, u8 *dst, int datalen);
@ -640,7 +666,7 @@
// LE_CTR contains the next set of little-endian counter blocks.
.set LE_CTR, V12
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
.set RNDKEY0, V13
@ -650,15 +676,10 @@
.set RNDKEY_M7, V17
.set RNDKEY_M6, V18
.set RNDKEY_M5, V19
// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
// the corresponding block of source data. This is useful because
// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
// be computed in parallel with the AES rounds.
.set RNDKEYLAST0, V20
.set RNDKEYLAST1, V21
.set RNDKEYLAST2, V22
.set RNDKEYLAST3, V23
.set RNDKEY_M4, V20
.set RNDKEY_M3, V21
.set RNDKEY_M2, V22
.set RNDKEY_M1, V23
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
// cannot coincide with anything used for AES encryption, since for
@ -748,18 +769,7 @@
add $16, %rax
cmp %rax, RNDKEYLAST_PTR
jne 1b
vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
vaesenclast RNDKEYLAST0, V0, GHASHDATA0
vaesenclast RNDKEYLAST1, V1, GHASHDATA1
vaesenclast RNDKEYLAST2, V2, GHASHDATA2
vaesenclast RNDKEYLAST3, V3, GHASHDATA3
vmovdqu8 GHASHDATA0, 0*VL(DST)
vmovdqu8 GHASHDATA1, 1*VL(DST)
vmovdqu8 GHASHDATA2, 2*VL(DST)
vmovdqu8 GHASHDATA3, 3*VL(DST)
_aesenclast_and_xor_4x
sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
sub $-4*VL, DST
add $-4*VL, DATALEN
@ -767,7 +777,7 @@
.endif
// Cache as many additional AES round keys as possible.
.irp i, 9,8,7,6,5
.irp i, 9,8,7,6,5,4,3,2,1
vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
.endr
@ -799,47 +809,14 @@
_vaesenc_4x RNDKEY
128:
// XOR the source data with the last round key, saving the result in
// RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
.if \enc
vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
.else
vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
.endif
// Finish the AES encryption of the counter blocks in V0-V3, interleaved
// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
.irp i, 9,8,7,6,5
.irp i, 9,8,7,6,5,4,3,2,1
_ghash_step_4x (9 - \i)
_vaesenc_4x RNDKEY_M\i
_ghash_step_4x (9 - \i)
.endr
.irp i, 4,3,2,1
vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
_vaesenc_4x RNDKEY
_ghash_step_4x (9 - \i)
.endr
_ghash_step_4x 9
// Do the last AES round. This handles the XOR with the source data
// too, as per the optimization described above.
vaesenclast RNDKEYLAST0, V0, GHASHDATA0
vaesenclast RNDKEYLAST1, V1, GHASHDATA1
vaesenclast RNDKEYLAST2, V2, GHASHDATA2
vaesenclast RNDKEYLAST3, V3, GHASHDATA3
// Store the en/decrypted data to DST.
vmovdqu8 GHASHDATA0, 0*VL(DST)
vmovdqu8 GHASHDATA1, 1*VL(DST)
vmovdqu8 GHASHDATA2, 2*VL(DST)
vmovdqu8 GHASHDATA3, 3*VL(DST)
_aesenclast_and_xor_4x
sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
sub $-4*VL, DST
add $-4*VL, DATALEN
@ -940,7 +917,7 @@
// GHASH. However, any such blocks are all-zeroes, and the values that
// they're multiplied with are also all-zeroes. Therefore they just add
// 0 * 0 = 0 to the final GHASH result, which makes no difference.
vmovdqu8 (POWERS_PTR), H_POW1
vmovdqu8 (POWERS_PTR), H_POW1
.if \enc
vmovdqu8 V0, V1{%k1}{z}
.endif