mirror of
https://github.com/torvalds/linux.git
synced 2026-05-22 06:01:53 +02:00
crypto: x86/aes-gcm - tune better for AMD CPUs
Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and use them for more cached round keys. This improves performance by about 2% on AMD Zen 4 and Zen 5. Intel performance remains about the same. Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
3cae5a3c05
commit
95791ccd11
|
|
@ -88,7 +88,7 @@
|
|||
|
||||
// A shuffle mask that reflects the bytes of 16-byte blocks
|
||||
.Lbswap_mask:
|
||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||
|
||||
// This is the GHASH reducing polynomial without its constant term, i.e.
|
||||
// x^128 + x^7 + x^2 + x, represented using the backwards mapping
|
||||
|
|
@ -562,6 +562,32 @@
|
|||
vpxord RNDKEY0, V3, V3
|
||||
.endm
|
||||
|
||||
// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
|
||||
// data with the resulting keystream, and write the result to DST and
|
||||
// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
//
// Takes no macro arguments; it operates on the register aliases in scope:
//   reads:  V0-V3, RNDKEYLAST, and four vectors of source data at
//           0*VL(SRC) .. 3*VL(SRC)
//   writes: GHASHDATA[0-3], and four vectors at 0*VL(DST) .. 3*VL(DST)
// V0-V3 and RNDKEYLAST are only read here, and SRC and DST are not
// advanced, so any pointer update is left to the code invoking this macro.
|
||||
.macro _aesenclast_and_xor_4x
|
||||
// XOR the source data with the last round key, saving the result in
|
||||
// GHASHDATA[0-3]. This reduces latency by taking advantage of the
|
||||
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
// These XORs depend only on SRC and RNDKEYLAST, not on V0-V3.
|
||||
vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0
|
||||
vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1
|
||||
vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2
|
||||
vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3
|
||||

||||
// Do the last AES round. This handles the XOR with the source data
|
||||
// too, as per the optimization described above.
// Operand roles (AT&T order): the first operand, GHASHDATAn, supplies the
// (key ^ data) value used as the round key, the second, Vn, is the AES
// state, and the result overwrites GHASHDATAn.
|
||||
vaesenclast GHASHDATA0, V0, GHASHDATA0
|
||||
vaesenclast GHASHDATA1, V1, GHASHDATA1
|
||||
vaesenclast GHASHDATA2, V2, GHASHDATA2
|
||||
vaesenclast GHASHDATA3, V3, GHASHDATA3
|
||||

||||
// Store the en/decrypted data to DST.
// GHASHDATA[0-3] still hold the same data after the stores.
|
||||
vmovdqu8 GHASHDATA0, 0*VL(DST)
|
||||
vmovdqu8 GHASHDATA1, 1*VL(DST)
|
||||
vmovdqu8 GHASHDATA2, 2*VL(DST)
|
||||
vmovdqu8 GHASHDATA3, 3*VL(DST)
|
||||
.endm
|
||||
|
||||
// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
|
||||
// const u32 le_ctr[4], u8 ghash_acc[16],
|
||||
// const u8 *src, u8 *dst, int datalen);
|
||||
|
|
@ -640,7 +666,7 @@
|
|||
// LE_CTR contains the next set of little-endian counter blocks.
|
||||
.set LE_CTR, V12
|
||||
|
||||
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
|
||||
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
|
||||
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
|
||||
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
|
||||
.set RNDKEY0, V13
|
||||
|
|
@ -650,15 +676,10 @@
|
|||
.set RNDKEY_M7, V17
|
||||
.set RNDKEY_M6, V18
|
||||
.set RNDKEY_M5, V19
|
||||
|
||||
// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
|
||||
// the corresponding block of source data. This is useful because
|
||||
// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
|
||||
// be computed in parallel with the AES rounds.
|
||||
.set RNDKEYLAST0, V20
|
||||
.set RNDKEYLAST1, V21
|
||||
.set RNDKEYLAST2, V22
|
||||
.set RNDKEYLAST3, V23
|
||||
.set RNDKEY_M4, V20
|
||||
.set RNDKEY_M3, V21
|
||||
.set RNDKEY_M2, V22
|
||||
.set RNDKEY_M1, V23
|
||||
|
||||
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
|
||||
// cannot coincide with anything used for AES encryption, since for
|
||||
|
|
@ -748,18 +769,7 @@
|
|||
add $16, %rax
|
||||
cmp %rax, RNDKEYLAST_PTR
|
||||
jne 1b
|
||||
vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
|
||||
vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
|
||||
vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
|
||||
vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
|
||||
vaesenclast RNDKEYLAST0, V0, GHASHDATA0
|
||||
vaesenclast RNDKEYLAST1, V1, GHASHDATA1
|
||||
vaesenclast RNDKEYLAST2, V2, GHASHDATA2
|
||||
vaesenclast RNDKEYLAST3, V3, GHASHDATA3
|
||||
vmovdqu8 GHASHDATA0, 0*VL(DST)
|
||||
vmovdqu8 GHASHDATA1, 1*VL(DST)
|
||||
vmovdqu8 GHASHDATA2, 2*VL(DST)
|
||||
vmovdqu8 GHASHDATA3, 3*VL(DST)
|
||||
_aesenclast_and_xor_4x
|
||||
sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
|
||||
sub $-4*VL, DST
|
||||
add $-4*VL, DATALEN
|
||||
|
|
@ -767,7 +777,7 @@
|
|||
.endif
|
||||
|
||||
// Cache as many additional AES round keys as possible.
|
||||
.irp i, 9,8,7,6,5
|
||||
.irp i, 9,8,7,6,5,4,3,2,1
|
||||
vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
|
||||
.endr
|
||||
|
||||
|
|
@ -799,47 +809,14 @@
|
|||
_vaesenc_4x RNDKEY
|
||||
128:
|
||||
|
||||
// XOR the source data with the last round key, saving the result in
|
||||
// RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
|
||||
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
|
||||
.if \enc
|
||||
vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
|
||||
vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
|
||||
vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
|
||||
vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
|
||||
.else
|
||||
vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
|
||||
vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
|
||||
vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
|
||||
vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
|
||||
.endif
|
||||
|
||||
// Finish the AES encryption of the counter blocks in V0-V3, interleaved
|
||||
// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
|
||||
.irp i, 9,8,7,6,5
|
||||
.irp i, 9,8,7,6,5,4,3,2,1
|
||||
_ghash_step_4x (9 - \i)
|
||||
_vaesenc_4x RNDKEY_M\i
|
||||
_ghash_step_4x (9 - \i)
|
||||
.endr
|
||||
.irp i, 4,3,2,1
|
||||
vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
|
||||
_vaesenc_4x RNDKEY
|
||||
_ghash_step_4x (9 - \i)
|
||||
.endr
|
||||
_ghash_step_4x 9
|
||||
|
||||
// Do the last AES round. This handles the XOR with the source data
|
||||
// too, as per the optimization described above.
|
||||
vaesenclast RNDKEYLAST0, V0, GHASHDATA0
|
||||
vaesenclast RNDKEYLAST1, V1, GHASHDATA1
|
||||
vaesenclast RNDKEYLAST2, V2, GHASHDATA2
|
||||
vaesenclast RNDKEYLAST3, V3, GHASHDATA3
|
||||
|
||||
// Store the en/decrypted data to DST.
|
||||
vmovdqu8 GHASHDATA0, 0*VL(DST)
|
||||
vmovdqu8 GHASHDATA1, 1*VL(DST)
|
||||
vmovdqu8 GHASHDATA2, 2*VL(DST)
|
||||
vmovdqu8 GHASHDATA3, 3*VL(DST)
|
||||
|
||||
_aesenclast_and_xor_4x
|
||||
sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
|
||||
sub $-4*VL, DST
|
||||
add $-4*VL, DATALEN
|
||||
|
|
@ -940,7 +917,7 @@
|
|||
// GHASH. However, any such blocks are all-zeroes, and the values that
|
||||
// they're multiplied with are also all-zeroes. Therefore they just add
|
||||
// 0 * 0 = 0 to the final GHASH result, which makes no difference.
|
||||
vmovdqu8 (POWERS_PTR), H_POW1
|
||||
vmovdqu8 (POWERS_PTR), H_POW1
|
||||
.if \enc
|
||||
vmovdqu8 V0, V1{%k1}{z}
|
||||
.endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user