crypto: x86/aes - drop the avx10_256 AES-XTS and AES-CTR code

Intel made a late change to the AVX10 specification that removes support
for a 256-bit maximum vector length and enumeration of the maximum
vector length.  AVX10 will imply a maximum vector length of 512 bits.
I.e. there won't be any such thing as AVX10/256 or AVX10/512; there will
just be AVX10, and it will essentially just consolidate AVX512 features.

As a result of this new development, my strategy of providing both
*_avx10_256 and *_avx10_512 functions didn't turn out to be that useful.
The only remaining motivation for the 256-bit AVX512 / AVX10 functions
is to avoid downclocking on older Intel CPUs.  But in the case of
AES-XTS and AES-CTR, I already wrote *_avx2 code too (primarily to
support CPUs without AVX512), which performs almost as well as
*_avx10_256.  So we should just use that.

Therefore, remove the *_avx10_256 AES-XTS and AES-CTR functions and
algorithms, and rename the *_avx10_512 AES-XTS and AES-CTR functions and
algorithms to *_avx512.  Make Ice Lake and Tiger Lake use *_avx2 instead
of *_avx10_256 which they previously used.

I've left AES-GCM unchanged for now.  There is no VAES+AVX2 optimized
AES-GCM in the kernel yet, so the path forward for that is not as clear.
However, I did write a VAES+AVX2 optimized AES-GCM for BoringSSL.  So
one option is to port that to the kernel and then do the same cleanup.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Eric Biggers 2025-04-01 17:24:01 -07:00 committed by Herbert Xu
parent 5ebc052d3b
commit 7d14fbc569
3 changed files with 74 additions and 121 deletions

View File

@ -48,8 +48,7 @@
// using the following sets of CPU features:
// - AES-NI && AVX
// - VAES && AVX2
// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
// - VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.
@ -76,7 +75,6 @@
.text
// Move a vector between memory and a register.
// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@ -86,7 +84,6 @@
.endm
// Move a vector between registers.
// The registers must be in the first 16 vector registers.
.macro _vmovdqa src, dst
.if VL < 64
vmovdqa \src, \dst
@ -96,7 +93,7 @@
.endm
// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register. The register operand must be in the first 16 vector registers.
// register.
.macro _vbroadcast128 src, dst
.if VL == 16
vmovdqu \src, \dst
@ -108,7 +105,6 @@
.endm
// XOR two vectors together.
// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
.if VL < 64
vpxor \src1, \src2, \dst
@ -199,8 +195,8 @@
// XOR each with the zero-th round key. Also update LE_CTR if !\final.
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
.if USE_AVX10
_vmovdqa LE_CTR, AESDATA\i0
.if USE_AVX512
vmovdqa64 LE_CTR, AESDATA\i0
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
.else
vpxor XCTR_IV, LE_CTR, AESDATA\i0
@ -208,7 +204,7 @@
.endif
vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
.if USE_AVX10
.if USE_AVX512
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
.else
vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
@ -481,18 +477,12 @@
.Lxor_tail_partial_vec_0\@:
// XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
// loads/stores are available; otherwise it's a bit harder...
.if USE_AVX10
.if VL <= 32
mov $-1, %eax
bzhi LEN, %eax, %eax
kmovd %eax, %k1
.else
.if USE_AVX512
mov $-1, %rax
bzhi LEN64, %rax, %rax
kmovq %rax, %k1
.endif
vmovdqu8 (SRC), AESDATA1{%k1}{z}
_vpxor AESDATA1, AESDATA0, AESDATA0
vpxord AESDATA1, AESDATA0, AESDATA0
vmovdqu8 AESDATA0, (DST){%k1}
.else
.if VL == 32
@ -554,7 +544,7 @@
// eliminates carries. |ctr| is the per-message block counter starting at 1.
.set VL, 16
.set USE_AVX10, 0
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
.set USE_AVX10, 0
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
.set VL, 32
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)
.set VL, 64
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
.set USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ

View File

@ -52,32 +52,25 @@
* different code, it uses a macro to generate several implementations that
* share similar source code but are targeted at different CPUs, listed below:
*
* AES-NI + AVX
* AES-NI && AVX
* - 128-bit vectors (1 AES block per vector)
* - VEX-coded instructions
* - xmm0-xmm15
* - This is for older CPUs that lack VAES but do have AVX.
*
* VAES + VPCLMULQDQ + AVX2
* VAES && VPCLMULQDQ && AVX2
* - 256-bit vectors (2 AES blocks per vector)
* - VEX-coded instructions
* - ymm0-ymm15
* - This is for CPUs that have VAES but lack AVX512 or AVX10,
* e.g. Intel's Alder Lake and AMD's Zen 3.
* - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
* Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
* registers (e.g. Intel's Ice Lake).
*
* VAES + VPCLMULQDQ + AVX10/256 + BMI2
* - 256-bit vectors (2 AES blocks per vector)
* VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
* - 512-bit vectors (4 AES blocks per vector)
* - EVEX-coded instructions
* - ymm0-ymm31
* - This is for CPUs that have AVX512 but where using zmm registers causes
* downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
* - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
* To avoid confusion with 512-bit, we just write AVX10/256.
*
* VAES + VPCLMULQDQ + AVX10/512 + BMI2
* - Same as the previous one, but upgrades to 512-bit vectors
* (4 AES blocks per vector) in zmm0-zmm31.
* - This is for CPUs that have good AVX512 or AVX10/512 support.
* - zmm0-zmm31
* - This is for CPUs that have good AVX512 support.
*
* This file doesn't have an implementation for AES-NI alone (without AVX), as
* the lack of VEX would make all the assembly code different.
@ -109,7 +102,7 @@
// This table contains constants for vpshufb and vpblendvb, used to
// handle variable byte shifts and blending during ciphertext stealing
// on CPUs that don't support AVX10-style masking.
// on CPUs that don't support AVX512-style masking.
.Lcts_permute_table:
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
@ -138,7 +131,7 @@
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
_define_Vi \i
.endr
.if USE_AVX10
.if USE_AVX512
.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
_define_Vi \i
.endr
@ -193,7 +186,7 @@
// keys to the *end* of this register range. I.e., AES-128 uses
// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
.if USE_AVX512
.set KEY1_XMM, %xmm16
.set KEY1, V16
.set KEY2_XMM, %xmm17
@ -227,7 +220,6 @@
.endm
// Move a vector between memory and a register.
// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@ -238,9 +230,9 @@
// Broadcast a 128-bit value into a vector.
.macro _vbroadcast128 src, dst
.if VL == 16 && !USE_AVX10
.if VL == 16
vmovdqu \src, \dst
.elseif VL == 32 && !USE_AVX10
.elseif VL == 32
vbroadcasti128 \src, \dst
.else
vbroadcasti32x4 \src, \dst
@ -248,7 +240,6 @@
.endm
// XOR two vectors together.
// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
.if VL < 64
vpxor \src1, \src2, \dst
@ -259,7 +250,7 @@
// XOR three vectors together.
.macro _xor3 src1, src2, src3_and_dst
.if USE_AVX10
.if USE_AVX512
// vpternlogd with immediate 0x96 is a three-argument XOR.
vpternlogd $0x96, \src1, \src2, \src3_and_dst
.else
@ -274,7 +265,7 @@
vpshufd $0x13, \src, \tmp
vpaddq \src, \src, \dst
vpsrad $31, \tmp, \tmp
.if USE_AVX10
.if USE_AVX512
vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst
.else
vpand GF_POLY_XMM, \tmp, \tmp
@ -337,7 +328,7 @@
vpsllq $1*VL/16, TWEAK0, TWEAK1
vpsllq $2*VL/16, TWEAK0, TWEAK2
vpsllq $3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
.if USE_AVX512
vpternlogd $0x96, V0, V1, TWEAK1
vpternlogd $0x96, V2, V3, TWEAK2
vpternlogd $0x96, V4, V5, TWEAK3
@ -474,26 +465,26 @@
lea OFFS-16(KEY, KEYLEN64, 4), KEY
// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
.if USE_AVX512
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
_vbroadcast128 -6*16(KEY), KEY1
_vbroadcast128 -5*16(KEY), KEY2
vbroadcasti32x4 -6*16(KEY), KEY1
vbroadcasti32x4 -5*16(KEY), KEY2
.Laes192\@:
_vbroadcast128 -4*16(KEY), KEY3
_vbroadcast128 -3*16(KEY), KEY4
vbroadcasti32x4 -4*16(KEY), KEY3
vbroadcasti32x4 -3*16(KEY), KEY4
.Laes128\@:
_vbroadcast128 -2*16(KEY), KEY5
_vbroadcast128 -1*16(KEY), KEY6
_vbroadcast128 0*16(KEY), KEY7
_vbroadcast128 1*16(KEY), KEY8
_vbroadcast128 2*16(KEY), KEY9
_vbroadcast128 3*16(KEY), KEY10
_vbroadcast128 4*16(KEY), KEY11
_vbroadcast128 5*16(KEY), KEY12
_vbroadcast128 6*16(KEY), KEY13
_vbroadcast128 7*16(KEY), KEY14
vbroadcasti32x4 -2*16(KEY), KEY5
vbroadcasti32x4 -1*16(KEY), KEY6
vbroadcasti32x4 0*16(KEY), KEY7
vbroadcasti32x4 1*16(KEY), KEY8
vbroadcasti32x4 2*16(KEY), KEY9
vbroadcasti32x4 3*16(KEY), KEY10
vbroadcasti32x4 4*16(KEY), KEY11
vbroadcasti32x4 5*16(KEY), KEY12
vbroadcasti32x4 6*16(KEY), KEY13
vbroadcasti32x4 7*16(KEY), KEY14
.endif
.endm
@ -521,7 +512,7 @@
// using the same key for all block(s). The round key is loaded from the
// appropriate register or memory location for round \i. May clobber \tmp.
.macro _vaes_1x enc, i, xmm_suffix, data, tmp
.if USE_AVX10
.if USE_AVX512
_vaes \enc, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
@ -538,7 +529,7 @@
// appropriate register or memory location for round \i. In addition, does two
// steps of the computation of the next set of tweaks. May clobber V4 and V5.
.macro _vaes_4x enc, i
.if USE_AVX10
.if USE_AVX512
_tweak_step (2*(\i-5))
_vaes \enc, KEY\i, V0
_vaes \enc, KEY\i, V1
@ -574,7 +565,7 @@
.irp i, 5,6,7,8,9,10,11,12,13
_vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp
.endr
.if USE_AVX10
.if USE_AVX512
vpxord KEY14\xmm_suffix, \tweak, \tmp
.else
.ifnb \xmm_suffix
@ -617,11 +608,11 @@
// This is the main loop, en/decrypting 4*VL bytes per iteration.
// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
_vmovdqu 0*VL(SRC), V0
_vmovdqu 1*VL(SRC), V1
_vmovdqu 2*VL(SRC), V2
_vmovdqu 3*VL(SRC), V3
.if USE_AVX512
vmovdqu8 0*VL(SRC), V0
vmovdqu8 1*VL(SRC), V1
vmovdqu8 2*VL(SRC), V2
vmovdqu8 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
@ -654,7 +645,7 @@
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
// (and likewise for vaesdeclast).
.if USE_AVX10
.if USE_AVX512
_tweak_step 18
_tweak_step 19
vpxord TWEAK0, KEY14, V4
@ -762,7 +753,7 @@
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
.endif
.if USE_AVX10
.if USE_AVX512
// Create a mask that has the first LEN bits set.
mov $-1, %r9d
bzhi LEN, %r9d, %r9d
@ -811,7 +802,7 @@
// u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes
// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
.set TWEAK_KEY, %rdi
.set IV, %rsi
@ -853,7 +844,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
// multiple of 16, then this function updates |tweak| to contain the next tweak.
.set VL, 16
.set USE_AVX10, 0
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
@ -863,7 +854,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
.set USE_AVX10, 0
.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
@ -871,21 +862,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
.set VL, 32
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
.set VL, 64
.set USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
.set USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */

View File

@ -844,8 +844,7 @@ simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)]
DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800);
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);
#endif
/* The common part of the x86_64 AES-GCM key struct */
@ -1592,11 +1591,6 @@ static int __init register_avx_algs(void)
XFEATURE_MASK_AVX512, NULL))
return 0;
err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256,
ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
simd_skcipher_algs_vaes_avx10_256);
if (err)
return err;
err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
aes_gcm_simdalgs_vaes_avx10_256);
@ -1606,15 +1600,15 @@ static int __init register_avx_algs(void)
if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
int i;
for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++)
skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1;
for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
}
err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512,
ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
simd_skcipher_algs_vaes_avx10_512);
err = simd_register_skciphers_compat(skcipher_algs_vaes_avx512,
ARRAY_SIZE(skcipher_algs_vaes_avx512),
simd_skcipher_algs_vaes_avx512);
if (err)
return err;
err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
@ -1641,18 +1635,14 @@ static void unregister_avx_algs(void)
simd_unregister_skciphers(skcipher_algs_vaes_avx2,
ARRAY_SIZE(skcipher_algs_vaes_avx2),
simd_skcipher_algs_vaes_avx2);
if (simd_skcipher_algs_vaes_avx10_256[0])
simd_unregister_skciphers(skcipher_algs_vaes_avx10_256,
ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
simd_skcipher_algs_vaes_avx10_256);
if (aes_gcm_simdalgs_vaes_avx10_256[0])
simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
aes_gcm_simdalgs_vaes_avx10_256);
if (simd_skcipher_algs_vaes_avx10_512[0])
simd_unregister_skciphers(skcipher_algs_vaes_avx10_512,
ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
simd_skcipher_algs_vaes_avx10_512);
if (simd_skcipher_algs_vaes_avx512[0])
simd_unregister_skciphers(skcipher_algs_vaes_avx512,
ARRAY_SIZE(skcipher_algs_vaes_avx512),
simd_skcipher_algs_vaes_avx512);
if (aes_gcm_simdalgs_vaes_avx10_512[0])
simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),