mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 18:13:41 +02:00
crypto: x86/aes-gcm - reorder AVX512 precompute and aad_update functions
Now that the _aes_gcm_precompute macro is instantiated only once, replace it directly with a function definition.

Also, move aes_gcm_aad_update_vaes_avx512() to a different location in the file so that it's consistent with aes-gcm-vaes-avx2.S and also the BoringSSL port of this code.

No functional changes.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251002023117.37504-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
This commit is contained in:
parent
4b582e0fb3
commit
5213aefa9e
|
|
@ -268,7 +268,7 @@
|
|||
// The number of key powers initialized is NUM_H_POWERS, and they are stored in
|
||||
// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
|
||||
// powers themselves are also initialized.
|
||||
.macro _aes_gcm_precompute
|
||||
SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
|
||||
|
||||
// Function arguments
|
||||
.set KEY, %rdi
|
||||
|
|
@ -361,16 +361,16 @@
|
|||
// Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
|
||||
// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
|
||||
mov $3, %eax
|
||||
.Lprecompute_next\@:
|
||||
.Lprecompute_next:
|
||||
sub $64, POWERS_PTR
|
||||
_ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
|
||||
vmovdqu8 H_CUR, (POWERS_PTR)
|
||||
dec %eax
|
||||
jnz .Lprecompute_next\@
|
||||
jnz .Lprecompute_next
|
||||
|
||||
vzeroupper // This is needed after using ymm or zmm registers.
|
||||
RET
|
||||
.endm
|
||||
SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
|
||||
|
||||
// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
|
||||
// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
|
||||
|
|
@ -463,6 +463,94 @@
|
|||
.endif
|
||||
.endm
|
||||
|
||||
// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
|
||||
// u8 ghash_acc[16],
|
||||
// const u8 *aad, int aadlen);
|
||||
//
|
||||
// This function processes the AAD (Additional Authenticated Data) in GCM.
|
||||
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
|
||||
// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
|
||||
// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
|
||||
// must be a multiple of 16, except on the last call where it can be any length.
|
||||
// The caller must do any buffering needed to ensure this.
|
||||
//
|
||||
// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
|
||||
// Therefore, for AAD processing we currently only provide this implementation
|
||||
// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
|
||||
// keeps the code size down, and it enables some micro-optimizations, e.g. using
|
||||
// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
|
||||
// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
|
||||
// provide a version using 512-bit vectors, but that doesn't seem to be useful.
|
||||
SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
|
||||
|
||||
// Function arguments
|
||||
.set KEY, %rdi
|
||||
.set GHASH_ACC_PTR, %rsi
|
||||
.set AAD, %rdx
|
||||
.set AADLEN, %ecx
|
||||
.set AADLEN64, %rcx // Zero-extend AADLEN before using!
|
||||
|
||||
// Additional local variables.
|
||||
// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
|
||||
.set BSWAP_MASK, %ymm4
|
||||
.set GFPOLY, %ymm5
|
||||
.set GHASH_ACC, %ymm6
|
||||
.set GHASH_ACC_XMM, %xmm6
|
||||
.set H_POW1, %ymm7
|
||||
|
||||
// Load some constants.
|
||||
vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
|
||||
vbroadcasti128 .Lgfpoly(%rip), GFPOLY
|
||||
|
||||
// Load the GHASH accumulator.
|
||||
vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
|
||||
|
||||
// Update GHASH with 32 bytes of AAD at a time.
|
||||
//
|
||||
// Pre-subtracting 32 from AADLEN saves an instruction from the loop and
|
||||
// also ensures that at least one write always occurs to AADLEN,
|
||||
// zero-extending it and allowing AADLEN64 to be used later.
|
||||
sub $32, AADLEN
|
||||
jl .Laad_loop_1x_done
|
||||
vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1]
|
||||
.Laad_loop_1x:
|
||||
vmovdqu (AAD), %ymm0
|
||||
vpshufb BSWAP_MASK, %ymm0, %ymm0
|
||||
vpxor %ymm0, GHASH_ACC, GHASH_ACC
|
||||
_ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
|
||||
%ymm0, %ymm1, %ymm2
|
||||
vextracti128 $1, GHASH_ACC, %xmm0
|
||||
vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
|
||||
add $32, AAD
|
||||
sub $32, AADLEN
|
||||
jge .Laad_loop_1x
|
||||
.Laad_loop_1x_done:
|
||||
add $32, AADLEN
|
||||
jz .Laad_done
|
||||
|
||||
// Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
|
||||
mov $-1, %eax
|
||||
bzhi AADLEN, %eax, %eax
|
||||
kmovd %eax, %k1
|
||||
vmovdqu8 (AAD), %ymm0{%k1}{z}
|
||||
neg AADLEN64
|
||||
and $~15, AADLEN64 // -round_up(AADLEN, 16)
|
||||
vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
|
||||
vpshufb BSWAP_MASK, %ymm0, %ymm0
|
||||
vpxor %ymm0, GHASH_ACC, GHASH_ACC
|
||||
_ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
|
||||
%ymm0, %ymm1, %ymm2
|
||||
vextracti128 $1, GHASH_ACC, %xmm0
|
||||
vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
|
||||
|
||||
.Laad_done:
|
||||
// Store the updated GHASH accumulator back to memory.
|
||||
vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
|
||||
|
||||
vzeroupper // This is needed after using ymm or zmm registers.
|
||||
RET
|
||||
SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
|
||||
|
||||
// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
|
||||
// round key that has been broadcast to all 128-bit lanes of \round_key.
|
||||
.macro _vaesenc_4x round_key
|
||||
|
|
@ -1001,9 +1089,6 @@
|
|||
RET
|
||||
.endm
|
||||
|
||||
SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
|
||||
_aes_gcm_precompute
|
||||
SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
|
||||
SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
|
||||
_aes_gcm_update 1
|
||||
SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
|
||||
|
|
@ -1011,94 +1096,6 @@ SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
|
|||
_aes_gcm_update 0
|
||||
SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)
|
||||
|
||||
// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
|
||||
// u8 ghash_acc[16],
|
||||
// const u8 *aad, int aadlen);
|
||||
//
|
||||
// This function processes the AAD (Additional Authenticated Data) in GCM.
|
||||
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
|
||||
// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
|
||||
// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
|
||||
// must be a multiple of 16, except on the last call where it can be any length.
|
||||
// The caller must do any buffering needed to ensure this.
|
||||
//
|
||||
// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
|
||||
// Therefore, for AAD processing we currently only provide this implementation
|
||||
// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
|
||||
// keeps the code size down, and it enables some micro-optimizations, e.g. using
|
||||
// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
|
||||
// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
|
||||
// provide a version using 512-bit vectors, but that doesn't seem to be useful.
|
||||
SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
|
||||
|
||||
// Function arguments
|
||||
.set KEY, %rdi
|
||||
.set GHASH_ACC_PTR, %rsi
|
||||
.set AAD, %rdx
|
||||
.set AADLEN, %ecx
|
||||
.set AADLEN64, %rcx // Zero-extend AADLEN before using!
|
||||
|
||||
// Additional local variables.
|
||||
// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
|
||||
.set BSWAP_MASK, %ymm4
|
||||
.set GFPOLY, %ymm5
|
||||
.set GHASH_ACC, %ymm6
|
||||
.set GHASH_ACC_XMM, %xmm6
|
||||
.set H_POW1, %ymm7
|
||||
|
||||
// Load some constants.
|
||||
vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
|
||||
vbroadcasti128 .Lgfpoly(%rip), GFPOLY
|
||||
|
||||
// Load the GHASH accumulator.
|
||||
vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
|
||||
|
||||
// Update GHASH with 32 bytes of AAD at a time.
|
||||
//
|
||||
// Pre-subtracting 32 from AADLEN saves an instruction from the loop and
|
||||
// also ensures that at least one write always occurs to AADLEN,
|
||||
// zero-extending it and allowing AADLEN64 to be used later.
|
||||
sub $32, AADLEN
|
||||
jl .Laad_loop_1x_done
|
||||
vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1]
|
||||
.Laad_loop_1x:
|
||||
vmovdqu (AAD), %ymm0
|
||||
vpshufb BSWAP_MASK, %ymm0, %ymm0
|
||||
vpxor %ymm0, GHASH_ACC, GHASH_ACC
|
||||
_ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
|
||||
%ymm0, %ymm1, %ymm2
|
||||
vextracti128 $1, GHASH_ACC, %xmm0
|
||||
vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
|
||||
add $32, AAD
|
||||
sub $32, AADLEN
|
||||
jge .Laad_loop_1x
|
||||
.Laad_loop_1x_done:
|
||||
add $32, AADLEN
|
||||
jz .Laad_done
|
||||
|
||||
// Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
|
||||
mov $-1, %eax
|
||||
bzhi AADLEN, %eax, %eax
|
||||
kmovd %eax, %k1
|
||||
vmovdqu8 (AAD), %ymm0{%k1}{z}
|
||||
neg AADLEN64
|
||||
and $~15, AADLEN64 // -round_up(AADLEN, 16)
|
||||
vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
|
||||
vpshufb BSWAP_MASK, %ymm0, %ymm0
|
||||
vpxor %ymm0, GHASH_ACC, GHASH_ACC
|
||||
_ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
|
||||
%ymm0, %ymm1, %ymm2
|
||||
vextracti128 $1, GHASH_ACC, %xmm0
|
||||
vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
|
||||
|
||||
.Laad_done:
|
||||
// Store the updated GHASH accumulator back to memory.
|
||||
vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
|
||||
|
||||
vzeroupper // This is needed after using ymm or zmm registers.
|
||||
RET
|
||||
SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
|
||||
|
||||
SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
|
||||
_aes_gcm_final 1
|
||||
SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user