mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 02:24:24 +02:00
Zhaoxin CPUs implement SHA (Secure Hash Algorithm) as CPU instructions through the PHE (Padlock Hash Engine) extensions, which include the XSHA1, XSHA256, XSHA384 and XSHA512 instructions. The instruction specification is available at the following link: https://gitee.com/openzhaoxin/zhaoxin_specifications/blob/20260227/ZX_Padlock_Reference.pdf

With SHA implemented in hardware instead of software, applications can be developed with higher performance, more security and more flexibility.

This patch adds an implementation of the SHA-256 transform function that is optimized with the XSHA256 instruction.

The table below shows the benchmark results before and after applying this patch, measured with CRYPTO_LIB_BENCHMARK on the Zhaoxin KX-7000 platform, highlighting the achieved speedups.

+---------+--------------------------+
|         |         SHA256**         |
+---------+--------+-----------------+
|  Len*   | Before |      After      |
+---------+--------+-----------------+
|       1 |      2 |      7 (3.50x)   |
|      16 |     35 |    119 (3.40x)   |
|      64 |     74 |    280 (3.78x)   |
|     127 |     99 |    387 (3.91x)   |
|     128 |    103 |    427 (4.15x)   |
|     200 |    123 |    537 (4.37x)   |
|     256 |    128 |    582 (4.55x)   |
|     511 |    144 |    679 (4.72x)   |
|     512 |    146 |    714 (4.89x)   |
|    1024 |    157 |    796 (5.07x)   |
|    3173 |    167 |    883 (5.28x)   |
|    4096 |    166 |    876 (5.28x)   |
|   16384 |    169 |    899 (5.32x)   |
+---------+--------+-----------------+
*: The length of each data block to be processed by one complete SHA sequence.
**: The throughput of processing data blocks, in MB/s.

After applying this patch, the SHA256 KUnit test suite passes on Zhaoxin platforms. Detailed test logs are shown below.
[ 7.767257] # Subtest: sha256 [ 7.770542] # module: sha256_kunit [ 7.770544] 1..15 [ 7.777383] ok 1 test_hash_test_vectors [ 7.788563] ok 2 test_hash_all_lens_up_to_4096 [ 7.806090] ok 3 test_hash_incremental_updates [ 7.813553] ok 4 test_hash_buffer_overruns [ 7.822384] ok 5 test_hash_overlaps [ 7.829388] ok 6 test_hash_alignment_consistency [ 7.833843] ok 7 test_hash_ctx_zeroization [ 7.915191] ok 8 test_hash_interrupt_context_1 [ 8.362312] ok 9 test_hash_interrupt_context_2 [ 8.401607] ok 10 test_hmac [ 8.415458] ok 11 test_sha256_finup_2x [ 8.419397] ok 12 test_sha256_finup_2x_defaultctx [ 8.424107] ok 13 test_sha256_finup_2x_hugelen [ 8.451289] # benchmark_hash: len=1: 7 MB/s [ 8.465372] # benchmark_hash: len=16: 119 MB/s [ 8.481760] # benchmark_hash: len=64: 280 MB/s [ 8.499344] # benchmark_hash: len=127: 387 MB/s [ 8.515800] # benchmark_hash: len=128: 427 MB/s [ 8.531970] # benchmark_hash: len=200: 537 MB/s [ 8.548241] # benchmark_hash: len=256: 582 MB/s [ 8.564838] # benchmark_hash: len=511: 679 MB/s [ 8.580872] # benchmark_hash: len=512: 714 MB/s [ 8.596858] # benchmark_hash: len=1024: 796 MB/s [ 8.612567] # benchmark_hash: len=3173: 883 MB/s [ 8.628546] # benchmark_hash: len=4096: 876 MB/s [ 8.644482] # benchmark_hash: len=16384: 899 MB/s [ 8.649773] ok 14 benchmark_hash [ 8.655505] ok 15 benchmark_sha256_finup_2x # SKIP not relevant [ 8.659065] # sha256: pass:14 fail:0 skip:1 total:15 [ 8.665276] # Totals: pass:14 fail:0 skip:1 total:15 [ 8.670195] ok 7 sha256 Signed-off-by: AlanSong-oc <AlanSong-oc@zhaoxin.com> Link: https://lore.kernel.org/r/20260313080150.9393-3-AlanSong-oc@zhaoxin.com Signed-off-by: Eric Biggers <ebiggers@kernel.org>
121 lines
4.2 KiB
C
121 lines
4.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* SHA-256 optimized for x86_64
|
|
*
|
|
* Copyright 2025 Google LLC
|
|
*/
|
|
#include <asm/fpu/api.h>
|
|
#include <linux/static_call.h>
|
|
|
|
/*
 * Static key, initially false. sha256_mod_init_arch() enables it when the
 * boot CPU reports X86_FEATURE_SHA_NI; sha256_finup_2x_arch() and
 * sha256_finup_2x_is_optimized_arch() test it.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
|
|
|
|
/*
 * Static call used by sha256_blocks() to reach the block function. It
 * targets sha256_blocks_generic by default; sha256_mod_init_arch() may
 * retarget it to one of the CPU-specific implementations in this file.
 */
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
|
|
|
|
/*
 * DEFINE_X86_SHA256_FN(c_fn, asm_fn) - declare the assembly routine @asm_fn
 * and define a static C wrapper @c_fn taking the same three arguments.
 *
 * The wrapper runs @asm_fn between kernel_fpu_begin() and kernel_fpu_end()
 * when irq_fpu_usable() returns true.  When it returns false, the wrapper
 * passes the same arguments to sha256_blocks_generic() instead.
 */
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn)				       \
	asmlinkage void asm_fn(struct sha256_block_state *state,	       \
			       const u8 *data, size_t nblocks);		       \
	static void c_fn(struct sha256_block_state *state, const u8 *data,     \
			 size_t nblocks)				       \
	{								       \
		if (unlikely(!irq_fpu_usable())) {			       \
			sha256_blocks_generic(state, data, nblocks);	       \
			return;						       \
		}							       \
		kernel_fpu_begin();					       \
		asm_fn(state, data, nblocks);				       \
		kernel_fpu_end();					       \
	}
|
|
|
|
/*
 * Instantiate one wrapper per assembly implementation.  Each line declares
 * the assembly routine named by the second argument and defines the static
 * C function named by the first argument via DEFINE_X86_SHA256_FN().
 */
DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);
DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
|
|
|
|
#define PHE_ALIGNMENT 16
|
|
static void sha256_blocks_phe(struct sha256_block_state *state,
|
|
const u8 *data, size_t nblocks)
|
|
{
|
|
/*
|
|
* On Zhaoxin processors, XSHA256 requires the %rdi register
|
|
* in 64-bit mode (or %edi in 32-bit mode) to point to
|
|
* a 32-byte, 16-byte-aligned buffer.
|
|
*/
|
|
u8 buf[32 + PHE_ALIGNMENT - 1];
|
|
u8 *dst = PTR_ALIGN(&buf[0], PHE_ALIGNMENT);
|
|
size_t padding = -1;
|
|
|
|
memcpy(dst, state, SHA256_DIGEST_SIZE);
|
|
asm volatile(".byte 0xf3,0x0f,0xa6,0xd0" /* REP XSHA256 */
|
|
: "+a"(padding), "+c"(nblocks), "+S"(data)
|
|
: "D"(dst)
|
|
: "memory");
|
|
memcpy(state, dst, SHA256_DIGEST_SIZE);
|
|
}
|
|
|
|
static void sha256_blocks(struct sha256_block_state *state,
|
|
const u8 *data, size_t nblocks)
|
|
{
|
|
static_call(sha256_blocks_x86)(state, data, nblocks);
|
|
}
|
|
|
|
static_assert(offsetof(struct __sha256_ctx, state) == 0);
|
|
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
|
|
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
|
|
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
|
|
const u8 *data1, const u8 *data2, int len,
|
|
u8 out1[SHA256_DIGEST_SIZE],
|
|
u8 out2[SHA256_DIGEST_SIZE]);
|
|
|
|
#define sha256_finup_2x_arch sha256_finup_2x_arch
|
|
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
|
|
const u8 *data1, const u8 *data2, size_t len,
|
|
u8 out1[SHA256_DIGEST_SIZE],
|
|
u8 out2[SHA256_DIGEST_SIZE])
|
|
{
|
|
/*
|
|
* The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
|
|
* Further limit len to 65536 to avoid spending too long with preemption
|
|
* disabled. (Of course, in practice len is nearly always 4096 anyway.)
|
|
*/
|
|
if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
|
|
len <= 65536 && likely(irq_fpu_usable())) {
|
|
kernel_fpu_begin();
|
|
sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
|
|
kernel_fpu_end();
|
|
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
|
|
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static bool sha256_finup_2x_is_optimized_arch(void)
|
|
{
|
|
return static_key_enabled(&have_sha_ni);
|
|
}
|
|
|
|
#define sha256_mod_init_arch sha256_mod_init_arch
|
|
static void sha256_mod_init_arch(void)
|
|
{
|
|
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
|
|
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
|
|
static_branch_enable(&have_sha_ni);
|
|
} else if (IS_ENABLED(CONFIG_CPU_SUP_ZHAOXIN) &&
|
|
boot_cpu_has(X86_FEATURE_PHE_EN) &&
|
|
boot_cpu_data.x86 >= 0x07) {
|
|
static_call_update(sha256_blocks_x86, sha256_blocks_phe);
|
|
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
|
|
NULL) &&
|
|
boot_cpu_has(X86_FEATURE_AVX)) {
|
|
if (boot_cpu_has(X86_FEATURE_AVX2) &&
|
|
boot_cpu_has(X86_FEATURE_BMI2))
|
|
static_call_update(sha256_blocks_x86,
|
|
sha256_blocks_avx2);
|
|
else
|
|
static_call_update(sha256_blocks_x86,
|
|
sha256_blocks_avx);
|
|
} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
|
|
static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
|
|
}
|
|
}
|