linux/lib/crypto/x86/sha256.h
AlanSong-oc 44b02a14d9 lib/crypto: x86/sha256: PHE Extensions optimized SHA256 transform function
Zhaoxin CPUs implement SHA (Secure Hash Algorithm) as CPU instructions
through the PHE (PadLock Hash Engine) extensions, which include the XSHA1,
XSHA256, XSHA384 and XSHA512 instructions. The instruction specification
is available at the following link.
(https://gitee.com/openzhaoxin/zhaoxin_specifications/blob/20260227/ZX_Padlock_Reference.pdf)

Implementing SHA in hardware instead of software lets applications
achieve higher performance, better security and more flexibility.

This patch includes the XSHA256 instruction optimized implementation of
SHA-256 transform function.

The table below shows the benchmark results before and after applying
this patch by using CRYPTO_LIB_BENCHMARK on Zhaoxin KX-7000 platform,
highlighting the achieved speedups.

+---------+--------------------------+
|         |          SHA256          |
+---------+--------+-----------------+
|   Len   | Before |      After      |
+---------+--------+-----------------+
|      1* |    2   |    7 (3.50x)    |
|     16  |   35   |  119 (3.40x)    |
|     64  |   74   |  280 (3.78x)    |
|    127  |   99   |  387 (3.91x)    |
|    128  |  103   |  427 (4.15x)    |
|    200  |  123   |  537 (4.37x)    |
|    256  |  128   |  582 (4.55x)    |
|    511  |  144   |  679 (4.72x)    |
|    512  |  146   |  714 (4.89x)    |
|   1024  |  157   |  796 (5.07x)    |
|   3173  |  167   |  883 (5.28x)    |
|   4096  |  166   |  876 (5.28x)    |
|  16384  |  169   |  899 (5.32x)    |
+---------+--------+-----------------+
*: The length of each data block to be processed by one complete SHA
   sequence.
**: The throughput of processing data blocks, in MB/s.

After applying this patch, the SHA256 KUnit test suite passes on Zhaoxin
platforms. Detailed test logs are shown below.

[    7.767257]     # Subtest: sha256
[    7.770542]     # module: sha256_kunit
[    7.770544]     1..15
[    7.777383]     ok 1 test_hash_test_vectors
[    7.788563]     ok 2 test_hash_all_lens_up_to_4096
[    7.806090]     ok 3 test_hash_incremental_updates
[    7.813553]     ok 4 test_hash_buffer_overruns
[    7.822384]     ok 5 test_hash_overlaps
[    7.829388]     ok 6 test_hash_alignment_consistency
[    7.833843]     ok 7 test_hash_ctx_zeroization
[    7.915191]     ok 8 test_hash_interrupt_context_1
[    8.362312]     ok 9 test_hash_interrupt_context_2
[    8.401607]     ok 10 test_hmac
[    8.415458]     ok 11 test_sha256_finup_2x
[    8.419397]     ok 12 test_sha256_finup_2x_defaultctx
[    8.424107]     ok 13 test_sha256_finup_2x_hugelen
[    8.451289]     # benchmark_hash: len=1: 7 MB/s
[    8.465372]     # benchmark_hash: len=16: 119 MB/s
[    8.481760]     # benchmark_hash: len=64: 280 MB/s
[    8.499344]     # benchmark_hash: len=127: 387 MB/s
[    8.515800]     # benchmark_hash: len=128: 427 MB/s
[    8.531970]     # benchmark_hash: len=200: 537 MB/s
[    8.548241]     # benchmark_hash: len=256: 582 MB/s
[    8.564838]     # benchmark_hash: len=511: 679 MB/s
[    8.580872]     # benchmark_hash: len=512: 714 MB/s
[    8.596858]     # benchmark_hash: len=1024: 796 MB/s
[    8.612567]     # benchmark_hash: len=3173: 883 MB/s
[    8.628546]     # benchmark_hash: len=4096: 876 MB/s
[    8.644482]     # benchmark_hash: len=16384: 899 MB/s
[    8.649773]     ok 14 benchmark_hash
[    8.655505]     ok 15 benchmark_sha256_finup_2x # SKIP not relevant
[    8.659065] # sha256: pass:14 fail:0 skip:1 total:15
[    8.665276] # Totals: pass:14 fail:0 skip:1 total:15
[    8.670195] ok 7 sha256

Signed-off-by: AlanSong-oc <AlanSong-oc@zhaoxin.com>
Link: https://lore.kernel.org/r/20260313080150.9393-3-AlanSong-oc@zhaoxin.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2026-03-14 11:44:18 -07:00

121 lines
4.2 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SHA-256 optimized for x86_64
*
* Copyright 2025 Google LLC
*/
#include <asm/fpu/api.h>
#include <linux/static_call.h>
/*
 * Static key, initially false.  sha256_mod_init_arch() enables it when the
 * boot CPU reports X86_FEATURE_SHA_NI; sha256_finup_2x_arch() tests it before
 * taking the two-message assembly path.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
/*
 * Static call through which sha256_blocks() dispatches.  It is defined with
 * sha256_blocks_generic() as its target and is retargeted by
 * sha256_mod_init_arch() when the CPU supports one of the implementations
 * in this file.
 */
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
/*
 * DEFINE_X86_SHA256_FN(c_fn, asm_fn) - wrap an assembly SHA-256 block routine
 *
 * Declares the assembly routine @asm_fn and defines a static C function @c_fn
 * with the same (state, data, nblocks) signature.  When irq_fpu_usable()
 * reports false, @c_fn hands the blocks to sha256_blocks_generic(); otherwise
 * it calls @asm_fn between kernel_fpu_begin() and kernel_fpu_end().
 */
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
	asmlinkage void asm_fn(struct sha256_block_state *state, \
			       const u8 *data, size_t nblocks); \
	static void c_fn(struct sha256_block_state *state, const u8 *data, \
			 size_t nblocks) \
	{ \
		/* Uncommon case: FPU/SIMD state may not be used here. */ \
		if (unlikely(!irq_fpu_usable())) { \
			sha256_blocks_generic(state, data, nblocks); \
			return; \
		} \
		kernel_fpu_begin(); \
		asm_fn(state, data, nblocks); \
		kernel_fpu_end(); \
	}
/*
 * Instantiate one C wrapper per assembly implementation.  Each line declares
 * the assembly routine named by the second argument and defines the static
 * wrapper named by the first; sha256_mod_init_arch() picks which wrapper, if
 * any, becomes the sha256_blocks_x86 target:
 *
 *   sha256_blocks_ssse3 -> sha256_transform_ssse3
 *   sha256_blocks_avx   -> sha256_transform_avx
 *   sha256_blocks_avx2  -> sha256_transform_rorx
 *   sha256_blocks_ni    -> sha256_ni_transform
 */
DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);
/* Alignment, in bytes, of the state buffer handed to XSHA256. */
#define PHE_ALIGNMENT 16

/*
 * Process @nblocks SHA-256 blocks starting at @data with the Zhaoxin PHE
 * XSHA256 instruction, updating @state in place.
 *
 * The state is staged through an aligned on-stack copy because, on Zhaoxin
 * processors, XSHA256 requires %rdi in 64-bit mode (%edi in 32-bit mode) to
 * point to a 32-byte buffer that is 16-byte aligned.
 */
static void sha256_blocks_phe(struct sha256_block_state *state,
			      const u8 *data, size_t nblocks)
{
	/* Over-allocate so that an aligned 32-byte window always fits. */
	u8 scratch[32 + PHE_ALIGNMENT - 1];
	u8 *aligned_state = PTR_ALIGN(&scratch[0], PHE_ALIGNMENT);
	/*
	 * Value placed in the "a" register operand for the instruction.
	 * NOTE(review): presumably all-ones tells the engine not to apply
	 * final message padding -- confirm against the PadLock reference.
	 */
	size_t rax_arg = -1;

	memcpy(aligned_state, state, SHA256_DIGEST_SIZE);

	/*
	 * Register operands (GCC x86 constraints): "a" = accumulator,
	 * "c" = count register (block count), "S" = source index (input
	 * data), "D" = destination index (aligned state buffer).  The first
	 * three are read-write; the "memory" clobber covers the state buffer
	 * that is accessed through the "D" pointer.
	 */
	asm volatile(".byte 0xf3,0x0f,0xa6,0xd0" /* REP XSHA256 */
		     : "+a"(rax_arg), "+c"(nblocks), "+S"(data)
		     : "D"(aligned_state)
		     : "memory");

	memcpy(state, aligned_state, SHA256_DIGEST_SIZE);
}
/*
 * Process @nblocks SHA-256 blocks from @data into @state by forwarding to
 * whichever implementation the sha256_blocks_x86 static call currently
 * targets.
 */
static void sha256_blocks(struct sha256_block_state *state,
const u8 *data, size_t nblocks)
{
static_call(sha256_blocks_x86)(state, data, nblocks);
}
/*
 * Pin the layout of struct __sha256_ctx at build time: state at offset 0,
 * bytecount at offset 32 and buf at offset 40.
 * NOTE(review): presumably sha256_ni_finup2x() reads these fields using
 * hard-coded offsets -- confirm against the assembly source.
 */
static_assert(offsetof(struct __sha256_ctx, state) == 0);
static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
static_assert(offsetof(struct __sha256_ctx, buf) == 40);
/*
 * Assembly routine called by sha256_finup_2x_arch(); its definition is not in
 * this file.  It takes a context, two data pointers that share one length,
 * and two SHA256_DIGEST_SIZE-byte output buffers.  Note that @len is an int
 * here, whereas the C caller receives a size_t.
 */
asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
const u8 *data1, const u8 *data2, int len,
u8 out1[SHA256_DIGEST_SIZE],
u8 out2[SHA256_DIGEST_SIZE]);
#define sha256_finup_2x_arch sha256_finup_2x_arch
/*
 * Try to finish two equal-length messages at once with the SHA-NI assembly.
 *
 * @ctx:   context shared by both messages
 * @data1: remaining data of the first message
 * @data2: remaining data of the second message
 * @len:   length in bytes of each of @data1 and @data2
 * @out1:  receives the digest of the first message
 * @out2:  receives the digest of the second message
 *
 * Return: true if both digests were produced here, false if the caller has to
 * compute them some other way.
 */
static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
				 const u8 *data1, const u8 *data2, size_t len,
				 u8 out1[SHA256_DIGEST_SIZE],
				 u8 out2[SHA256_DIGEST_SIZE])
{
	/* Only the SHA-NI implementation provides a two-message routine. */
	if (!static_branch_likely(&have_sha_ni))
		return false;

	/*
	 * The assembly needs SHA256_BLOCK_SIZE <= len <= INT_MAX.  The upper
	 * bound is tightened to 65536 so that preemption is not kept disabled
	 * for too long.  (In practice len is nearly always 4096 anyway.)
	 */
	if (len < SHA256_BLOCK_SIZE || len > 65536)
		return false;

	if (unlikely(!irq_fpu_usable()))
		return false;

	kernel_fpu_begin();
	sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
	kernel_fpu_end();

	/*
	 * Tell KMSAN that both output buffers are initialized.
	 * NOTE(review): presumably needed because KMSAN cannot see stores
	 * made by the assembly routine -- confirm.
	 */
	kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
	kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
	return true;
}
/*
 * Report whether the have_sha_ni static key is enabled, which is the
 * precondition for sha256_finup_2x_arch() to use its assembly path.
 */
static bool sha256_finup_2x_is_optimized_arch(void)
{
return static_key_enabled(&have_sha_ni);
}
#define sha256_mod_init_arch sha256_mod_init_arch
/*
 * Choose the SHA-256 block function for this CPU and install it as the
 * sha256_blocks_x86 static call target.
 *
 * Candidates are tried in this order, and the first match wins:
 *   1. SHA-NI (also enables the have_sha_ni static key)
 *   2. Zhaoxin PHE (CONFIG_CPU_SUP_ZHAOXIN, PHE enabled, family >= 7)
 *   3. AVX2 + BMI2, or plain AVX, when the SSE and YMM xfeatures are usable
 *   4. SSSE3
 * If nothing matches, the static call keeps its existing target.
 */
static void sha256_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
		static_call_update(sha256_blocks_x86, sha256_blocks_ni);
		static_branch_enable(&have_sha_ni);
		return;
	}

	if (IS_ENABLED(CONFIG_CPU_SUP_ZHAOXIN) &&
	    boot_cpu_has(X86_FEATURE_PHE_EN) &&
	    boot_cpu_data.x86 >= 0x07) {
		static_call_update(sha256_blocks_x86, sha256_blocks_phe);
		return;
	}

	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
	    boot_cpu_has(X86_FEATURE_AVX)) {
		if (boot_cpu_has(X86_FEATURE_AVX2) &&
		    boot_cpu_has(X86_FEATURE_BMI2))
			static_call_update(sha256_blocks_x86,
					   sha256_blocks_avx2);
		else
			static_call_update(sha256_blocks_x86,
					   sha256_blocks_avx);
		/* An AVX-capable CPU never falls through to the SSSE3 check. */
		return;
	}

	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
}