From ac431d597a9bdfc2ba6b314813f29a6ef2b4a3bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Jul 2025 16:30:50 +0200 Subject: [PATCH 01/11] libceph: define and enforce CEPH_MAX_KEY_LEN When decoding the key, verify that the key material would fit into a fixed-size buffer in process_auth_done() and generally has a sane length. The new CEPH_MAX_KEY_LEN check replaces the existing check for a key with no key material which is a) not universal since CEPH_CRYPTO_NONE has to be excluded and b) doesn't provide much value since a smaller than needed key is just as invalid as no key -- this has to be handled elsewhere anyway. Signed-off-by: Ilya Dryomov --- net/ceph/crypto.c | 8 +++++--- net/ceph/crypto.h | 2 +- net/ceph/messenger_v2.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 01b2ce1e8fc0..5601732cf4fa 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -37,9 +37,6 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) return -ENOTSUPP; } - if (!key->len) - return -EINVAL; - key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; @@ -83,6 +80,11 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) ceph_decode_copy(p, &key->created, sizeof(key->created)); key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); + if (key->len > CEPH_MAX_KEY_LEN) { + pr_err("secret too big %d\n", key->len); + return -EINVAL; + } + ret = set_secret(key, *p); memzero_explicit(*p, key->len); *p += key->len; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 23de29fc613c..a20bad6d1e96 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,7 +5,7 @@ #include #include -#define CEPH_KEY_LEN 16 +#define CEPH_MAX_KEY_LEN 16 #define CEPH_MAX_CON_SECRET_LEN 64 /* diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c9d50c0dcd33..31e042dc1b3f 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -2360,7 
+2360,7 @@ static int process_auth_reply_more(struct ceph_connection *con, */ static int process_auth_done(struct ceph_connection *con, void *p, void *end) { - u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 session_key_buf[CEPH_MAX_KEY_LEN + 16]; u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); From 0ee8bccf7396d50726c9c8dd3135fb64a9fe8426 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 5 Jul 2025 11:28:21 +0200 Subject: [PATCH 02/11] libceph: generalize ceph_x_encrypt_offset() and ceph_x_encrypt_buflen() - introduce the notion of a data offset for ceph_x_encrypt_offset() to allow for e.g. a confounder to be prepended before the encryption header in the future. For CEPH_CRYPTO_AES, the data offset is 0 (i.e. nothing is prepended). - adjust ceph_x_encrypt_buflen() accordingly and make it account precisely for the PKCS#7 padding that is used by CEPH_CRYPTO_AES instead of just always adding 16. Signed-off-by: Ilya Dryomov --- net/ceph/auth_x.c | 44 +++++++++++++++++++++++++++++--------------- net/ceph/crypto.c | 25 +++++++++++++++++++++++++ net/ceph/crypto.h | 2 ++ 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a21c157daf7d..5d7245884f95 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -44,23 +44,35 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) return !!need; } -static int ceph_x_encrypt_offset(void) +static int __ceph_x_encrypt_offset(const struct ceph_crypto_key *key) { - return sizeof(u32) + sizeof(struct ceph_x_encrypt_header); + return ceph_crypt_data_offset(key) + + sizeof(struct ceph_x_encrypt_header); } -static int ceph_x_encrypt_buflen(int ilen) +static int ceph_x_encrypt_offset(const struct ceph_crypto_key *key) { - return ceph_x_encrypt_offset() + ilen + 16; + return sizeof(u32) + __ceph_x_encrypt_offset(key); +} + +/* + * AES: ciphertext_len | hdr | data... 
| padding + */ +static int ceph_x_encrypt_buflen(const struct ceph_crypto_key *key, + int data_len) +{ + int encrypt_len = sizeof(struct ceph_x_encrypt_header) + data_len; + return sizeof(u32) + ceph_crypt_buflen(key, encrypt_len); } static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, int buf_len, int plaintext_len) { - struct ceph_x_encrypt_header *hdr = buf + sizeof(u32); + struct ceph_x_encrypt_header *hdr; int ciphertext_len; int ret; + hdr = buf + sizeof(u32) + ceph_crypt_data_offset(secret); hdr->struct_v = 1; hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC); @@ -77,7 +89,7 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, int ciphertext_len) { - struct ceph_x_encrypt_header *hdr = p; + struct ceph_x_encrypt_header *hdr; int plaintext_len; int ret; @@ -86,6 +98,7 @@ static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, if (ret) return ret; + hdr = p + ceph_crypt_data_offset(secret); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); return -EINVAL; @@ -193,7 +206,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, } /* blob for me */ - dp = *p + ceph_x_encrypt_offset(); + dp = *p + ceph_x_encrypt_offset(secret); ret = ceph_x_decrypt(secret, p, end); if (ret < 0) goto out; @@ -220,7 +233,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_decode_8_safe(p, end, is_enc, bad); if (is_enc) { /* encrypted */ - tp = *p + ceph_x_encrypt_offset(); + tp = *p + ceph_x_encrypt_offset(&th->session_key); ret = ceph_x_decrypt(&th->session_key, p, end); if (ret < 0) goto out; @@ -312,7 +325,7 @@ static int encrypt_authorizer(struct ceph_x_authorizer *au, p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len); end = au->buf->vec.iov_base + au->buf->vec.iov_len; - msg_b = p + ceph_x_encrypt_offset(); + msg_b = p + ceph_x_encrypt_offset(&au->session_key); msg_b->struct_v = 2; msg_b->nonce 
= cpu_to_le64(au->nonce); if (server_challenge) { @@ -368,7 +381,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, goto out_au; maxlen = sizeof(*msg_a) + ticket_blob_len + - ceph_x_encrypt_buflen(sizeof(*msg_b)); + ceph_x_encrypt_buflen(&au->session_key, sizeof(*msg_b)); dout(" need len %d\n", maxlen); if (au->buf && au->buf->alloc_len < maxlen) { ceph_buffer_put(au->buf); @@ -507,7 +520,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_authenticate *auth = (void *)(head + 1); void *enc_buf = xi->auth_authorizer.enc_buf; struct ceph_x_challenge_blob *blob = enc_buf + - ceph_x_encrypt_offset(); + ceph_x_encrypt_offset(&xi->secret); u64 *u; p = auth + 1; @@ -634,7 +647,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, ceph_decode_need(p, end, len, e_inval); dout("%s connection secret blob len %d\n", __func__, len); if (len > 0) { - dp = *p + ceph_x_encrypt_offset(); + dp = *p + ceph_x_encrypt_offset(&th->session_key); ret = ceph_x_decrypt(&th->session_key, p, *p + len); if (ret < 0) return ret; @@ -804,7 +817,7 @@ static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, return ret; dout("%s decrypted %d bytes\n", __func__, ret); - dp = challenge + sizeof(struct ceph_x_encrypt_header); + dp = challenge + __ceph_x_encrypt_offset(secret); dend = dp + ret; ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ @@ -851,7 +864,7 @@ static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, u8 struct_v; int ret; - dp = *p + ceph_x_encrypt_offset(); + dp = *p + ceph_x_encrypt_offset(secret); ret = ceph_x_decrypt(secret, p, end); if (ret < 0) return ret; @@ -974,7 +987,8 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, __le32 front_crc; __le32 middle_crc; __le32 data_crc; - } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); + } __packed *sigblock = enc_buf + + ceph_x_encrypt_offset(&au->session_key); sigblock->len = 
cpu_to_le32(4*sizeof(u32)); sigblock->header_crc = msg->hdr.crc; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 5601732cf4fa..2b98daffe9af 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -285,6 +285,31 @@ int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, } } +int ceph_crypt_data_offset(const struct ceph_crypto_key *key) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + case CEPH_CRYPTO_AES: + return 0; + default: + BUG(); + } +} + +int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + return data_len; + case CEPH_CRYPTO_AES: + /* PKCS#7 padding at the end */ + return data_len + AES_BLOCK_SIZE - + (data_len & (AES_BLOCK_SIZE - 1)); + default: + BUG(); + } +} + static int ceph_key_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index a20bad6d1e96..736ec6d2fbcb 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -28,6 +28,8 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key); /* crypto.c */ int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len); +int ceph_crypt_data_offset(const struct ceph_crypto_key *key); +int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len); int ceph_crypto_init(void); void ceph_crypto_shutdown(void); From 6cec0b61aacce4da5125b21c718189f0dc11eb51 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 22 Dec 2025 19:44:24 +0100 Subject: [PATCH 03/11] libceph: introduce ceph_crypto_key_prepare() In preparation for bringing in a new encryption scheme/key type, decouple decoding or cloning the key from allocating required crypto API objects and setting them up. The rationale is that a) in some cases a shallow clone is sufficient and b) ceph_crypto_key_prepare() may grow additional parameters that would be inconvenient to provide at the point the key is originally decoded. 
Signed-off-by: Ilya Dryomov --- net/ceph/auth_x.c | 26 +++++++++++++---- net/ceph/crypto.c | 73 +++++++++++++++++++++++------------------------ net/ceph/crypto.h | 1 + 3 files changed, 57 insertions(+), 43 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 5d7245884f95..abdd35be263a 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -221,6 +221,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ret = ceph_crypto_key_prepare(&new_session_key); + if (ret) + goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); @@ -380,6 +384,10 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, if (ret) goto out_au; + ret = ceph_crypto_key_prepare(&au->session_key); + if (ret) + goto out_au; + maxlen = sizeof(*msg_a) + ticket_blob_len + ceph_x_encrypt_buflen(&au->session_key, sizeof(*msg_b)); dout(" need len %d\n", maxlen); @@ -1106,21 +1114,26 @@ int ceph_x_init(struct ceph_auth_client *ac) int ret; dout("ceph_x_init %p\n", ac); - ret = -ENOMEM; xi = kzalloc(sizeof(*xi), GFP_NOFS); if (!xi) - goto out; + return -ENOMEM; ret = -EINVAL; if (!ac->key) { pr_err("no secret set (for auth_x protocol)\n"); - goto out_nomem; + goto err_xi; } ret = ceph_crypto_key_clone(&xi->secret, ac->key); if (ret < 0) { pr_err("cannot clone key: %d\n", ret); - goto out_nomem; + goto err_xi; + } + + ret = ceph_crypto_key_prepare(&xi->secret); + if (ret) { + pr_err("cannot prepare key: %d\n", ret); + goto err_secret; } xi->starting = true; @@ -1131,8 +1144,9 @@ int ceph_x_init(struct ceph_auth_client *ac) ac->ops = &ceph_x_ops; return 0; -out_nomem: +err_secret: + ceph_crypto_key_destroy(&xi->secret); +err_xi: kfree(xi); -out: return ret; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 2b98daffe9af..3453dc303315 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -16,65 +16,61 @@ #include #include "crypto.h" -/* - * Set 
->key and ->tfm. The rest of the key should be filled in before - * this function is called. - */ -static int set_secret(struct ceph_crypto_key *key, void *buf) +static int set_aes_tfm(struct ceph_crypto_key *key) { unsigned int noio_flag; int ret; - key->key = NULL; - key->tfm = NULL; - - switch (key->type) { - case CEPH_CRYPTO_NONE: - return 0; /* nothing to do */ - case CEPH_CRYPTO_AES: - break; - default: - return -ENOTSUPP; - } - - key->key = kmemdup(buf, key->len, GFP_NOIO); - if (!key->key) { - ret = -ENOMEM; - goto fail; - } - - /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); if (IS_ERR(key->tfm)) { ret = PTR_ERR(key->tfm); key->tfm = NULL; - goto fail; + return ret; } ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); if (ret) - goto fail; + return ret; return 0; - -fail: - ceph_crypto_key_destroy(key); - return ret; } +int ceph_crypto_key_prepare(struct ceph_crypto_key *key) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + return 0; /* nothing to do */ + case CEPH_CRYPTO_AES: + return set_aes_tfm(key); + default: + return -ENOTSUPP; + } +} + +/* + * @dst should be zeroed before this function is called. + */ int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src) { - memcpy(dst, src, sizeof(struct ceph_crypto_key)); - return set_secret(dst, src->key); + dst->type = src->type; + dst->created = src->created; + dst->len = src->len; + + dst->key = kmemdup(src->key, src->len, GFP_NOIO); + if (!dst->key) + return -ENOMEM; + + return 0; } +/* + * @key should be zeroed before this function is called. 
+ */ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) { - int ret; - ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); key->type = ceph_decode_16(p); ceph_decode_copy(p, &key->created, sizeof(key->created)); @@ -85,10 +81,13 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) return -EINVAL; } - ret = set_secret(key, *p); + key->key = kmemdup(*p, key->len, GFP_NOIO); + if (!key->key) + return -ENOMEM; + memzero_explicit(*p, key->len); *p += key->len; - return ret; + return 0; bad: dout("failed to decode crypto key\n"); @@ -322,7 +321,7 @@ static int ceph_key_preparse(struct key_preparsed_payload *prep) goto err; ret = -ENOMEM; - ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); + ckey = kzalloc(sizeof(*ckey), GFP_KERNEL); if (!ckey) goto err; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 736ec6d2fbcb..2b8f8f68ff7a 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -19,6 +19,7 @@ struct ceph_crypto_key { struct crypto_sync_skcipher *tfm; }; +int ceph_crypto_key_prepare(struct ceph_crypto_key *key); int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src); int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); From b7cc142dbafeaf6c053284ca9121b9f70b6d6d06 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 22 Dec 2025 20:41:27 +0100 Subject: [PATCH 04/11] libceph: add support for CEPH_CRYPTO_AES256KRB5 This is based on AES256-CTS-HMAC384-192 crypto algorithm per RFC 8009 (i.e. Kerberos 5, hence the name) with custom-defined key usage numbers. The implementation allows a given key to have/be linked to between one and three usage numbers. The existing CEPH_CRYPTO_AES remains in place and unchanged. The usage_slot parameter that needed to be added to ceph_crypt() and its wrappers is simply ignored there. 
Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 5 +- net/ceph/Kconfig | 1 + net/ceph/auth_x.c | 88 ++++++++++++++------ net/ceph/auth_x_protocol.h | 38 +++++++++ net/ceph/crypto.c | 157 +++++++++++++++++++++++++++++++---- net/ceph/crypto.h | 16 +++- 6 files changed, 257 insertions(+), 48 deletions(-) diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 08e5dbe15ca4..69ac3e55a3fe 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -89,8 +89,9 @@ struct ceph_dir_layout { } __attribute__ ((packed)); /* crypto algorithms */ -#define CEPH_CRYPTO_NONE 0x0 -#define CEPH_CRYPTO_AES 0x1 +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 +#define CEPH_CRYPTO_AES256KRB5 0x2 /* AES256-CTS-HMAC384-192 */ #define CEPH_AES_IV "cephsageyudagreg" diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index ea60e3ef0834..7e2528cde4b9 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,6 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM + select CRYPTO_KRB5 select CRYPTO_LIB_SHA256 select CRYPTO select KEYS diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index abdd35be263a..decd2867f8f1 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -17,6 +17,22 @@ #include "auth_x.h" #include "auth_x_protocol.h" +static const u32 ticket_key_usages[] = { + CEPHX_KEY_USAGE_TICKET_SESSION_KEY, + CEPHX_KEY_USAGE_TICKET_BLOB, + CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET +}; + +static const u32 authorizer_key_usages[] = { + CEPHX_KEY_USAGE_AUTHORIZE, + CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE, + CEPHX_KEY_USAGE_AUTHORIZE_REPLY +}; + +static const u32 client_key_usages[] = { + CEPHX_KEY_USAGE_TICKET_SESSION_KEY +}; + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) @@ -57,6 +73,7 @@ static int ceph_x_encrypt_offset(const struct ceph_crypto_key *key) /* * AES: ciphertext_len | hdr | data... 
| padding + * AES256KRB5: ciphertext_len | confounder | hdr | data... | hmac */ static int ceph_x_encrypt_buflen(const struct ceph_crypto_key *key, int data_len) @@ -65,19 +82,19 @@ static int ceph_x_encrypt_buflen(const struct ceph_crypto_key *key, return sizeof(u32) + ceph_crypt_buflen(key, encrypt_len); } -static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, - int buf_len, int plaintext_len) +static int ceph_x_encrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int plaintext_len) { struct ceph_x_encrypt_header *hdr; int ciphertext_len; int ret; - hdr = buf + sizeof(u32) + ceph_crypt_data_offset(secret); + hdr = buf + sizeof(u32) + ceph_crypt_data_offset(key); hdr->struct_v = 1; hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC); - ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32), - plaintext_len + sizeof(struct ceph_x_encrypt_header), + ret = ceph_crypt(key, usage_slot, true, buf + sizeof(u32), + buf_len - sizeof(u32), plaintext_len + sizeof(*hdr), &ciphertext_len); if (ret) return ret; @@ -86,19 +103,19 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, return sizeof(u32) + ciphertext_len; } -static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, - int ciphertext_len) +static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void *p, int ciphertext_len) { struct ceph_x_encrypt_header *hdr; int plaintext_len; int ret; - ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len, - &plaintext_len); + ret = ceph_crypt(key, usage_slot, false, p, ciphertext_len, + ciphertext_len, &plaintext_len); if (ret) return ret; - hdr = p + ceph_crypt_data_offset(secret); + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); return -EINVAL; @@ -107,7 +124,8 @@ static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, return plaintext_len - sizeof(*hdr); } -static int 
ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) +static int ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void **p, void *end) { int ciphertext_len; int ret; @@ -115,7 +133,7 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) ceph_decode_32_safe(p, end, ciphertext_len, e_inval); ceph_decode_need(p, end, ciphertext_len, e_inval); - ret = __ceph_x_decrypt(secret, *p, ciphertext_len); + ret = __ceph_x_decrypt(key, usage_slot, *p, ciphertext_len); if (ret < 0) return ret; @@ -207,7 +225,9 @@ static int process_one_ticket(struct ceph_auth_client *ac, /* blob for me */ dp = *p + ceph_x_encrypt_offset(secret); - ret = ceph_x_decrypt(secret, p, end); + ret = ceph_x_decrypt(secret, + 0 /* CEPHX_KEY_USAGE_TICKET_SESSION_KEY */, + p, end); if (ret < 0) goto out; dout(" decrypted %d bytes\n", ret); @@ -221,7 +241,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ret = ceph_crypto_key_prepare(&new_session_key); + ret = ceph_crypto_key_prepare(&new_session_key, ticket_key_usages, + ARRAY_SIZE(ticket_key_usages)); if (ret) goto out; @@ -238,7 +259,9 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (is_enc) { /* encrypted */ tp = *p + ceph_x_encrypt_offset(&th->session_key); - ret = ceph_x_decrypt(&th->session_key, p, end); + ret = ceph_x_decrypt(&th->session_key, + 1 /* CEPHX_KEY_USAGE_TICKET_BLOB */, + p, end); if (ret < 0) goto out; dout(" encrypted ticket, decrypted %d bytes\n", ret); @@ -341,7 +364,9 @@ static int encrypt_authorizer(struct ceph_x_authorizer *au, msg_b->server_challenge_plus_one = 0; } - ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); + ret = ceph_x_encrypt(&au->session_key, + 0 /* CEPHX_KEY_USAGE_AUTHORIZE */, + p, end - p, sizeof(*msg_b)); if (ret < 0) return ret; @@ -384,7 +409,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, if (ret) goto out_au; - ret = 
ceph_crypto_key_prepare(&au->session_key); + ret = ceph_crypto_key_prepare(&au->session_key, authorizer_key_usages, + ARRAY_SIZE(authorizer_key_usages)); if (ret) goto out_au; @@ -542,7 +568,8 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, get_random_bytes(&auth->client_challenge, sizeof(u64)); blob->client_challenge = auth->client_challenge; blob->server_challenge = cpu_to_le64(xi->server_challenge); - ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN, + ret = ceph_x_encrypt(&xi->secret, 0 /* dummy */, + enc_buf, CEPHX_AU_ENC_BUF_LEN, sizeof(*blob)); if (ret < 0) return ret; @@ -656,7 +683,9 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, dout("%s connection secret blob len %d\n", __func__, len); if (len > 0) { dp = *p + ceph_x_encrypt_offset(&th->session_key); - ret = ceph_x_decrypt(&th->session_key, p, *p + len); + ret = ceph_x_decrypt(&th->session_key, + 2 /* CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET */, + p, *p + len); if (ret < 0) return ret; @@ -820,7 +849,9 @@ static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, int ret; /* no leading len */ - ret = __ceph_x_decrypt(secret, challenge, challenge_len); + ret = __ceph_x_decrypt(secret, + 1 /* CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE */, + challenge, challenge_len); if (ret < 0) return ret; @@ -873,7 +904,8 @@ static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, int ret; dp = *p + ceph_x_encrypt_offset(secret); - ret = ceph_x_decrypt(secret, p, end); + ret = ceph_x_decrypt(secret, 2 /* CEPHX_KEY_USAGE_AUTHORIZE_REPLY */, + p, end); if (ret < 0) return ret; @@ -1004,8 +1036,9 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, sigblock->middle_crc = msg->footer.middle_crc; sigblock->data_crc = msg->footer.data_crc; - ret = ceph_x_encrypt(&au->session_key, enc_buf, - CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock)); + ret = ceph_x_encrypt(&au->session_key, 0 /* dummy */, + enc_buf, CEPHX_AU_ENC_BUF_LEN, 
+ sizeof(*sigblock)); if (ret < 0) return ret; @@ -1036,9 +1069,9 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq; /* no leading len, no ceph_x_encrypt_header */ - ret = ceph_crypt(&au->session_key, true, enc_buf, - CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock), - &ciphertext_len); + ret = ceph_crypt(&au->session_key, 0 /* dummy */, + true, enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*sigblock), &ciphertext_len); if (ret) return ret; @@ -1130,7 +1163,8 @@ int ceph_x_init(struct ceph_auth_client *ac) goto err_xi; } - ret = ceph_crypto_key_prepare(&xi->secret); + ret = ceph_crypto_key_prepare(&xi->secret, client_key_usages, + ARRAY_SIZE(client_key_usages)); if (ret) { pr_err("cannot prepare key: %d\n", ret); goto err_secret; diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 9c60feeb1bcb..d097b3651c99 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -6,6 +6,44 @@ #define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 #define CEPHX_GET_ROTATING_KEY 0x0400 +/* Client <-> AuthMonitor */ +/* + * The AUTH session's connection secret: encrypted with the AUTH + * ticket session key + */ +#define CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET 0x03 +/* + * The ticket's blob for the client ("blob for me", contains the + * session key): encrypted with the client's secret key in case of + * the AUTH ticket and the AUTH ticket session key in case of other + * service tickets + */ +#define CEPHX_KEY_USAGE_TICKET_SESSION_KEY 0x04 +/* + * The ticket's blob for the service (ceph_x_ticket_blob): possibly + * encrypted with the old AUTH ticket session key in case of the AUTH + * ticket and not encrypted in case of other service tickets + */ +#define CEPHX_KEY_USAGE_TICKET_BLOB 0x05 + +/* Client <-> Service */ +/* + * The client's authorization request (ceph_x_authorize_b): + * encrypted with the service ticket session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE 0x10 +/* + * The 
service's challenge (ceph_x_authorize_challenge): + * encrypted with the service ticket session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE 0x11 +/* + * The service's final reply (ceph_x_authorize_reply + the service + * session's connection secret): encrypted with the service ticket + * session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE_REPLY 0x12 + /* common bits */ struct ceph_x_ticket_blob { __u8 struct_v; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 3453dc303315..b54085d8d5f0 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -22,28 +23,68 @@ static int set_aes_tfm(struct ceph_crypto_key *key) int ret; noio_flag = memalloc_noio_save(); - key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); + key->aes_tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); - if (IS_ERR(key->tfm)) { - ret = PTR_ERR(key->tfm); - key->tfm = NULL; + if (IS_ERR(key->aes_tfm)) { + ret = PTR_ERR(key->aes_tfm); + key->aes_tfm = NULL; return ret; } - ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); + ret = crypto_sync_skcipher_setkey(key->aes_tfm, key->key, key->len); if (ret) return ret; return 0; } -int ceph_crypto_key_prepare(struct ceph_crypto_key *key) +static int set_krb5_tfms(struct ceph_crypto_key *key, const u32 *key_usages, + int key_usage_cnt) +{ + struct krb5_buffer TK = { .len = key->len, .data = key->key }; + unsigned int noio_flag; + int ret = 0; + int i; + + if (WARN_ON_ONCE(key_usage_cnt > ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; + + key->krb5_type = crypto_krb5_find_enctype( + KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (!key->krb5_type) + return -ENOPKG; + + /* + * Despite crypto_krb5_prepare_encryption() taking a gfp mask, + * crypto_alloc_aead() inside of it allocates with GFP_KERNEL. 
+ */ + noio_flag = memalloc_noio_save(); + for (i = 0; i < key_usage_cnt; i++) { + key->krb5_tfms[i] = crypto_krb5_prepare_encryption( + key->krb5_type, &TK, key_usages[i], + GFP_NOIO); + if (IS_ERR(key->krb5_tfms[i])) { + ret = PTR_ERR(key->krb5_tfms[i]); + key->krb5_tfms[i] = NULL; + goto out_flag; + } + } + +out_flag: + memalloc_noio_restore(noio_flag); + return ret; +} + +int ceph_crypto_key_prepare(struct ceph_crypto_key *key, + const u32 *key_usages, int key_usage_cnt) { switch (key->type) { case CEPH_CRYPTO_NONE: return 0; /* nothing to do */ case CEPH_CRYPTO_AES: return set_aes_tfm(key); + case CEPH_CRYPTO_AES256KRB5: + return set_krb5_tfms(key, key_usages, key_usage_cnt); default: return -ENOTSUPP; } @@ -123,12 +164,25 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - if (key) { - kfree_sensitive(key->key); - key->key = NULL; - if (key->tfm) { - crypto_free_sync_skcipher(key->tfm); - key->tfm = NULL; + int i; + + if (!key) + return; + + kfree_sensitive(key->key); + key->key = NULL; + + if (key->type == CEPH_CRYPTO_AES) { + if (key->aes_tfm) { + crypto_free_sync_skcipher(key->aes_tfm); + key->aes_tfm = NULL; + } + } else if (key->type == CEPH_CRYPTO_AES256KRB5) { + for (i = 0; i < ARRAY_SIZE(key->krb5_tfms); i++) { + if (key->krb5_tfms[i]) { + crypto_free_aead(key->krb5_tfms[i]); + key->krb5_tfms[i] = NULL; + } } } } @@ -208,7 +262,7 @@ static void teardown_sgtable(struct sg_table *sgt) static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->aes_tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); @@ -224,7 +278,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); - 
skcipher_request_set_sync_tfm(req, key->tfm); + skcipher_request_set_sync_tfm(req, key->aes_tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); @@ -269,7 +323,68 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, return ret; } -int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, +static int ceph_krb5_encrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int in_len, int *pout_len) +{ + struct sg_table sgt; + struct scatterlist prealloc_sg; + int ret; + + if (WARN_ON_ONCE(usage_slot >= ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; + + ret = setup_sgtable(&sgt, &prealloc_sg, buf, buf_len); + if (ret) + return ret; + + ret = crypto_krb5_encrypt(key->krb5_type, key->krb5_tfms[usage_slot], + sgt.sgl, sgt.nents, buf_len, AES_BLOCK_SIZE, + in_len, false); + if (ret < 0) { + pr_err("%s encrypt failed: %d\n", __func__, ret); + goto out_sgt; + } + + *pout_len = ret; + ret = 0; + +out_sgt: + teardown_sgtable(&sgt); + return ret; +} + +static int ceph_krb5_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int in_len, int *pout_len) +{ + struct sg_table sgt; + struct scatterlist prealloc_sg; + size_t data_off = 0; + size_t data_len = in_len; + int ret; + + if (WARN_ON_ONCE(usage_slot >= ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; + + ret = setup_sgtable(&sgt, &prealloc_sg, buf, in_len); + if (ret) + return ret; + + ret = crypto_krb5_decrypt(key->krb5_type, key->krb5_tfms[usage_slot], + sgt.sgl, sgt.nents, &data_off, &data_len); + if (ret) { + pr_err("%s decrypt failed: %d\n", __func__, ret); + goto out_sgt; + } + + WARN_ON(data_off != AES_BLOCK_SIZE); + *pout_len = data_len; + +out_sgt: + teardown_sgtable(&sgt); + return ret; +} + +int ceph_crypt(const struct ceph_crypto_key *key, int usage_slot, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { switch (key->type) { @@ -279,6 +394,12 
@@ int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, case CEPH_CRYPTO_AES: return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len, pout_len); + case CEPH_CRYPTO_AES256KRB5: + return encrypt ? + ceph_krb5_encrypt(key, usage_slot, buf, buf_len, in_len, + pout_len) : + ceph_krb5_decrypt(key, usage_slot, buf, buf_len, in_len, + pout_len); default: return -ENOTSUPP; } @@ -290,6 +411,9 @@ int ceph_crypt_data_offset(const struct ceph_crypto_key *key) case CEPH_CRYPTO_NONE: case CEPH_CRYPTO_AES: return 0; + case CEPH_CRYPTO_AES256KRB5: + /* confounder */ + return AES_BLOCK_SIZE; default: BUG(); } @@ -304,6 +428,9 @@ int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len) /* PKCS#7 padding at the end */ return data_len + AES_BLOCK_SIZE - (data_len & (AES_BLOCK_SIZE - 1)); + case CEPH_CRYPTO_AES256KRB5: + /* confounder at the beginning and 192-bit HMAC at the end */ + return AES_BLOCK_SIZE + data_len + 24; default: BUG(); } diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 2b8f8f68ff7a..2c37c54d0f56 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,7 +5,7 @@ #include #include -#define CEPH_MAX_KEY_LEN 16 +#define CEPH_MAX_KEY_LEN 32 #define CEPH_MAX_CON_SECRET_LEN 64 /* @@ -16,10 +16,18 @@ struct ceph_crypto_key { struct ceph_timespec created; int len; void *key; - struct crypto_sync_skcipher *tfm; + + union { + struct crypto_sync_skcipher *aes_tfm; + struct { + const struct krb5_enctype *krb5_type; + struct crypto_aead *krb5_tfms[3]; + }; + }; }; -int ceph_crypto_key_prepare(struct ceph_crypto_key *key); +int ceph_crypto_key_prepare(struct ceph_crypto_key *key, + const u32 *key_usages, int key_usage_cnt); int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src); int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); @@ -27,7 +35,7 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); void ceph_crypto_key_destroy(struct ceph_crypto_key *key); /* 
crypto.c */ -int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, +int ceph_crypt(const struct ceph_crypto_key *key, int usage_slot, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len); int ceph_crypt_data_offset(const struct ceph_crypto_key *key); int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len); From 8356b4b1103b8c970648c94bab724aa30e42d869 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 12 Jul 2025 17:11:55 +0200 Subject: [PATCH 05/11] libceph: adapt ceph_x_challenge_blob hashing and msgr1 message signing The existing approach where ceph_x_challenge_blob is encrypted with the client's secret key and then the digest derived from the ciphertext is used for the test doesn't work with CEPH_CRYPTO_AES256KRB5 because the confounder randomizes the ciphertext: the client and the server get two different ciphertexts and therefore two different digests. msgr1 signatures are affected the same way: a digest derived from the ciphertext for the message's "sigblock" is what becomes a signature and the two sides disagree on the expected value. For CEPH_CRYPTO_AES256KRB5 (and potential future encryption schemes), switch to HMAC-SHA256 function keyed in the same way as the existing encryption. For CEPH_CRYPTO_AES, everything is preserved as is. 
Signed-off-by: Ilya Dryomov --- net/ceph/auth_x.c | 59 ++++++++++++++++++++++++++++++----------- net/ceph/crypto.c | 18 +++++++++++++ net/ceph/crypto.h | 4 +++ net/ceph/messenger_v2.c | 14 +++++----- 4 files changed, 72 insertions(+), 23 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index decd2867f8f1..13b3df9af0ac 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -553,8 +553,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); void *enc_buf = xi->auth_authorizer.enc_buf; - struct ceph_x_challenge_blob *blob = enc_buf + - ceph_x_encrypt_offset(&xi->secret); + struct ceph_x_challenge_blob *blob; u64 *u; p = auth + 1; @@ -564,15 +563,29 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, dout(" get_auth_session_key\n"); head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); - /* encrypt and hash */ + if (xi->secret.type == CEPH_CRYPTO_AES) { + blob = enc_buf + ceph_x_encrypt_offset(&xi->secret); + } else { + BUILD_BUG_ON(SHA256_DIGEST_SIZE + sizeof(*blob) > + CEPHX_AU_ENC_BUF_LEN); + blob = enc_buf + SHA256_DIGEST_SIZE; + } + get_random_bytes(&auth->client_challenge, sizeof(u64)); blob->client_challenge = auth->client_challenge; blob->server_challenge = cpu_to_le64(xi->server_challenge); - ret = ceph_x_encrypt(&xi->secret, 0 /* dummy */, - enc_buf, CEPHX_AU_ENC_BUF_LEN, - sizeof(*blob)); - if (ret < 0) - return ret; + + if (xi->secret.type == CEPH_CRYPTO_AES) { + ret = ceph_x_encrypt(&xi->secret, 0 /* dummy */, + enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*blob)); + if (ret < 0) + return ret; + } else { + ceph_hmac_sha256(&xi->secret, blob, sizeof(*blob), + enc_buf); + ret = SHA256_DIGEST_SIZE; + } auth->struct_v = 3; /* nautilus+ */ auth->key = 0; @@ -1053,11 +1066,19 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, __le32 data_crc; __le32 data_len; __le32 seq_lower_word; - } __packed *sigblock = 
enc_buf; + } __packed *sigblock; struct { __le64 a, b, c, d; } __packed *penc = enc_buf; - int ciphertext_len; + + if (au->session_key.type == CEPH_CRYPTO_AES) { + /* no leading len, no ceph_x_encrypt_header */ + sigblock = enc_buf; + } else { + BUILD_BUG_ON(SHA256_DIGEST_SIZE + sizeof(*sigblock) > + CEPHX_AU_ENC_BUF_LEN); + sigblock = enc_buf + SHA256_DIGEST_SIZE; + } sigblock->header_crc = msg->hdr.crc; sigblock->front_crc = msg->footer.front_crc; @@ -1068,12 +1089,18 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, sigblock->data_len = msg->hdr.data_len; sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq; - /* no leading len, no ceph_x_encrypt_header */ - ret = ceph_crypt(&au->session_key, 0 /* dummy */, - true, enc_buf, CEPHX_AU_ENC_BUF_LEN, - sizeof(*sigblock), &ciphertext_len); - if (ret) - return ret; + if (au->session_key.type == CEPH_CRYPTO_AES) { + int ciphertext_len; /* unused */ + + ret = ceph_crypt(&au->session_key, 0 /* dummy */, + true, enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*sigblock), &ciphertext_len); + if (ret) + return ret; + } else { + ceph_hmac_sha256(&au->session_key, sigblock, + sizeof(*sigblock), enc_buf); + } *psig = penc->a ^ penc->b ^ penc->c ^ penc->d; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index b54085d8d5f0..b2067ea6c38a 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -84,6 +84,7 @@ int ceph_crypto_key_prepare(struct ceph_crypto_key *key, case CEPH_CRYPTO_AES: return set_aes_tfm(key); case CEPH_CRYPTO_AES256KRB5: + hmac_sha256_preparekey(&key->hmac_key, key->key, key->len); return set_krb5_tfms(key, key_usages, key_usage_cnt); default: return -ENOTSUPP; @@ -178,6 +179,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) key->aes_tfm = NULL; } } else if (key->type == CEPH_CRYPTO_AES256KRB5) { + memzero_explicit(&key->hmac_key, sizeof(key->hmac_key)); for (i = 0; i < ARRAY_SIZE(key->krb5_tfms); i++) { if (key->krb5_tfms[i]) { crypto_free_aead(key->krb5_tfms[i]); @@ 
-436,6 +438,22 @@ int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len) } } +void ceph_hmac_sha256(const struct ceph_crypto_key *key, const void *buf, + int buf_len, u8 hmac[SHA256_DIGEST_SIZE]) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + case CEPH_CRYPTO_AES: + memset(hmac, 0, SHA256_DIGEST_SIZE); + return; + case CEPH_CRYPTO_AES256KRB5: + hmac_sha256(&key->hmac_key, buf, buf_len, hmac); + return; + default: + BUG(); + } +} + static int ceph_key_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 2c37c54d0f56..3a2ade15abbc 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -2,6 +2,7 @@ #ifndef _FS_CEPH_CRYPTO_H #define _FS_CEPH_CRYPTO_H +#include #include #include @@ -20,6 +21,7 @@ struct ceph_crypto_key { union { struct crypto_sync_skcipher *aes_tfm; struct { + struct hmac_sha256_key hmac_key; const struct krb5_enctype *krb5_type; struct crypto_aead *krb5_tfms[3]; }; @@ -39,6 +41,8 @@ int ceph_crypt(const struct ceph_crypto_key *key, int usage_slot, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len); int ceph_crypt_data_offset(const struct ceph_crypto_key *key); int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len); +void ceph_hmac_sha256(const struct ceph_crypto_key *key, const void *buf, + int buf_len, u8 hmac[SHA256_DIGEST_SIZE]); int ceph_crypto_init(void); void ceph_crypto_shutdown(void); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 31e042dc1b3f..5ec3272cd2dd 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -779,9 +779,9 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static void ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, - u8 hmac[SHA256_DIGEST_SIZE]) +static void con_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) 
{ struct hmac_sha256_ctx ctx; int i; @@ -1438,8 +1438,8 @@ static int prepare_auth_signature(struct ceph_connection *con) if (!buf) return -ENOMEM; - ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, - CTRL_BODY(buf)); + con_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -2436,8 +2436,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, - hmac); + con_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { From f16bd3fa74a2084ee7e16a8a2be7e7399b970907 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Thu, 25 Sep 2025 18:42:05 +0800 Subject: [PATCH 06/11] ceph: supply snapshot context in ceph_zero_partial_object() The ceph_zero_partial_object function was missing proper snapshot context for its OSD write operations, which could lead to data inconsistencies in snapshots. Reproducer: ../src/vstart.sh --new -x --localhost --bluestore ./bin/ceph auth caps client.fs_a mds 'allow rwps fsname=a' mon 'allow r fsname=a' osd 'allow rw tag cephfs data=a' mount -t ceph fs_a@.a=/ /mnt/mycephfs/ -o conf=./ceph.conf dd if=/dev/urandom of=/mnt/mycephfs/foo bs=64K count=1 mkdir /mnt/mycephfs/.snap/snap1 md5sum /mnt/mycephfs/.snap/snap1/foo fallocate -p -o 0 -l 4096 /mnt/mycephfs/foo echo 3 > /proc/sys/vm/drop_caches md5sum /mnt/mycephfs/.snap/snap1/foo # get different md5sum!!
Cc: stable@vger.kernel.org Fixes: ad7a60de882ac ("ceph: punch hole support") Signed-off-by: ethanwu Reviewed-by: Viacheslav Dubeyko Tested-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 983390069f73..9152b4722710 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2568,6 +2568,7 @@ static int ceph_zero_partial_object(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_osd_request *req; + struct ceph_snap_context *snapc; int ret = 0; loff_t zero = 0; int op; @@ -2582,12 +2583,25 @@ static int ceph_zero_partial_object(struct inode *inode, op = CEPH_OSD_OP_ZERO; } + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, length, 0, 1, op, CEPH_OSD_FLAG_WRITE, - NULL, 0, 0, false); + snapc, 0, 0, false); if (IS_ERR(req)) { ret = PTR_ERR(req); goto out; @@ -2601,6 +2615,7 @@ static int ceph_zero_partial_object(struct inode *inode, ceph_osdc_put_request(req); out: + ceph_put_snap_context(snapc); return ret; } From 305ff6b3a03c230d3c07b61457e961406d979693 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Thu, 25 Sep 2025 18:42:06 +0800 Subject: [PATCH 07/11] ceph: supply snapshot context in ceph_uninline_data() The ceph_uninline_data function was missing proper snapshot context handling for its OSD write operations. 
Both CEPH_OSD_OP_CREATE and CEPH_OSD_OP_WRITE requests were passing NULL instead of the appropriate snapshot context, which could lead to an unnecessary object clone. Reproducer: ../src/vstart.sh --new -x --localhost --bluestore // turn on cephfs inline data ./bin/ceph fs set a inline_data true --yes-i-really-really-mean-it // allow fs_a client to take snapshot ./bin/ceph auth caps client.fs_a mds 'allow rwps fsname=a' mon 'allow r fsname=a' osd 'allow rw tag cephfs data=a' // mount cephfs with fuse, since kernel cephfs doesn't support inline write ceph-fuse --id fs_a -m 127.0.0.1:40318 --conf ceph.conf -d /mnt/mycephfs/ // bump snapshot seq mkdir /mnt/mycephfs/.snap/snap1 echo "foo" > /mnt/mycephfs/test // umount and mount it again using kernel cephfs client umount /mnt/mycephfs mount -t ceph fs_a@.a=/ /mnt/mycephfs/ -o conf=./ceph.conf echo "bar" >> /mnt/mycephfs/test ./bin/rados listsnaps -p cephfs.a.data $(printf "%x\n" $(stat -c %i /mnt/mycephfs/test)).00000000 You will see that this object has an unnecessary clone: 1000000000a.00000000 (seq:2): cloneid snaps size overlap 2 2 4 [] head - 8 but the expected output is: 10000000000.00000000 (seq:2): cloneid snaps size overlap head - 8 since there's no snapshot between these 2 writes. The clone happened because the first OSD request, CEPH_OSD_OP_CREATE, doesn't pass a snap context, so the object is created with snap seq 0, but the later data writeback is equipped with a snapshot context. snap.seq(1) > object snap seq(0), so the OSD clones the object. This fix properly acquires the snapshot context before performing write operations.
Signed-off-by: ethanwu Reviewed-by: Viacheslav Dubeyko Tested-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 63b75d214210..faecd9025ee9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -2199,6 +2199,7 @@ int ceph_uninline_data(struct file *file) struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; + struct ceph_snap_context *snapc = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; int err = 0; @@ -2226,6 +2227,24 @@ int ceph_uninline_data(struct file *file) if (inline_version == 1) /* initial version, no data */ goto out_uninline; + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + if (!ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); @@ -2241,7 +2260,7 @@ int ceph_uninline_data(struct file *file) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, - NULL, 0, 0, false); + snapc, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; @@ -2257,7 +2276,7 @@ int ceph_uninline_data(struct file *file) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - NULL, ci->i_truncate_seq, + snapc, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { err = 
PTR_ERR(req); @@ -2320,6 +2339,7 @@ int ceph_uninline_data(struct file *file) folio_put(folio); } out: + ceph_put_snap_context(snapc); ceph_free_cap_flush(prealloc_cf); doutc(cl, "%llx.%llx inline_version %llu = %d\n", ceph_vinop(inode), inline_version, err); From 707104682e3c163f7c14cdd6b07a3e95fb374759 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Sun, 25 Jan 2026 18:30:52 -0800 Subject: [PATCH 08/11] ceph: do not propagate page array emplacement errors as batch errors When fscrypt is enabled, move_dirty_folio_in_page_array() may fail because it needs to allocate bounce buffers to store the encrypted versions of each folio. Each folio beyond the first allocates its bounce buffer with GFP_NOWAIT. Failures are common (and expected) under this allocation mode; they should flush (not abort) the batch. However, ceph_process_folio_batch() uses the same `rc` variable for its own return code and for capturing the return codes of its routine calls; failing to reset `rc` back to 0 results in the error being propagated out to the main writeback loop, which cannot actually tolerate any errors here: once `ceph_wbc.pages` is allocated, it must be passed to ceph_submit_write() to be freed. If it survives until the next iteration (e.g. due to the goto being followed), ceph_allocate_page_array()'s BUG_ON() will oops the worker. Note that this failure mode is currently masked due to another bug (addressed next in this series) that prevents multiple encrypted folios from being selected for the same write. For now, just reset `rc` when redirtying the folio to prevent errors in move_dirty_folio_in_page_array() from propagating. Note that move_dirty_folio_in_page_array() is careful never to return errors on the first folio, so there is no need to check for that. After this change, ceph_process_folio_batch() no longer returns errors; its only remaining failure indicator is `locked_pages == 0`, which the caller already handles correctly. 
Cc: stable@vger.kernel.org Fixes: ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") Signed-off-by: Sam Edwards Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index faecd9025ee9..3cfe3df6e6a2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1369,6 +1369,7 @@ int ceph_process_folio_batch(struct address_space *mapping, rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, folio); if (rc) { + rc = 0; folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); break; From cac190c7674fea71620d754ffcdaaeed7c551dbc Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Sun, 25 Jan 2026 18:30:53 -0800 Subject: [PATCH 09/11] ceph: fix write storm on fscrypted files CephFS stores file data across multiple RADOS objects. An object is the atomic unit of storage, so the writeback code must clean only folios that belong to the same object with each OSD request. CephFS also supports RAID0-style striping of file contents: if enabled, each object stores multiple unbroken "stripe units" covering different portions of the file; if disabled, a "stripe unit" is simply the whole object. The stripe unit is (usually) reported as the inode's block size. Though the writeback logic could, in principle, lock all dirty folios belonging to the same object, its current design is to lock only a single stripe unit at a time. Ever since this code was first written, it has determined this size by checking the inode's block size. However, the relatively-new fscrypt support needed to reduce the block size for encrypted inodes to the crypto block size (see 'fixes' commit), which causes an unnecessarily high number of write operations (~1024x as many, with 4MiB objects) and correspondingly degraded performance. 
Fix this (and clarify intent) by using i_layout.stripe_unit directly in ceph_define_write_size() so that encrypted inodes are written back with the same number of operations as if they were unencrypted. This patch depends on the preceding commit ("ceph: do not propagate page array emplacement errors as batch errors") for correctness. While it applies cleanly on its own, applying it alone will introduce a regression. This dependency is only relevant for kernels where ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") has been applied; stable kernels without that commit are unaffected. Cc: stable@vger.kernel.org Fixes: 94af0470924c ("ceph: add some fscrypt guardrails") Signed-off-by: Sam Edwards Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3cfe3df6e6a2..c6c853748942 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1000,7 +1000,8 @@ unsigned int ceph_define_write_size(struct address_space *mapping) { struct inode *inode = mapping->host; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); - unsigned int wsize = i_blocksize(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + unsigned int wsize = ci->i_layout.stripe_unit; if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; From fa589acaac08f1877185b5eb22d0985664536101 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Sun, 25 Jan 2026 18:30:54 -0800 Subject: [PATCH 10/11] ceph: remove error return from ceph_process_folio_batch() Following an earlier commit, ceph_process_folio_batch() no longer returns errors because the writeback loop cannot handle them. 
Since this function already indicates failure to lock any pages by leaving `ceph_wbc.locked_pages == 0`, and the writeback loop has no way to handle abandonment of a locked batch, change the return type of ceph_process_folio_batch() to `void` and remove the pathological goto in the writeback loop. The lack of a return code emphasizes that ceph_process_folio_batch() is designed to be abort-free: that is, once it commits a folio for writeback, it will not later abandon it or propagate an error for that folio. Any future changes requiring "abort" logic should follow this invariant by cleaning up its array and resetting ceph_wbc.locked_pages appropriately. Signed-off-by: Sam Edwards Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c6c853748942..daca56993390 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1284,16 +1284,16 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, } static -int ceph_process_folio_batch(struct address_space *mapping, - struct writeback_control *wbc, - struct ceph_writeback_ctl *ceph_wbc) +void ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) { struct inode *inode = mapping->host; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct folio *folio = NULL; unsigned i; - int rc = 0; + int rc; for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { folio = ceph_wbc->fbatch.folios[i]; @@ -1323,12 +1323,10 @@ int ceph_process_folio_batch(struct address_space *mapping, rc = ceph_check_page_before_write(mapping, wbc, ceph_wbc, folio); if (rc == -ENODATA) { - rc = 0; folio_unlock(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } else if (rc == -E2BIG) { - rc = 0; folio_unlock(folio); ceph_wbc->fbatch.folios[i] = NULL; break; @@ -1370,7 
+1368,6 @@ int ceph_process_folio_batch(struct address_space *mapping, rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, folio); if (rc) { - rc = 0; folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); break; @@ -1381,8 +1378,6 @@ int ceph_process_folio_batch(struct address_space *mapping, } ceph_wbc->processed_in_fbatch = i; - - return rc; } static inline @@ -1686,10 +1681,8 @@ static int ceph_writepages_start(struct address_space *mapping, break; process_folio_batch: - rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_process_folio_batch(mapping, wbc, &ceph_wbc); ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (rc) - goto release_folios; /* did we get anything? */ if (!ceph_wbc.locked_pages) From cfdde144ae455b8612a756fe7419d57c9b7833c1 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Sun, 25 Jan 2026 18:30:55 -0800 Subject: [PATCH 11/11] ceph: assert loop invariants in ceph_writepages_start() If `locked_pages` is zero, the page array must not be allocated: ceph_process_folio_batch() uses `locked_pages` to decide when to allocate `pages`, and redundant allocations trigger ceph_allocate_page_array()'s BUG_ON(), resulting in a worker oops (and writeback stall) or even a kernel panic. Consequently, the main loop in ceph_writepages_start() assumes that the lifetime of `pages` is confined to a single iteration. This expectation is currently not clear enough, as evidenced by the recent patch which fixed an oops caused by `pages` persisting into the next loop iteration: - "ceph: do not propagate page array emplacement errors as batch errors" Use an explicit BUG_ON() at the top of the loop to assert the loop's preexisting expectation that `pages` is cleaned up by the previous iteration. Because this is closely tied to `locked_pages`, also make it the previous iteration's responsibility to guarantee its reset, and verify with a second new BUG_ON() instead of handling (and masking) failures to do so. 
This patch does not change invariants, behavior, or failure modes. The added BUG_ON() lines catch conditions that would already trigger oops, but do so earlier for easier debugging and programmer clarity. Signed-off-by: Sam Edwards Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index daca56993390..ce09ff3e020f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1663,7 +1663,9 @@ static int ceph_writepages_start(struct address_space *mapping, tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); while (!has_writeback_done(&ceph_wbc)) { - ceph_wbc.locked_pages = 0; + BUG_ON(ceph_wbc.locked_pages); + BUG_ON(ceph_wbc.pages); + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; get_more_pages: