From 5199c125d25aeae8615c4fc31652cc0fe624338e Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Wed, 18 Mar 2026 18:09:03 +0100 Subject: [PATCH 01/12] libceph: Prevent potential null-ptr-deref in ceph_handle_auth_reply() If a message of type CEPH_MSG_AUTH_REPLY contains a zero value for both protocol and result, this is currently not treated as an error. In case of ac->negotiating == true and ac->protocol > 0, this leads to setting ac->protocol = 0 and ac->ops = NULL. Thereafter, the check for ac->protocol != protocol returns false, and init_protocol() is not called. Subsequently, ac->ops->handle_reply() is called, which leads to a null pointer dereference, because ac->ops is still NULL. This patch changes the check for ac->protocol != protocol to !ac->protocol, as this also includes the case when the protocol was set to zero in the message. This causes the message to be treated as containing a bad auth protocol. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 901b93530b21..3314705e5914 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -245,7 +245,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->protocol = 0; ac->ops = NULL; } - if (ac->protocol != protocol) { + if (!ac->protocol) { ret = init_protocol(ac, protocol); if (ret) { pr_err("auth protocol '%s' init failed: %d\n", From a0d9555bf9eaeba34fe6b6bb86f442fe08ba3842 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Tue, 17 Mar 2026 19:37:33 -0700 Subject: [PATCH 02/12] ceph: fix num_ops off-by-one when crypto allocation fails move_dirty_folio_in_page_array() may fail if the file is encrypted, the dirty folio is not the first in the batch, and it fails to allocate a bounce buffer to hold the ciphertext. 
When that happens, ceph_process_folio_batch() simply redirties the folio and flushes the current batch -- it can retry that folio in a future batch. However, if this failed folio is not contiguous with the last folio that did make it into the batch, then ceph_process_folio_batch() has already incremented `ceph_wbc->num_ops`; because it doesn't follow through and add the discontiguous folio to the array, ceph_submit_write() -- which expects that `ceph_wbc->num_ops` accurately reflects the number of contiguous ranges (and therefore the required number of "write extent" ops) in the writeback -- will panic the kernel: BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); This issue can be reproduced on affected kernels by writing to fscrypt-enabled CephFS file(s) with a 4KiB-written/4KiB-skipped/repeat pattern (total filesize should not matter) and gradually increasing the system's memory pressure until a bounce buffer allocation fails. Fix this crash by decrementing `ceph_wbc->num_ops` back to the correct value when move_dirty_folio_in_page_array() fails, but the folio already started counting a new (i.e. still-empty) extent. The defect corrected by this patch has existed since 2022 (see first `Fixes:`), but another bug blocked multi-folio encrypted writeback until recently (see second `Fixes:`). The second commit made it into 6.18.16, 6.19.6, and 7.0-rc1, unmasking the panic in those versions. This patch therefore fixes a regression (panic) introduced by cac190c7674f. 
Cc: stable@vger.kernel.org Fixes: d55207717ded ("ceph: add encryption support to writepage and writepages") Fixes: cac190c7674f ("ceph: fix write storm on fscrypted files") Signed-off-by: Sam Edwards Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2090fc78529c..44553556ac74 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1365,6 +1365,10 @@ void ceph_process_folio_batch(struct address_space *mapping, rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, folio); if (rc) { + /* Did we just begin a new contiguous op? Nevermind! */ + if (ceph_wbc->len == 0) + ceph_wbc->num_ops--; + folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); break; From c7aac00c2c1dc8f6cb66ce10c730e0cd871408bf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 14:25:19 -0700 Subject: [PATCH 03/12] libceph: Remove obsolete session key alignment logic Since the call to crypto_shash_setkey() was replaced with hmac_sha256_preparekey() which doesn't allocate memory regardless of the alignment of the input key, remove the session key alignment logic from process_auth_done(). Also remove the inclusion of crypto/hash.h, which is no longer needed since crypto_shash is no longer used. 
[ idryomov: rewrap comment ] Signed-off-by: Eric Biggers Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger_v2.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 50f65820f623..05f6eea299fc 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -2352,16 +2351,14 @@ static int process_auth_reply_more(struct ceph_connection *con, } /* - * Align session_key and con_secret to avoid GFP_ATOMIC allocation - * inside crypto_shash_setkey() and crypto_aead_setkey() called from - * setup_crypto(). __aligned(16) isn't guaranteed to work for stack - * objects, so do it by hand. + * Align con_secret to avoid GFP_ATOMIC allocation inside + * crypto_aead_setkey() called from setup_crypto(). __aligned(16) + * isn't guaranteed to work for stack objects, so do it by hand. */ static int process_auth_done(struct ceph_connection *con, void *p, void *end) { - u8 session_key_buf[CEPH_MAX_KEY_LEN + 16]; + u8 session_key[CEPH_MAX_KEY_LEN]; u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; - u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); int session_key_len, con_secret_len; int payload_len; @@ -2415,7 +2412,7 @@ static int process_auth_done(struct ceph_connection *con, void *p, void *end) con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; out: - memzero_explicit(session_key_buf, sizeof(session_key_buf)); + memzero_explicit(session_key, sizeof(session_key)); memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); return ret; From eff0e55f90b0c4a005b04fd0598fe70260ed4e7d Mon Sep 17 00:00:00 2001 From: kexinsun Date: Mon, 23 Feb 2026 21:15:07 +0800 Subject: [PATCH 04/12] libceph: update outdated comment in ceph_sock_write_space() The function try_write() was renamed to ceph_con_v1_try_write() in commit 566050e17e53 ("libceph: separate msgr1 
protocol implementation") and subsequently moved to net/ceph/messenger_v1.c in commit 2f713615ddd9 ("libceph: move msgr1 protocol implementation to its own file"). Update the comment in ceph_sock_write_space() accordingly. [ idryomov: account for msgr2 in the updated comment as well ] Signed-off-by: kexinsun Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 108adb583744..34b3097b4c7b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -368,8 +368,8 @@ static void ceph_sock_write_space(struct sock *sk) /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() - * doesn't get called again until try_write() fills the socket - * buffer. See net/ipv4/tcp_input.c:tcp_check_space() + * doesn't get called again until ceph_con_v[12]_try_write() fills + * the socket buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { From 803447f93d75ab6e40c85e6d12b5630d281d70d6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 27 Mar 2026 17:23:08 +0100 Subject: [PATCH 05/12] ceph: only d_add() negative dentries when they are unhashed Ceph can call d_add(dentry, NULL) on a negative dentry that is already present in the primary dcache hash. In the current VFS that is not safe. d_add() goes through __d_add() to __d_rehash(), which unconditionally reinserts dentry->d_hash into the hlist_bl bucket. If the dentry is already hashed, reinserting the same node can corrupt the bucket, including creating a self-loop. 
Once that happens, __d_lookup() can spin forever in the hlist_bl walk, typically looping only on the d_name.hash mismatch check and eventually triggering RCU stall reports like this one: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 87-....: (2100 ticks this GP) idle=3a4c/1/0x4000000000000000 softirq=25003319/25003319 fqs=829 rcu: (t=2101 jiffies g=79058445 q=698988 ncpus=192) CPU: 87 UID: 2952868916 PID: 3933303 Comm: php-cgi8.3 Not tainted 6.18.17-i1-amd #950 NONE Hardware name: Dell Inc. PowerEdge R7615/0G9DHV, BIOS 1.6.6 09/22/2023 RIP: 0010:__d_lookup+0x46/0xb0 Code: c1 e8 07 48 8d 04 c2 48 8b 00 49 89 fc 49 89 f5 48 89 c3 48 83 e3 fe 48 83 f8 01 77 0f eb 2d 0f 1f 44 00 00 48 8b 1b 48 85 db <74> 20 39 6b 18 75 f3 48 8d 7b 78 e8 ba 85 d0 00 4c 39 63 10 74 1f RSP: 0018:ff745a70c8253898 EFLAGS: 00000282 RAX: ff26e470054cb208 RBX: ff26e470054cb208 RCX: 000000006e958966 RDX: ff26e48267340000 RSI: ff745a70c82539b0 RDI: ff26e458f74655c0 RBP: 000000006e958966 R08: 0000000000000180 R09: 9cd08d909b919a89 R10: ff26e458f74655c0 R11: 0000000000000000 R12: ff26e458f74655c0 R13: ff745a70c82539b0 R14: d0d0d0d0d0d0d0d0 R15: 2f2f2f2f2f2f2f2f FS: 00007f5770896980(0000) GS:ff26e482c5d88000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5764de50c0 CR3: 000000a72abb5001 CR4: 0000000000771ef0 PKRU: 55555554 Call Trace: lookup_fast+0x9f/0x100 walk_component+0x1f/0x150 link_path_walk+0x20e/0x3d0 path_lookupat+0x68/0x180 filename_lookup+0xdc/0x1e0 vfs_statx+0x6c/0x140 vfs_fstatat+0x67/0xa0 __do_sys_newfstatat+0x24/0x60 do_syscall_64+0x6a/0x230 entry_SYSCALL_64_after_hwframe+0x76/0x7e This is reachable with reused cached negative dentries. 
A Ceph lookup or atomic_open can be handed a negative dentry that is already hashed, and fs/ceph/dir.c then hits one of two paths that incorrectly assume "negative" also means "unhashed": - ceph_finish_lookup(): MDS reply is -ENOENT with no trace -> d_add(dentry, NULL) - ceph_lookup(): local ENOENT fast path for a complete directory with shared caps -> d_add(dentry, NULL) Both paths can therefore re-add an already-hashed negative dentry. Ceph already uses the correct pattern elsewhere: ceph_fill_trace() only calls d_add(dn, NULL) for a negative null-dentry reply when d_unhashed(dn) is true. Fix both fs/ceph/dir.c sites the same way: only call d_add() for a negative dentry when it is actually unhashed. If the negative dentry is already hashed, leave it in place and reuse it as-is. This preserves the existing behavior for unhashed dentries while avoiding d_hash list corruption for reused hashed negatives. Cc: stable@vger.kernel.org Fixes: 2817b000b02c ("ceph: directory operations") Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index bac9cfb6b982..27ce9e55e947 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -769,7 +769,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_drop(dentry); err = -ENOENT; } else { - d_add(dentry, NULL); + if (d_unhashed(dentry)) + d_add(dentry, NULL); } } } @@ -840,7 +841,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, spin_unlock(&ci->i_ceph_lock); doutc(cl, " dir %llx.%llx complete, -ENOENT\n", ceph_vinop(dir)); - d_add(dentry, NULL); + if (d_unhashed(dentry)) + d_add(dentry, NULL); di->lease_shared_gen = atomic_read(&ci->i_shared_gen); return NULL; } From cc5643095419d45927a1dee9cb3da7c2f9e779f6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 30 Mar 2026 10:43:19 +0200 Subject: [PATCH 06/12] ceph: clear 
s_cap_reconnect when ceph_pagelist_encode_32() fails This MDS reconnect error path leaves s_cap_reconnect set. send_mds_reconnect() sets the bit at the beginning of the reconnect, but the first failing operation after that, ceph_pagelist_encode_32(), can jump to `fail:` without clearing it. __ceph_remove_cap() consults that flag to decide whether cap releases should be queued. A reconnect-preparation failure therefore leaves the session in reconnect mode from the cap-release path's point of view and can strand release work until some later state transition repairs it. Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b1746273f186..4fa471d9b3b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4956,7 +4956,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, /* placeholder for nr_caps */ err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) - goto fail; + goto fail_clear_cap_reconnect; if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; @@ -5046,6 +5046,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, ceph_pagelist_release(recon_state.pagelist); return; +fail_clear_cap_reconnect: + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); From 3a2e519cd4332576989c0985b3e61ac08eb2b458 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Mon, 30 Mar 2026 13:46:53 -0700 Subject: [PATCH 07/12] crush: cleanup in crush_do_rule() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 41ebcc0907c5 ("crush: remove forcefeed functionality") from May 7, 2012 (linux-next), leads to the following Smatch static checker warning: 
net/ceph/crush/mapper.c:1015 crush_do_rule() warn: iterator 'j' not incremented Before commit 41ebcc0907c5 ("crush: remove forcefeed functionality"), we had this logic: j = 0; if (osize == 0 && force_pos >= 0) { o[osize] = force_context[force_pos]; if (recurse_to_leaf) c[osize] = force_context[0]; j++; /* <-- this was the only increment, now gone */ force_pos--; } /* then crush_choose_*(..., o+osize, j, ...) */ Now, the variable j is dead code — a variable that is set and never meaningfully varied. This patch simply removes the dead code. Reported-by: Dan Carpenter Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- net/ceph/crush/mapper.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 3a5bd1cd1e99..17b041779fb9 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -911,7 +911,7 @@ int crush_do_rule(const struct crush_map *map, int osize; const struct crush_rule *rule; __u32 step; - int i, j; + int i; int numrep; int out_size; /* @@ -1012,7 +1012,6 @@ int crush_do_rule(const struct crush_map *map, if (numrep <= 0) continue; } - j = 0; /* make sure bucket id is valid */ bno = -1 - w[i]; if (bno < 0 || bno >= map->max_buckets) { @@ -1036,7 +1035,7 @@ int crush_do_rule(const struct crush_map *map, weight, weight_max, x, numrep, curstep->arg2, - o+osize, j, + o+osize, 0, result_max-osize, choose_tries, recurse_tries, @@ -1058,7 +1057,7 @@ int crush_do_rule(const struct crush_map *map, weight, weight_max, x, out_size, numrep, curstep->arg2, - o+osize, j, + o+osize, 0, choose_tries, choose_leaf_tries ? choose_leaf_tries : 1, From d1fef92e414433ca7b89abf85cb0df42b8d475eb Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Sun, 19 Apr 2026 17:03:48 +0800 Subject: [PATCH 08/12] rbd: fix null-ptr-deref when device_add_disk() fails do_rbd_add() publishes the device with device_add() before calling device_add_disk(). 
If device_add_disk() fails after device_add() succeeds, the error path calls rbd_free_disk() directly and then later falls through to rbd_dev_device_release(), which calls rbd_free_disk() again. This double teardown can leave blk-mq cleanup operating on invalid state and trigger a null-ptr-deref in __blk_mq_free_map_and_rqs(), reached from blk_mq_free_tag_set(). Fix this by following the normal remove ordering: call device_del() before rbd_dev_device_release() when device_add_disk() fails after device_add(). That keeps the teardown sequence consistent and avoids re-entering disk cleanup through the wrong path. The bug was first flagged by an experimental analysis tool we are developing for kernel memory-management bugs while analyzing v6.13-rc1. The tool is still under development and is not yet publicly available. We reproduced the bug on v7.0 with a real Ceph backend and a QEMU x86_64 guest booted with KASAN and CONFIG_FAILSLAB enabled. The reproducer confines failslab injections to the __add_disk() range and injects fail-nth while mapping an RBD image through /sys/bus/rbd/add_single_major. 
On the unpatched kernel, fail-nth=4 reliably triggered the fault: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 UID: 0 PID: 273 Comm: bash Not tainted 7.0.0-01247-gd60bc1401583 #6 PREEMPT(lazy) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:__blk_mq_free_map_and_rqs+0x8c/0x240 Code: 00 00 48 8b 6b 60 41 89 f4 49 c1 e4 03 4c 01 e5 45 85 ed 0f 85 0a 01 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 e9 48 c1 e9 03 <80> 3c 01 00 0f 85 31 01 00 00 4c 8b 6d 00 4d 85 ed 0f 84 e2 00 00 RSP: 0018:ff1100000ab0fac8 EFLAGS: 00000246 RAX: dffffc0000000000 RBX: ff1100000c4806a0 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000000000 RDI: ff1100000c4806f4 RBP: 0000000000000000 R08: 0000000000000001 R09: ffe21c000189001b R10: ff1100000c4800df R11: ff1100006cf37be0 R12: 0000000000000000 R13: 0000000000000000 R14: ff1100000c480700 R15: ff1100000c480004 FS: 00007f0fbe8fe740(0000) GS:ff110000e5851000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe53473b2e0 CR3: 0000000012eef000 CR4: 00000000007516f0 PKRU: 55555554 Call Trace: blk_mq_free_tag_set+0x77/0x460 do_rbd_add+0x1446/0x2b80 ? __pfx_do_rbd_add+0x10/0x10 ? lock_acquire+0x18c/0x300 ? find_held_lock+0x2b/0x80 ? sysfs_file_kobj+0xb6/0x1b0 ? __pfx_sysfs_kf_write+0x10/0x10 kernfs_fop_write_iter+0x2f4/0x4a0 vfs_write+0x98e/0x1000 ? expand_files+0x51f/0x850 ? __pfx_vfs_write+0x10/0x10 ksys_write+0xf2/0x1d0 ? 
__pfx_ksys_write+0x10/0x10 do_syscall_64+0x115/0x690 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f0fbea15907 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffe22346ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000058 RCX: 00007f0fbea15907 RDX: 0000000000000058 RSI: 0000563ace6c0ef0 RDI: 0000000000000001 RBP: 0000563ace6c0ef0 R08: 0000563ace6c0ef0 R09: 6b6435726d694141 R10: 5250337279762f78 R11: 0000000000000246 R12: 0000000000000058 R13: 00007f0fbeb1c780 R14: ff1100000c480700 R15: ff1100000c480004 With this fix applied, rerunning the reproducer over fail-nth=1..256 yields no KASAN reports. [ idryomov: rename err_out_device_del -> err_out_device ] Cc: stable@vger.kernel.org Fixes: 27c97abc30e2 ("rbd: add add_disk() error handling") Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e7da06200c1e..4065336ebd1f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7165,7 +7165,7 @@ static ssize_t do_rbd_add(const char *buf, size_t count) rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); if (rc) - goto err_out_cleanup_disk; + goto err_out_device; spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); @@ -7179,8 +7179,8 @@ static ssize_t do_rbd_add(const char *buf, size_t count) module_put(THIS_MODULE); return rc; -err_out_cleanup_disk: - rbd_free_disk(rbd_dev); +err_out_device: + device_del(&rbd_dev->dev); err_out_image_lock: rbd_dev_image_unlock(rbd_dev); rbd_dev_device_release(rbd_dev); From 1c439de70b1c3eb3c6bffa8245c16b9fc318f114 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 21 Apr 2026 10:27:01 +0200 
Subject: [PATCH 09/12] libceph: Fix slab-out-of-bounds access in auth message processing If a (potentially corrupted) message of type CEPH_MSG_AUTH_REPLY contains a positive value in its result field, it is treated as an error code by ceph_handle_auth_reply() and returned to handle_auth_reply(). Thereafter, an attempt is made to send the preallocated message of type CEPH_MSG_AUTH, where the returned value is interpreted as the size of the front segment to send. If the result value in the message is greater than the size of the memory buffer allocated for the front segment, an out-of-bounds access occurs, and the content of the memory region beyond this buffer is sent out. This patch fixes the issue by treating only negative values in the result field as errors. Positive values are therefore treated as success in the same way as a zero value. Additionally, a BUG_ON is added to __send_prepared_auth_request() comparing the len parameter to front_alloc_len to prevent sending the message if it exceeds the bounds of the allocation and to make it easier to catch any logic flaws leading to this. 
Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/auth.c | 2 +- net/ceph/mon_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 3314705e5914..17660bde896b 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -257,7 +257,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - if (result) { + if (result < 0) { pr_err("auth protocol '%s' mauth authentication failed: %d\n", ceph_auth_proto_name(ac->protocol), result); ret = result; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d5080530ce0c..d2cdc8ee3155 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -174,6 +174,8 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) */ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) { + BUG_ON(len > monc->m_auth->front_alloc_len); + monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); From e58103cafff2e3ee2196d6d3347fc47d6e0a047a Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 10 Feb 2026 09:06:24 +0000 Subject: [PATCH 10/12] ceph: handle InodeStat v8 versioned field in reply parsing Add forward-compatible handling for the new versioned field introduced in InodeStat v8. This patch only skips the field without using it, preparing for future protocol extensions. The v8 encoding adds a versioned sub-structure that needs to be properly decoded and skipped to maintain compatibility with newer MDS versions. 
Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4fa471d9b3b2..3b534e6522e7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -232,6 +232,26 @@ static int parse_reply_info_in(void **p, void *end, info->fscrypt_file_len, bad); } } + + /* + * InodeStat encoding versions: + * v1-v7: various fields added over time + * v8: added optmetadata (versioned sub-structure containing + * optional inode metadata like charmap for case-insensitive + * filesystems). The kernel client doesn't support + * case-insensitive lookups, so we skip this field. + * v9: added subvolume_id (parsed below) + */ + if (struct_v >= 8) { + u32 v8_struct_len; + + /* skip optmetadata versioned sub-structure */ + ceph_decode_skip_8(p, end, bad); /* struct_v */ + ceph_decode_skip_8(p, end, bad); /* struct_compat */ + ceph_decode_32_safe(p, end, v8_struct_len, bad); + ceph_decode_skip_n(p, end, v8_struct_len, bad); + } + *p = end; } else { /* legacy (unversioned) struct */ From 4a1c5434792df72c4df6225fb697494a2405a137 Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 10 Feb 2026 09:06:25 +0000 Subject: [PATCH 11/12] ceph: parse subvolume_id from InodeStat v9 and store in inode Add support for parsing the subvolume_id field from InodeStat v9 and storing it in the inode for later use by subvolume metrics tracking. The subvolume_id identifies which CephFS subvolume an inode belongs to, enabling per-subvolume I/O metrics collection and reporting. 
This patch: - Adds subvolume_id field to struct ceph_mds_reply_info_in - Adds i_subvolume_id field to struct ceph_inode_info - Parses subvolume_id from v9 InodeStat in parse_reply_info_in() - Adds ceph_inode_set_subvolume() helper to propagate the ID to inodes - Initializes i_subvolume_id in inode allocation and clears on destroy Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 38 ++++++++++++++++++++++++-------------- fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 10 ++++++++++ 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d99e12d1100b..22c7da1ea61c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -638,6 +638,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_max_bytes = 0; ci->i_max_files = 0; + ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -742,6 +743,8 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) @@ -873,6 +876,40 @@ int ceph_fill_file_size(struct inode *inode, int issued, return queue_trunc; } +/* + * Set the subvolume ID for an inode. + * + * The subvolume_id identifies which CephFS subvolume this inode belongs to. + * CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset - the MDS only sends + * non-zero IDs for inodes within subvolumes. + * + * An inode's subvolume membership is immutable - once an inode is created + * in a subvolume, it stays there. Therefore, if we already have a valid + * (non-zero) subvolume_id and receive a different one, that indicates a bug. 
+ */ +void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id) +{ + struct ceph_inode_info *ci; + u64 old; + + if (!inode || subvolume_id == CEPH_SUBVOLUME_ID_NONE) + return; + + ci = ceph_inode(inode); + old = READ_ONCE(ci->i_subvolume_id); + + if (old == subvolume_id) + return; + + if (old != CEPH_SUBVOLUME_ID_NONE) { + /* subvolume_id should not change once set */ + WARN_ON_ONCE(1); + return; + } + + WRITE_ONCE(ci->i_subvolume_id, subvolume_id); +} + void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime) @@ -1076,6 +1113,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, new_issued = ~issued & info_caps; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); + ceph_inode_set_subvolume(inode, iinfo->subvolume_id); #ifdef CONFIG_FS_ENCRYPTION if (iinfo->fscrypt_auth_len && @@ -1583,6 +1621,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) goto done; } if (parent_dir) { + ceph_inode_set_subvolume(parent_dir, + rinfo->diri.subvolume_id); err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); @@ -1671,6 +1711,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!req->r_target_inode); in = req->r_target_inode; + ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id); err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3b534e6522e7..267bd37eb608 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -96,19 +96,19 @@ static int parse_reply_info_quota(void **p, void *end, return -EIO; } -/* - * parse individual inode info - */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - u64 features) + u64 features, + struct 
ceph_mds_client *mdsc) { int err = 0; u8 struct_v = 0; + u8 struct_compat = 0; + u32 struct_len = 0; + + info->subvolume_id = CEPH_SUBVOLUME_ID_NONE; if (features == (u64)-1) { - u32 struct_len; - u8 struct_compat; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand @@ -252,6 +252,10 @@ static int parse_reply_info_in(void **p, void *end, ceph_decode_skip_n(p, end, v8_struct_len, bad); } + /* struct_v 9 added subvolume_id */ + if (struct_v >= 9) + ceph_decode_64_safe(p, end, info->subvolume_id, bad); + *p = end; } else { /* legacy (unversioned) struct */ @@ -384,12 +388,13 @@ static int parse_reply_info_lease(void **p, void *end, */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - u64 features) + u64 features, + struct ceph_mds_client *mdsc) { int err; if (info->head->is_dentry) { - err = parse_reply_info_in(p, end, &info->diri, features); + err = parse_reply_info_in(p, end, &info->diri, features, mdsc); if (err < 0) goto out_bad; @@ -409,7 +414,8 @@ static int parse_reply_info_trace(void **p, void *end, } if (info->head->is_target) { - err = parse_reply_info_in(p, end, &info->targeti, features); + err = parse_reply_info_in(p, end, &info->targeti, features, + mdsc); if (err < 0) goto out_bad; } @@ -430,7 +436,8 @@ static int parse_reply_info_trace(void **p, void *end, */ static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_request *req, - u64 features) + u64 features, + struct ceph_mds_client *mdsc) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_client *cl = req->r_mdsc->fsc->client; @@ -545,7 +552,7 @@ static int parse_reply_info_readdir(void **p, void *end, rde->name_len = oname.len; /* inode */ - err = parse_reply_info_in(p, end, &rde->inode, features); + err = parse_reply_info_in(p, end, &rde->inode, features, mdsc); if (err < 0) goto out_bad; /* ceph_readdir_prepopulate() 
will update it */ @@ -753,7 +760,8 @@ static int parse_reply_info_extra(void **p, void *end, if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_readdir(p, end, req, features); + return parse_reply_info_readdir(p, end, req, features, + req->r_mdsc); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) @@ -782,7 +790,8 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_trace(&p, p+len, info, features); + err = parse_reply_info_trace(&p, p + len, info, features, + s->s_mdsc); if (err < 0) goto out_bad; } @@ -791,7 +800,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, req, features, s); + err = parse_reply_info_extra(&p, p + len, req, features, s); if (err < 0) goto out_bad; } @@ -3989,6 +3998,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out_err; } req->r_target_inode = in; + ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id); } mutex_lock(&session->s_mutex); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0428a5eaf28c..bd3690baa65c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -118,6 +118,7 @@ struct ceph_mds_reply_info_in { u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; + u64 subvolume_id; }; struct ceph_mds_reply_dir_entry { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 29a980e22dc2..cd5f71061264 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -398,6 +398,15 @@ struct ceph_inode_info { /* quotas */ u64 i_max_bytes, i_max_files; + /* + * Subvolume ID 
this inode belongs to. CEPH_SUBVOLUME_ID_NONE (0) + * means unknown/unset, matching the FUSE client convention. + * Once set to a valid (non-zero) value, it should not change + * during the inode's lifetime. + */ +#define CEPH_SUBVOLUME_ID_NONE 0 + u64 i_subvolume_id; + s32 i_dir_pin; struct rb_root i_fragtree; @@ -1069,6 +1078,7 @@ extern struct inode *ceph_get_inode(struct super_block *sb, extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); +extern void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id); extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, From b1137e0b3d4bad1cad73fa9bac763c74ddd1813d Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 10 Feb 2026 09:06:26 +0000 Subject: [PATCH 12/12] ceph: add subvolume metrics collection and reporting Add complete infrastructure for per-subvolume I/O metrics collection and reporting to the MDS. This enables administrators to monitor I/O patterns at the subvolume granularity, which is useful for multi-tenant CephFS deployments. 
This patch adds:
- CEPHFS_FEATURE_SUBVOLUME_METRICS feature flag for MDS negotiation
- CEPH_SUBVOLUME_ID_NONE constant (0) for unknown/unset state
- Red-black-tree-based metrics tracker for efficient per-subvolume
  aggregation with kmem_cache for entry allocations
- Wire format encoding matching the MDS C++ AggregatedIOMetrics struct
- Integration with the existing CLIENT_METRICS message
- Recording of I/O operations from file read/write and writeback paths
- Debugfs interfaces for monitoring (metrics/subvolumes,
  metrics/metric_features)

Metrics tracked per subvolume include:
- Read/write operation counts
- Read/write byte counts
- Read/write latency sums (for average calculation)

The metrics are periodically sent to the MDS as part of the existing
metrics reporting infrastructure when the MDS advertises support for the
SUBVOLUME_METRICS feature.

CEPH_SUBVOLUME_ID_NONE is used to enforce subvolume_id immutability:
following the FUSE client convention, 0 means unknown/unset, and once an
inode has a valid (non-zero) subvolume_id, it should not change during
the inode's lifetime.

Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 2 +- fs/ceph/addr.c | 14 ++ fs/ceph/debugfs.c | 157 ++++++++++++++ fs/ceph/file.c | 68 +++++- fs/ceph/mds_client.c | 34 ++- fs/ceph/mds_client.h | 13 +- fs/ceph/metric.c | 183 +++++++++++++++- fs/ceph/metric.h | 39 +++- fs/ceph/subvolume_metrics.c | 416 ++++++++++++++++++++++++++++++++++++ fs/ceph/subvolume_metrics.h | 97 +++++++++ fs/ceph/super.c | 8 + fs/ceph/super.h | 1 + 12 files changed, 1018 insertions(+), 14 deletions(-) create mode 100644 fs/ceph/subvolume_metrics.c create mode 100644 fs/ceph/subvolume_metrics.h diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 1f77ca04c426..ebb29d11ac22 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ - debugfs.o util.o metric.o + debugfs.o util.o metric.o subvolume_metrics.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 44553556ac74..5a4ad6a0d270 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -19,6 +19,7 @@ #include "mds_client.h" #include "cache.h" #include "metric.h" +#include "subvolume_metrics.h" #include "crypto.h" #include #include @@ -259,6 +260,10 @@ static void finish_netfs_read(struct ceph_osd_request *req) osd_data->length), false); } if (err > 0) { + ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode), + false, err, + req->r_start_latency, + req->r_end_latency); subreq->transferred = err; err = 0; } @@ -823,6 +828,10 @@ static int write_folio_nounlock(struct folio *folio, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); + if (err >= 0 && len > 0) + ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len, + req->r_start_latency, + 
req->r_end_latency); fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) @@ -963,6 +972,11 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, rc); + if (rc >= 0 && len > 0) + ceph_subvolume_metrics_record_io(mdsc, ci, true, len, + req->r_start_latency, + req->r_end_latency); + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); osd_data = osd_req_op_extent_osd_data(req, 0); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7dc307790240..e2463f93cf6b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -9,11 +9,13 @@ #include #include #include +#include #include #include #include #include +#include #include "super.h" @@ -21,6 +23,36 @@ #include "mds_client.h" #include "metric.h" +#include "subvolume_metrics.h" + +/** + * struct ceph_session_feature_desc - Maps feature bits to names for debugfs + * @bit: Feature bit number from enum ceph_feature_type (see mds_client.h) + * @name: Human-readable feature name for debugfs output + * + * Used by metric_features_show() to display negotiated session features. 
+ */ +struct ceph_session_feature_desc { + unsigned int bit; + const char *name; +}; + +static const struct ceph_session_feature_desc ceph_session_feature_table[] = { + { CEPHFS_FEATURE_METRIC_COLLECT, "METRIC_COLLECT" }, + { CEPHFS_FEATURE_REPLY_ENCODING, "REPLY_ENCODING" }, + { CEPHFS_FEATURE_RECLAIM_CLIENT, "RECLAIM_CLIENT" }, + { CEPHFS_FEATURE_LAZY_CAP_WANTED, "LAZY_CAP_WANTED" }, + { CEPHFS_FEATURE_MULTI_RECONNECT, "MULTI_RECONNECT" }, + { CEPHFS_FEATURE_DELEG_INO, "DELEG_INO" }, + { CEPHFS_FEATURE_ALTERNATE_NAME, "ALTERNATE_NAME" }, + { CEPHFS_FEATURE_NOTIFY_SESSION_STATE, "NOTIFY_SESSION_STATE" }, + { CEPHFS_FEATURE_OP_GETVXATTR, "OP_GETVXATTR" }, + { CEPHFS_FEATURE_32BITS_RETRY_FWD, "32BITS_RETRY_FWD" }, + { CEPHFS_FEATURE_NEW_SNAPREALM_INFO, "NEW_SNAPREALM_INFO" }, + { CEPHFS_FEATURE_HAS_OWNER_UIDGID, "HAS_OWNER_UIDGID" }, + { CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, "MDS_AUTH_CAPS_CHECK" }, + { CEPHFS_FEATURE_SUBVOLUME_METRICS, "SUBVOLUME_METRICS" }, +}; static int mdsmap_show(struct seq_file *s, void *p) { @@ -360,6 +392,59 @@ static int status_show(struct seq_file *s, void *p) return 0; } +static int subvolume_metrics_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_subvol_metric_snapshot *snapshot = NULL; + u32 nr = 0; + u64 total_sent = 0; + u64 nonzero_sends = 0; + u32 i; + + if (!mdsc) { + seq_puts(s, "mds client unavailable\n"); + return 0; + } + + mutex_lock(&mdsc->subvol_metrics_last_mutex); + if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) { + nr = mdsc->subvol_metrics_last_nr; + snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr, + sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) + nr = 0; + } + total_sent = mdsc->subvol_metrics_sent; + nonzero_sends = mdsc->subvol_metrics_nonzero_sends; + mutex_unlock(&mdsc->subvol_metrics_last_mutex); + + seq_puts(s, "Last sent subvolume metrics:\n"); + if (!nr) { + seq_puts(s, " (none)\n"); + } else { + 
seq_puts(s, " subvol_id rd_ops wr_ops rd_bytes wr_bytes rd_lat_us wr_lat_us\n"); + for (i = 0; i < nr; i++) { + const struct ceph_subvol_metric_snapshot *e = &snapshot[i]; + + seq_printf(s, " %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n", + e->subvolume_id, + e->read_ops, e->write_ops, + e->read_bytes, e->write_bytes, + e->read_latency_us, e->write_latency_us); + } + } + kfree(snapshot); + + seq_puts(s, "\nStatistics:\n"); + seq_printf(s, " entries_sent: %llu\n", total_sent); + seq_printf(s, " non_zero_sends: %llu\n", nonzero_sends); + + seq_puts(s, "\nPending (unsent) subvolume metrics:\n"); + ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s); + return 0; +} + DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); @@ -369,7 +454,72 @@ DEFINE_SHOW_ATTRIBUTE(metrics_file); DEFINE_SHOW_ATTRIBUTE(metrics_latency); DEFINE_SHOW_ATTRIBUTE(metrics_size); DEFINE_SHOW_ATTRIBUTE(metrics_caps); +DEFINE_SHOW_ATTRIBUTE(subvolume_metrics); +static int metric_features_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned long session_features = 0; + bool have_session = false; + bool metric_collect = false; + bool subvol_support = false; + bool metrics_enabled = false; + bool subvol_enabled = false; + int i; + + if (!mdsc) { + seq_puts(s, "mds client unavailable\n"); + return 0; + } + + mutex_lock(&mdsc->mutex); + if (mdsc->metric.session) { + have_session = true; + session_features = mdsc->metric.session->s_features; + } + mutex_unlock(&mdsc->mutex); + + if (have_session) { + metric_collect = + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session_features); + subvol_support = + test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, + &session_features); + } + + metrics_enabled = !disable_send_metrics && have_session && metric_collect; + subvol_enabled = metrics_enabled && subvol_support; + + seq_printf(s, + "metrics_enabled: %s (disable_send_metrics=%d, session=%s, 
metric_collect=%s)\n", + metrics_enabled ? "yes" : "no", + disable_send_metrics ? 1 : 0, + have_session ? "yes" : "no", + metric_collect ? "yes" : "no"); + seq_printf(s, "subvolume_metrics_enabled: %s\n", + subvol_enabled ? "yes" : "no"); + seq_printf(s, "session_feature_bits: 0x%lx\n", session_features); + + if (!have_session) { + seq_puts(s, "(no active MDS session for metrics)\n"); + return 0; + } + + for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) { + const struct ceph_session_feature_desc *desc = + &ceph_session_feature_table[i]; + bool set = test_bit(desc->bit, &session_features); + + seq_printf(s, " %-24s : %s\n", desc->name, + set ? "yes" : "no"); + } + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(metric_features); /* * debugfs @@ -404,6 +554,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_subvolume_metrics); debugfs_remove_recursive(fsc->debugfs_metrics_dir); doutc(fsc->client, "done\n"); } @@ -468,6 +619,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) &metrics_size_fops); debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_caps_fops); + debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir, + fsc, &metric_features_fops); + fsc->debugfs_subvolume_metrics = + debugfs_create_file("subvolumes", 0400, + fsc->debugfs_metrics_dir, fsc, + &subvolume_metrics_fops); doutc(fsc->client, "done\n"); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5e7c73a29aa3..d54d71669176 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -19,6 +19,25 @@ #include "cache.h" #include "io.h" #include "metric.h" +#include "subvolume_metrics.h" + +/* + * Record I/O for subvolume metrics tracking. + * + * Callers must ensure bytes > 0 for reads (ret > 0 check) to avoid counting + * EOF as an I/O operation. For writes, the condition is (ret >= 0 && len > 0). 
+ */ +static inline void ceph_record_subvolume_io(struct inode *inode, bool is_write, + ktime_t start, ktime_t end, + size_t bytes) +{ + if (!bytes) + return; + + ceph_subvolume_metrics_record_io(ceph_sb_to_mdsc(inode->i_sb), + ceph_inode(inode), + is_write, bytes, start, end); +} static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags) { @@ -1140,6 +1159,15 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, req->r_start_latency, req->r_end_latency, read_len, ret); + /* + * Only record subvolume metrics for actual bytes read. + * ret == 0 means EOF (no data), not an I/O operation. + */ + if (ret > 0) + ceph_record_subvolume_io(inode, false, + req->r_start_latency, + req->r_end_latency, + ret); if (ret > 0) objver = req->r_version; @@ -1385,12 +1413,23 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) /* r_start_latency == 0 means the request was not submitted */ if (req->r_start_latency) { - if (aio_req->write) + if (aio_req->write) { ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, len, rc); - else + if (rc >= 0 && len) + ceph_record_subvolume_io(inode, true, + req->r_start_latency, + req->r_end_latency, + len); + } else { ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, len, rc); + if (rc > 0) + ceph_record_subvolume_io(inode, false, + req->r_start_latency, + req->r_end_latency, + rc); + } } put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, @@ -1614,12 +1653,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ceph_osdc_start_request(req->r_osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (write) + if (write) { ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, len, ret); - else + if (ret >= 0 && len) + ceph_record_subvolume_io(inode, true, + req->r_start_latency, + req->r_end_latency, + len); + } else { ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, len, ret); + 
if (ret > 0) + ceph_record_subvolume_io(inode, false, + req->r_start_latency, + req->r_end_latency, + ret); + } size = i_size_read(inode); if (!write) { @@ -1872,6 +1922,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, req->r_start_latency, req->r_end_latency, read_len, ret); + if (ret > 0) + ceph_record_subvolume_io(inode, false, + req->r_start_latency, + req->r_end_latency, + ret); /* Ok if object is not already present */ if (ret == -ENOENT) { @@ -2036,6 +2091,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); + if (ret >= 0 && write_len) + ceph_record_subvolume_io(inode, true, + req->r_start_latency, + req->r_end_latency, + write_len); ceph_osdc_put_request(req); if (ret != 0) { doutc(cl, "osd write returned %d\n", ret); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 267bd37eb608..fa476497d41d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -68,6 +68,21 @@ static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; +static void ceph_metric_bind_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_session *old; + + if (!mdsc || !session || disable_send_metrics) + return; + + old = mdsc->metric.session; + mdsc->metric.session = ceph_get_mds_session(session); + if (old) + ceph_put_mds_session(old); + + metric_schedule_delayed(&mdsc->metric); +} /* * mds reply parsing @@ -4347,6 +4362,11 @@ static void handle_session(struct ceph_mds_session *session, } mdsc->s_cap_auths_num = cap_auths_num; mdsc->s_cap_auths = cap_auths; + + session->s_features = features; + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session->s_features)) + ceph_metric_bind_session(mdsc, session); } if (op == CEPH_SESSION_CLOSE) { ceph_get_mds_session(session); @@ -4373,7 +4393,11 @@ static void 
handle_session(struct ceph_mds_session *session, pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); - session->s_features = features; + if (test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, + &session->s_features)) + ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true); + else + ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, false); if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); @@ -5616,6 +5640,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; + ceph_subvolume_metrics_init(&mdsc->subvol_metrics); + mutex_init(&mdsc->subvol_metrics_last_mutex); + mdsc->subvol_metrics_last = NULL; + mdsc->subvol_metrics_last_nr = 0; + mdsc->subvol_metrics_sent = 0; + mdsc->subvol_metrics_nonzero_sends = 0; spin_lock_init(&mdsc->dentry_list_lock); INIT_LIST_HEAD(&mdsc->dentry_leases); @@ -6149,6 +6179,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_mdsc_stop(mdsc); ceph_metric_destroy(&mdsc->metric); + ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics); + kfree(mdsc->subvol_metrics_last); fsc->mdsc = NULL; kfree(mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index bd3690baa65c..4e6c87f8414c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -18,6 +18,7 @@ #include "mdsmap.h" #include "metric.h" +#include "subvolume_metrics.h" #include "super.h" /* The first 8 bits are reserved for old ceph releases */ @@ -36,8 +37,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_NEW_SNAPREALM_INFO, CEPHFS_FEATURE_HAS_OWNER_UIDGID, CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, + CEPHFS_FEATURE_SUBVOLUME_METRICS, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_SUBVOLUME_METRICS, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -54,6 +56,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ 
CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ + CEPHFS_FEATURE_SUBVOLUME_METRICS, \ } /* @@ -537,6 +540,14 @@ struct ceph_mds_client { struct list_head dentry_dir_leases; /* lru list */ struct ceph_client_metric metric; + struct ceph_subvolume_metrics_tracker subvol_metrics; + + /* Subvolume metrics send tracking */ + struct mutex subvol_metrics_last_mutex; + struct ceph_subvol_metric_snapshot *subvol_metrics_last; + u32 subvol_metrics_last_nr; + u64 subvol_metrics_sent; + u64 subvol_metrics_nonzero_sends; spinlock_t snapid_map_lock; struct rb_root snapid_map_tree; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 871c1090e520..b6450fdace94 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -4,10 +4,84 @@ #include #include #include +#include + +#include #include "metric.h" #include "mds_client.h" +static bool metrics_disable_warned; + +static inline u32 ceph_subvolume_entry_payload_len(void) +{ + return sizeof(struct ceph_subvolume_metric_entry_wire); +} + +static inline u32 ceph_subvolume_entry_encoded_len(void) +{ + return CEPH_ENCODING_START_BLK_LEN + + ceph_subvolume_entry_payload_len(); +} + +static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols) +{ + /* count is encoded as le64 (size_t on wire) to match FUSE client */ + return sizeof(__le64) + + nr_subvols * ceph_subvolume_entry_encoded_len(); +} + +static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols) +{ + return CEPH_ENCODING_START_BLK_LEN + + ceph_subvolume_outer_payload_len(nr_subvols); +} + +static inline u32 ceph_subvolume_clamp_u32(u64 val) +{ + return val > U32_MAX ? 
U32_MAX : (u32)val; +} + +static void ceph_init_subvolume_wire_entry( + struct ceph_subvolume_metric_entry_wire *dst, + const struct ceph_subvol_metric_snapshot *src) +{ + dst->subvolume_id = cpu_to_le64(src->subvolume_id); + dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops)); + dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops)); + dst->read_bytes = cpu_to_le64(src->read_bytes); + dst->write_bytes = cpu_to_le64(src->write_bytes); + dst->read_latency_us = cpu_to_le64(src->read_latency_us); + dst->write_latency_us = cpu_to_le64(src->write_latency_us); + dst->time_stamp = 0; +} + +static int ceph_encode_subvolume_metrics(void **p, void *end, + struct ceph_subvol_metric_snapshot *subvols, + u32 nr_subvols) +{ + u32 i; + + ceph_start_encoding(p, 1, 1, + ceph_subvolume_outer_payload_len(nr_subvols)); + /* count is encoded as le64 (size_t on wire) to match FUSE client */ + ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err); + + for (i = 0; i < nr_subvols; i++) { + struct ceph_subvolume_metric_entry_wire wire_entry; + + ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]); + ceph_start_encoding(p, 1, 1, + ceph_subvolume_entry_payload_len()); + ceph_encode_copy_safe(p, end, &wire_entry, + sizeof(wire_entry), enc_err); + } + + return 0; +enc_err: + return -ERANGE; +} + static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) { struct timespec64 t = ktime_to_timespec64(val); @@ -29,10 +103,14 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_read_io_size *rsize; struct ceph_write_io_size *wsize; struct ceph_client_metric *m = &mdsc->metric; + struct ceph_subvol_metric_snapshot *subvols = NULL; u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; + u32 nr_subvols = 0; + size_t subvol_len = 0; + void *cursor; s64 sum; s32 items = 0; s32 len; @@ -45,15 +123,42 @@ static bool 
ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, } mutex_unlock(&mdsc->mutex); + if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && + test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) { + int ret; + + ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics, + &subvols, &nr_subvols, + true); + if (ret) { + pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n", + ret); + /* + * On error, ceph_subvolume_metrics_snapshot() guarantees + * *out = NULL and *nr = 0 at function entry, so subvols + * is already NULL here - no cleanup needed. + */ + nr_subvols = 0; + subvols = NULL; + } + } + + if (nr_subvols) { + /* type (le32) + ENCODE_START payload - no metric header */ + subvol_len = sizeof(__le32) + + ceph_subvolume_metric_data_len(nr_subvols); + } + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) - + sizeof(*wsize); + + sizeof(*wsize) + subvol_len; msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { pr_err_client(cl, "to mds%d, failed to allocate message\n", s->s_mds); + kfree(subvols); return false; } @@ -172,13 +277,56 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); items++; + cursor = wsize + 1; + + if (nr_subvols) { + void *payload; + void *payload_end; + int ret; + + /* Emit only the type (le32), no ver/compat/data_len */ + ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS); + items++; + + payload = cursor; + payload_end = (char *)payload + + ceph_subvolume_metric_data_len(nr_subvols); + + ret = ceph_encode_subvolume_metrics(&payload, payload_end, + subvols, nr_subvols); + if (ret) { + pr_warn_client(cl, + "failed to encode subvolume metrics\n"); + kfree(subvols); + ceph_msg_put(msg); + return false; + } + + WARN_ON(payload != payload_end); + cursor = payload; + } + 
put_unaligned_le32(items, &head->num); - msg->front.iov_len = len; + msg->front.iov_len = (char *)cursor - (char *)head; msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_con_send(&s->s_con, msg); + if (nr_subvols) { + mutex_lock(&mdsc->subvol_metrics_last_mutex); + kfree(mdsc->subvol_metrics_last); + mdsc->subvol_metrics_last = subvols; + mdsc->subvol_metrics_last_nr = nr_subvols; + mdsc->subvol_metrics_sent += nr_subvols; + mdsc->subvol_metrics_nonzero_sends++; + mutex_unlock(&mdsc->subvol_metrics_last_mutex); + + subvols = NULL; + } + kfree(subvols); + return true; } @@ -198,9 +346,20 @@ static void metric_get_session(struct ceph_mds_client *mdsc) * Skip it if MDS doesn't support the metric collection, * or the MDS will close the session's socket connection * directly when it get this message. + * + * Also skip sessions that don't support SUBVOLUME_METRICS + * when subvolume metrics collection is enabled. This ensures + * we only send subvolume metrics to MDSs that understand them. + * If no session supports the feature, metrics won't be sent. 
*/ if (check_session_state(s) && test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { + if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && + !test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, + &s->s_features)) { + ceph_put_mds_session(s); + continue; + } mdsc->metric.session = s; break; } @@ -217,9 +376,18 @@ static void metric_delayed_work(struct work_struct *work) struct ceph_mds_client *mdsc = container_of(m, struct ceph_mds_client, metric); - if (mdsc->stopping || disable_send_metrics) + if (mdsc->stopping) return; + if (disable_send_metrics) { + if (!metrics_disable_warned) { + pr_info("ceph: metrics sending disabled via module parameter\n"); + metrics_disable_warned = true; + } + return; + } + metrics_disable_warned = false; + if (!m->session || !check_session_state(m->session)) { if (m->session) { ceph_put_mds_session(m->session); @@ -227,10 +395,13 @@ static void metric_delayed_work(struct work_struct *work) } metric_get_session(mdsc); } - if (m->session) { + + if (m->session) ceph_mdsc_send_metrics(mdsc, m->session); - metric_schedule_delayed(m); - } + else + pr_warn_ratelimited("ceph: metrics worker has no MDS session\n"); + + metric_schedule_delayed(m); } int ceph_metric_init(struct ceph_client_metric *m) diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 0d0c44bd3332..519cd4d47aaa 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -25,8 +25,9 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, }; /* @@ -50,6 +51,7 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -139,6 +141,41 @@ struct 
ceph_write_io_size { __le64 total_size; } __packed; +/** + * struct ceph_subvolume_metric_entry_wire - On-wire format sent to MDS + * @subvolume_id: Subvolume identifier + * @read_ops: Read operation count (32-bit, clamped from 64-bit internal) + * @write_ops: Write operation count (32-bit, clamped from 64-bit internal) + * @read_bytes: Total bytes read + * @write_bytes: Total bytes written + * @read_latency_us: Cumulative read latency in microseconds + * @write_latency_us: Cumulative write latency in microseconds + * @time_stamp: Collection timestamp (currently unused, set to 0) + * + * Wire format must match C++ AggregatedIOMetrics struct in MDS. + */ +struct ceph_subvolume_metric_entry_wire { + __le64 subvolume_id; + __le32 read_ops; + __le32 write_ops; + __le64 read_bytes; + __le64 write_bytes; + __le64 read_latency_us; + __le64 write_latency_us; + __le64 time_stamp; +} __packed; + +/* Old struct kept for internal tracking, not used on wire */ +struct ceph_subvolume_metric_entry { + __le64 subvolume_id; + __le64 read_ops; + __le64 write_ops; + __le64 read_bytes; + __le64 write_bytes; + __le64 read_latency_us; + __le64 write_latency_us; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; diff --git a/fs/ceph/subvolume_metrics.c b/fs/ceph/subvolume_metrics.c new file mode 100644 index 000000000000..03fda1f9257b --- /dev/null +++ b/fs/ceph/subvolume_metrics.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include + +#include "subvolume_metrics.h" +#include "mds_client.h" +#include "super.h" + +/** + * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node + * @node: Red-black tree linkage for tracker->tree + * @subvolume_id: Subvolume identifier (key for rb-tree lookup) + * @read_ops: Accumulated read operation count since last snapshot + * @write_ops: Accumulated write operation count since last snapshot + * @read_bytes: Accumulated bytes read since 
last snapshot + * @write_bytes: Accumulated bytes written since last snapshot + * @read_latency_us: Sum of read latencies in microseconds + * @write_latency_us: Sum of write latencies in microseconds + */ +struct ceph_subvol_metric_rb_entry { + struct rb_node node; + u64 subvolume_id; + u64 read_ops; + u64 write_ops; + u64 read_bytes; + u64 write_bytes; + u64 read_latency_us; + u64 write_latency_us; +}; + +static struct kmem_cache *ceph_subvol_metric_entry_cachep; + +void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker) +{ + spin_lock_init(&tracker->lock); + tracker->tree = RB_ROOT_CACHED; + tracker->nr_entries = 0; + tracker->enabled = false; + atomic64_set(&tracker->snapshot_attempts, 0); + atomic64_set(&tracker->snapshot_empty, 0); + atomic64_set(&tracker->snapshot_failures, 0); + atomic64_set(&tracker->record_calls, 0); + atomic64_set(&tracker->record_disabled, 0); + atomic64_set(&tracker->record_no_subvol, 0); + atomic64_set(&tracker->total_read_ops, 0); + atomic64_set(&tracker->total_read_bytes, 0); + atomic64_set(&tracker->total_write_ops, 0); + atomic64_set(&tracker->total_write_bytes, 0); +} + +static struct ceph_subvol_metric_rb_entry * +__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id) +{ + struct rb_node *node; + + node = tracker->tree.rb_root.rb_node; + while (node) { + struct ceph_subvol_metric_rb_entry *entry = + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); + + if (subvol_id < entry->subvolume_id) + node = node->rb_left; + else if (subvol_id > entry->subvolume_id) + node = node->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ceph_subvol_metric_rb_entry * +__insert_entry(struct ceph_subvolume_metrics_tracker *tracker, + struct ceph_subvol_metric_rb_entry *entry) +{ + struct rb_node **link = &tracker->tree.rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + struct ceph_subvol_metric_rb_entry *cur = + rb_entry(*link, 
struct ceph_subvol_metric_rb_entry, node);
+
+		parent = *link;
+		if (entry->subvolume_id < cur->subvolume_id)
+			link = &(*link)->rb_left;
+		else if (entry->subvolume_id > cur->subvolume_id) {
+			link = &(*link)->rb_right;
+			leftmost = false;
+		} else
+			return cur;
+	}
+
+	rb_link_node(&entry->node, parent, link);
+	rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
+	tracker->nr_entries++;
+	return entry;
+}
+
+static void ceph_subvolume_metrics_clear_locked(
+		struct ceph_subvolume_metrics_tracker *tracker)
+{
+	struct rb_node *node = rb_first_cached(&tracker->tree);
+
+	while (node) {
+		struct ceph_subvol_metric_rb_entry *entry =
+			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
+		struct rb_node *next = rb_next(node);
+
+		rb_erase_cached(&entry->node, &tracker->tree);
+		tracker->nr_entries--;
+		kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
+		node = next;
+	}
+
+	tracker->tree = RB_ROOT_CACHED;
+}
+
+void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
+{
+	spin_lock(&tracker->lock);
+	ceph_subvolume_metrics_clear_locked(tracker);
+	tracker->enabled = false;
+	spin_unlock(&tracker->lock);
+}
+
+void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
+				   bool enable)
+{
+	spin_lock(&tracker->lock);
+	if (enable) {
+		tracker->enabled = true;
+	} else {
+		tracker->enabled = false;
+		ceph_subvolume_metrics_clear_locked(tracker);
+	}
+	spin_unlock(&tracker->lock);
+}
+
+void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
+				   u64 subvol_id, bool is_write,
+				   size_t size, u64 latency_us)
+{
+	struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
+	bool retry = false;
+
+	/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
+	if (!READ_ONCE(tracker->enabled) ||
+	    subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
+		return;
+
+	/*
+	 * Retry loop for the allocate-outside-the-lock pattern:
+	 * 1. 
First iteration: lookup under lock, if miss -> drop lock, alloc, retry + * 2. Second iteration: lookup again (may have been inserted), insert if still missing + * 3. On race (another thread inserted same key): free our alloc, retry + * All successful paths exit via return, so retry flag doesn't need reset. + */ + do { + spin_lock(&tracker->lock); + if (!tracker->enabled) { + spin_unlock(&tracker->lock); + if (new_entry) + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); + return; + } + + entry = __lookup_entry(tracker, subvol_id); + if (!entry) { + if (!new_entry) { + spin_unlock(&tracker->lock); + new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep, + GFP_NOFS); + if (!new_entry) + return; + new_entry->subvolume_id = subvol_id; + retry = true; + continue; + } + entry = __insert_entry(tracker, new_entry); + if (entry != new_entry) { + /* raced with another insert */ + spin_unlock(&tracker->lock); + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); + new_entry = NULL; + retry = true; + continue; + } + new_entry = NULL; + } + + if (is_write) { + entry->write_ops++; + entry->write_bytes += size; + entry->write_latency_us += latency_us; + atomic64_inc(&tracker->total_write_ops); + atomic64_add(size, &tracker->total_write_bytes); + } else { + entry->read_ops++; + entry->read_bytes += size; + entry->read_latency_us += latency_us; + atomic64_inc(&tracker->total_read_ops); + atomic64_add(size, &tracker->total_read_bytes); + } + spin_unlock(&tracker->lock); + if (new_entry) + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); + return; + } while (retry); +} + +int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker, + struct ceph_subvol_metric_snapshot **out, + u32 *nr, bool consume) +{ + struct ceph_subvol_metric_snapshot *snap = NULL; + struct rb_node *node; + u32 count = 0, idx = 0; + int ret = 0; + + *out = NULL; + *nr = 0; + + if (!READ_ONCE(tracker->enabled)) + return 0; + + 
atomic64_inc(&tracker->snapshot_attempts); + + spin_lock(&tracker->lock); + for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) { + struct ceph_subvol_metric_rb_entry *entry = + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); + + /* Include entries with ANY I/O activity (read OR write) */ + if (entry->read_ops || entry->write_ops) + count++; + } + spin_unlock(&tracker->lock); + + if (!count) { + atomic64_inc(&tracker->snapshot_empty); + return 0; + } + + snap = kcalloc(count, sizeof(*snap), GFP_NOFS); + if (!snap) { + atomic64_inc(&tracker->snapshot_failures); + return -ENOMEM; + } + + spin_lock(&tracker->lock); + node = rb_first_cached(&tracker->tree); + while (node) { + struct ceph_subvol_metric_rb_entry *entry = + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); + struct rb_node *next = rb_next(node); + + /* Skip entries with NO I/O activity at all */ + if (!entry->read_ops && !entry->write_ops) { + rb_erase_cached(&entry->node, &tracker->tree); + tracker->nr_entries--; + kmem_cache_free(ceph_subvol_metric_entry_cachep, entry); + node = next; + continue; + } + + if (idx >= count) { + pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n", + idx, count); + break; + } + + snap[idx].subvolume_id = entry->subvolume_id; + snap[idx].read_ops = entry->read_ops; + snap[idx].write_ops = entry->write_ops; + snap[idx].read_bytes = entry->read_bytes; + snap[idx].write_bytes = entry->write_bytes; + snap[idx].read_latency_us = entry->read_latency_us; + snap[idx].write_latency_us = entry->write_latency_us; + idx++; + + if (consume) { + entry->read_ops = 0; + entry->write_ops = 0; + entry->read_bytes = 0; + entry->write_bytes = 0; + entry->read_latency_us = 0; + entry->write_latency_us = 0; + rb_erase_cached(&entry->node, &tracker->tree); + tracker->nr_entries--; + kmem_cache_free(ceph_subvol_metric_entry_cachep, entry); + } + node = next; + } + spin_unlock(&tracker->lock); + + if (!idx) { + kfree(snap); + snap = NULL; + ret = 
0; + } else { + *nr = idx; + *out = snap; + } + + return ret; +} + +void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot) +{ + kfree(snapshot); +} + +/* + * Dump subvolume metrics to a seq_file for debugfs. + * + * Iterates the rb-tree directly under spinlock to avoid allocation. + * The lock hold time is minimal since we're only doing seq_printf calls. + */ +void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker, + struct seq_file *s) +{ + struct rb_node *node; + bool found = false; + + spin_lock(&tracker->lock); + if (!tracker->enabled) { + spin_unlock(&tracker->lock); + seq_puts(s, "subvolume metrics disabled\n"); + return; + } + + for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) { + struct ceph_subvol_metric_rb_entry *entry = + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); + u64 avg_rd_lat, avg_wr_lat; + + if (!entry->read_ops && !entry->write_ops) + continue; + + if (!found) { + seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n"); + seq_puts(s, "------------------------------------------------------------------------------------------------\n"); + found = true; + } + + avg_rd_lat = entry->read_ops ? + div64_u64(entry->read_latency_us, entry->read_ops) : 0; + avg_wr_lat = entry->write_ops ? 
+ div64_u64(entry->write_latency_us, entry->write_ops) : 0; + + seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n", + entry->subvolume_id, + entry->read_ops, + entry->read_bytes, + avg_rd_lat, + entry->write_ops, + entry->write_bytes, + avg_wr_lat); + } + spin_unlock(&tracker->lock); + + if (!found) + seq_puts(s, "(no subvolume metrics collected)\n"); +} + +void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci, + bool is_write, size_t bytes, + ktime_t start, ktime_t end) +{ + struct ceph_subvolume_metrics_tracker *tracker; + u64 subvol_id; + s64 delta_us; + + if (!mdsc || !ci || !bytes) + return; + + tracker = &mdsc->subvol_metrics; + atomic64_inc(&tracker->record_calls); + + if (!ceph_subvolume_metrics_enabled(tracker)) { + atomic64_inc(&tracker->record_disabled); + return; + } + + subvol_id = READ_ONCE(ci->i_subvolume_id); + if (subvol_id == CEPH_SUBVOLUME_ID_NONE) { + atomic64_inc(&tracker->record_no_subvol); + return; + } + + delta_us = ktime_to_us(ktime_sub(end, start)); + if (delta_us <= 0) + delta_us = 1; + + ceph_subvolume_metrics_record(tracker, subvol_id, is_write, + bytes, (u64)delta_us); +} + +int __init ceph_subvolume_metrics_cache_init(void) +{ + ceph_subvol_metric_entry_cachep = KMEM_CACHE(ceph_subvol_metric_rb_entry, + SLAB_RECLAIM_ACCOUNT); + if (!ceph_subvol_metric_entry_cachep) + return -ENOMEM; + return 0; +} + +void ceph_subvolume_metrics_cache_destroy(void) +{ + kmem_cache_destroy(ceph_subvol_metric_entry_cachep); +} diff --git a/fs/ceph/subvolume_metrics.h b/fs/ceph/subvolume_metrics.h new file mode 100644 index 000000000000..6f53ff726c75 --- /dev/null +++ b/fs/ceph/subvolume_metrics.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_SUBVOLUME_METRICS_H +#define _FS_CEPH_SUBVOLUME_METRICS_H + +#include +#include +#include +#include +#include + +struct seq_file; +struct ceph_mds_client; +struct ceph_inode_info; + +/** + * struct ceph_subvol_metric_snapshot - 
Point-in-time snapshot of subvolume metrics
+ * @subvolume_id: Subvolume identifier (inode number of subvolume root)
+ * @read_ops: Number of read operations since last snapshot
+ * @write_ops: Number of write operations since last snapshot
+ * @read_bytes: Total bytes read since last snapshot
+ * @write_bytes: Total bytes written since last snapshot
+ * @read_latency_us: Sum of read latencies in microseconds (for avg calculation)
+ * @write_latency_us: Sum of write latencies in microseconds (for avg calculation)
+ */
+struct ceph_subvol_metric_snapshot {
+	u64 subvolume_id;
+	u64 read_ops;
+	u64 write_ops;
+	u64 read_bytes;
+	u64 write_bytes;
+	u64 read_latency_us;
+	u64 write_latency_us;
+};
+
+/**
+ * struct ceph_subvolume_metrics_tracker - Tracks per-subvolume I/O metrics
+ * @lock: Protects @tree and @nr_entries during concurrent access
+ * @tree: Red-black tree of per-subvolume entries, keyed by subvolume_id
+ * @nr_entries: Number of entries currently in @tree
+ * @enabled: Whether collection is enabled (requires MDS feature support)
+ * @snapshot_attempts: Debug counter: snapshot calls made while enabled
+ * @snapshot_empty: Debug counter: snapshots that found no data to report
+ * @snapshot_failures: Debug counter: snapshots that failed to allocate memory
+ * @record_calls: Debug counter: total ceph_subvolume_metrics_record() calls
+ * @record_disabled: Debug counter: record calls skipped because disabled
+ * @record_no_subvol: Debug counter: record calls skipped (no subvolume_id)
+ * @total_read_ops: Cumulative read ops ever recorded (never reset)
+ * @total_read_bytes: Cumulative bytes read ever recorded (never reset)
+ * @total_write_ops: Cumulative write ops ever recorded (never reset)
+ * @total_write_bytes: Cumulative bytes written ever recorded (never reset)
+ */
+struct ceph_subvolume_metrics_tracker {
+	spinlock_t lock;
+	struct rb_root_cached tree;
+	u32 nr_entries;
+	bool enabled;
+	atomic64_t 
snapshot_attempts; + atomic64_t snapshot_empty; + atomic64_t snapshot_failures; + atomic64_t record_calls; + atomic64_t record_disabled; + atomic64_t record_no_subvol; + atomic64_t total_read_ops; + atomic64_t total_read_bytes; + atomic64_t total_write_ops; + atomic64_t total_write_bytes; +}; + +void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker); +void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker); +void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker, + bool enable); +void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker, + u64 subvol_id, bool is_write, + size_t size, u64 latency_us); +int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker, + struct ceph_subvol_metric_snapshot **out, + u32 *nr, bool consume); +void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot); +void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker, + struct seq_file *s); + +void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci, + bool is_write, size_t bytes, + ktime_t start, ktime_t end); + +static inline bool ceph_subvolume_metrics_enabled( + const struct ceph_subvolume_metrics_tracker *tracker) +{ + return READ_ONCE(tracker->enabled); +} + +int __init ceph_subvolume_metrics_cache_init(void); +void ceph_subvolume_metrics_cache_destroy(void); + +#endif /* _FS_CEPH_SUBVOLUME_METRICS_H */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2aed6b3359b6..c05fbd4237f8 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -21,6 +21,7 @@ #include "mds_client.h" #include "cache.h" #include "crypto.h" +#include "subvolume_metrics.h" #include #include @@ -966,8 +967,14 @@ static int __init init_caches(void) if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; + error = ceph_subvolume_metrics_cache_init(); + if (error) + goto bad_subvol_metrics; + return 0; 
+bad_subvol_metrics: + mempool_destroy(ceph_wb_pagevec_pool); bad_pagevec_pool: kmem_cache_destroy(ceph_mds_request_cachep); bad_mds_req: @@ -1004,6 +1011,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); mempool_destroy(ceph_wb_pagevec_pool); + ceph_subvolume_metrics_cache_destroy(); } static void __ceph_umount_begin(struct ceph_fs_client *fsc) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cd5f71061264..afc89ce91804 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -179,6 +179,7 @@ struct ceph_fs_client { struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; struct dentry *debugfs_metrics_dir; + struct dentry *debugfs_subvolume_metrics; #endif #ifdef CONFIG_CEPH_FSCACHE