We have a series from Alex which extends CephFS client metrics with
support for per-subvolume data I/O performance and latency tracking
(metadata operations aren't included) and a good variety of fixes and
cleanups across RBD and CephFS.
 -----BEGIN PGP SIGNATURE-----
 
 iQFHBAABCgAxFiEEydHwtzie9C7TfviiSn/eOAIR84sFAmnrq1YTHGlkcnlvbW92
 QGdtYWlsLmNvbQAKCRBKf944AhHzi+WFCACA2Yc6oj6W4yXX2LSGCFCN3FOanSb3
 6ZvPeSrmAALzwD9ZXdef6j50An6w05P7kXKmAyKTgmW2tpiRciJs6uT6y7By/aph
 uGZCaPoJWPDvTlo8d05MAVuyfoKH5eU8pwx2YiEMN5W6kfo7VJQze6BgLbvQt7yH
 ToIzzBLifYONH4vF3nfsHj/uCr38Cbpr6GWY8LIPo8QtInWKJTcwF7HWVVicCaMs
 yqf1t+/CWzlIsnnIQtp+aSxWlpoA5lAqWxGt3jSfd3eVTCAL8eDzw5fkbGMRJYgM
 paH3kZ+LuJWkRXe2ts/RMrXWLJF3ZWOVD6sWU6sfnXf+vBe4SkiwwcUt
 =Ooc5
 -----END PGP SIGNATURE-----

Merge tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "We have a series from Alex which extends CephFS client metrics with
  support for per-subvolume data I/O performance and latency tracking
  (metadata operations aren't included) and a good variety of fixes and
  cleanups across RBD and CephFS"

* tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client:
  ceph: add subvolume metrics collection and reporting
  ceph: parse subvolume_id from InodeStat v9 and store in inode
  ceph: handle InodeStat v8 versioned field in reply parsing
  libceph: Fix slab-out-of-bounds access in auth message processing
  rbd: fix null-ptr-deref when device_add_disk() fails
  crush: cleanup in crush_do_rule() method
  ceph: clear s_cap_reconnect when ceph_pagelist_encode_32() fails
  ceph: only d_add() negative dentries when they are unhashed
  libceph: update outdated comment in ceph_sock_write_space()
  libceph: Remove obsolete session key alignment logic
  ceph: fix num_ops off-by-one when crypto allocation fails
  libceph: Prevent potential null-ptr-deref in ceph_handle_auth_reply()
This commit is contained in:
Linus Torvalds 2026-04-24 13:47:19 -07:00
commit ac2dc6d574
20 changed files with 1144 additions and 50 deletions

View File

@ -7165,7 +7165,7 @@ static ssize_t do_rbd_add(const char *buf, size_t count)
rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
if (rc) if (rc)
goto err_out_cleanup_disk; goto err_out_device;
spin_lock(&rbd_dev_list_lock); spin_lock(&rbd_dev_list_lock);
list_add_tail(&rbd_dev->node, &rbd_dev_list); list_add_tail(&rbd_dev->node, &rbd_dev_list);
@ -7179,8 +7179,8 @@ static ssize_t do_rbd_add(const char *buf, size_t count)
module_put(THIS_MODULE); module_put(THIS_MODULE);
return rc; return rc;
err_out_cleanup_disk: err_out_device:
rbd_free_disk(rbd_dev); device_del(&rbd_dev->dev);
err_out_image_lock: err_out_image_lock:
rbd_dev_image_unlock(rbd_dev); rbd_dev_image_unlock(rbd_dev);
rbd_dev_device_release(rbd_dev); rbd_dev_device_release(rbd_dev);

View File

@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o io.o \ export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o util.o metric.o debugfs.o util.o metric.o subvolume_metrics.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o

View File

@ -19,6 +19,7 @@
#include "mds_client.h" #include "mds_client.h"
#include "cache.h" #include "cache.h"
#include "metric.h" #include "metric.h"
#include "subvolume_metrics.h"
#include "crypto.h" #include "crypto.h"
#include <linux/ceph/osd_client.h> #include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h> #include <linux/ceph/striper.h>
@ -259,6 +260,10 @@ static void finish_netfs_read(struct ceph_osd_request *req)
osd_data->length), false); osd_data->length), false);
} }
if (err > 0) { if (err > 0) {
ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode),
false, err,
req->r_start_latency,
req->r_end_latency);
subreq->transferred = err; subreq->transferred = err;
err = 0; err = 0;
} }
@ -823,6 +828,10 @@ static int write_folio_nounlock(struct folio *folio,
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err); req->r_end_latency, len, err);
if (err >= 0 && len > 0)
ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len,
req->r_start_latency,
req->r_end_latency);
fscrypt_free_bounce_page(bounce_page); fscrypt_free_bounce_page(bounce_page);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (err == 0) if (err == 0)
@ -963,6 +972,11 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, rc); req->r_end_latency, len, rc);
if (rc >= 0 && len > 0)
ceph_subvolume_metrics_record_io(mdsc, ci, true, len,
req->r_start_latency,
req->r_end_latency);
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
osd_data = osd_req_op_extent_osd_data(req, 0); osd_data = osd_req_op_extent_osd_data(req, 0);
@ -1365,6 +1379,10 @@ void ceph_process_folio_batch(struct address_space *mapping,
rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
folio); folio);
if (rc) { if (rc) {
/* Did we just begin a new contiguous op? Nevermind! */
if (ceph_wbc->len == 0)
ceph_wbc->num_ops--;
folio_redirty_for_writepage(wbc, folio); folio_redirty_for_writepage(wbc, folio);
folio_unlock(folio); folio_unlock(folio);
break; break;

View File

@ -9,11 +9,13 @@
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/math64.h> #include <linux/math64.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/atomic.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h> #include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h> #include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h> #include <linux/ceph/debugfs.h>
#include <linux/ceph/decode.h>
#include "super.h" #include "super.h"
@ -21,6 +23,36 @@
#include "mds_client.h" #include "mds_client.h"
#include "metric.h" #include "metric.h"
#include "subvolume_metrics.h"
/**
* struct ceph_session_feature_desc - Maps feature bits to names for debugfs
* @bit: Feature bit number from enum ceph_feature_type (see mds_client.h)
* @name: Human-readable feature name for debugfs output
*
* Used by metric_features_show() to display negotiated session features.
*/
struct ceph_session_feature_desc {
unsigned int bit;
const char *name;
};
static const struct ceph_session_feature_desc ceph_session_feature_table[] = {
{ CEPHFS_FEATURE_METRIC_COLLECT, "METRIC_COLLECT" },
{ CEPHFS_FEATURE_REPLY_ENCODING, "REPLY_ENCODING" },
{ CEPHFS_FEATURE_RECLAIM_CLIENT, "RECLAIM_CLIENT" },
{ CEPHFS_FEATURE_LAZY_CAP_WANTED, "LAZY_CAP_WANTED" },
{ CEPHFS_FEATURE_MULTI_RECONNECT, "MULTI_RECONNECT" },
{ CEPHFS_FEATURE_DELEG_INO, "DELEG_INO" },
{ CEPHFS_FEATURE_ALTERNATE_NAME, "ALTERNATE_NAME" },
{ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, "NOTIFY_SESSION_STATE" },
{ CEPHFS_FEATURE_OP_GETVXATTR, "OP_GETVXATTR" },
{ CEPHFS_FEATURE_32BITS_RETRY_FWD, "32BITS_RETRY_FWD" },
{ CEPHFS_FEATURE_NEW_SNAPREALM_INFO, "NEW_SNAPREALM_INFO" },
{ CEPHFS_FEATURE_HAS_OWNER_UIDGID, "HAS_OWNER_UIDGID" },
{ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, "MDS_AUTH_CAPS_CHECK" },
{ CEPHFS_FEATURE_SUBVOLUME_METRICS, "SUBVOLUME_METRICS" },
};
static int mdsmap_show(struct seq_file *s, void *p) static int mdsmap_show(struct seq_file *s, void *p)
{ {
@ -360,6 +392,59 @@ static int status_show(struct seq_file *s, void *p)
return 0; return 0;
} }
static int subvolume_metrics_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_subvol_metric_snapshot *snapshot = NULL;
u32 nr = 0;
u64 total_sent = 0;
u64 nonzero_sends = 0;
u32 i;
if (!mdsc) {
seq_puts(s, "mds client unavailable\n");
return 0;
}
mutex_lock(&mdsc->subvol_metrics_last_mutex);
if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) {
nr = mdsc->subvol_metrics_last_nr;
snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr,
sizeof(*snapshot), GFP_KERNEL);
if (!snapshot)
nr = 0;
}
total_sent = mdsc->subvol_metrics_sent;
nonzero_sends = mdsc->subvol_metrics_nonzero_sends;
mutex_unlock(&mdsc->subvol_metrics_last_mutex);
seq_puts(s, "Last sent subvolume metrics:\n");
if (!nr) {
seq_puts(s, " (none)\n");
} else {
seq_puts(s, " subvol_id rd_ops wr_ops rd_bytes wr_bytes rd_lat_us wr_lat_us\n");
for (i = 0; i < nr; i++) {
const struct ceph_subvol_metric_snapshot *e = &snapshot[i];
seq_printf(s, " %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n",
e->subvolume_id,
e->read_ops, e->write_ops,
e->read_bytes, e->write_bytes,
e->read_latency_us, e->write_latency_us);
}
}
kfree(snapshot);
seq_puts(s, "\nStatistics:\n");
seq_printf(s, " entries_sent: %llu\n", total_sent);
seq_printf(s, " non_zero_sends: %llu\n", nonzero_sends);
seq_puts(s, "\nPending (unsent) subvolume metrics:\n");
ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsmap);
DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(mdsc);
DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(caps);
@ -369,7 +454,72 @@ DEFINE_SHOW_ATTRIBUTE(metrics_file);
DEFINE_SHOW_ATTRIBUTE(metrics_latency); DEFINE_SHOW_ATTRIBUTE(metrics_latency);
DEFINE_SHOW_ATTRIBUTE(metrics_size); DEFINE_SHOW_ATTRIBUTE(metrics_size);
DEFINE_SHOW_ATTRIBUTE(metrics_caps); DEFINE_SHOW_ATTRIBUTE(metrics_caps);
DEFINE_SHOW_ATTRIBUTE(subvolume_metrics);
static int metric_features_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned long session_features = 0;
bool have_session = false;
bool metric_collect = false;
bool subvol_support = false;
bool metrics_enabled = false;
bool subvol_enabled = false;
int i;
if (!mdsc) {
seq_puts(s, "mds client unavailable\n");
return 0;
}
mutex_lock(&mdsc->mutex);
if (mdsc->metric.session) {
have_session = true;
session_features = mdsc->metric.session->s_features;
}
mutex_unlock(&mdsc->mutex);
if (have_session) {
metric_collect =
test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session_features);
subvol_support =
test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
&session_features);
}
metrics_enabled = !disable_send_metrics && have_session && metric_collect;
subvol_enabled = metrics_enabled && subvol_support;
seq_printf(s,
"metrics_enabled: %s (disable_send_metrics=%d, session=%s, metric_collect=%s)\n",
metrics_enabled ? "yes" : "no",
disable_send_metrics ? 1 : 0,
have_session ? "yes" : "no",
metric_collect ? "yes" : "no");
seq_printf(s, "subvolume_metrics_enabled: %s\n",
subvol_enabled ? "yes" : "no");
seq_printf(s, "session_feature_bits: 0x%lx\n", session_features);
if (!have_session) {
seq_puts(s, "(no active MDS session for metrics)\n");
return 0;
}
for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) {
const struct ceph_session_feature_desc *desc =
&ceph_session_feature_table[i];
bool set = test_bit(desc->bit, &session_features);
seq_printf(s, " %-24s : %s\n", desc->name,
set ? "yes" : "no");
}
return 0;
}
DEFINE_SHOW_ATTRIBUTE(metric_features);
/* /*
* debugfs * debugfs
@ -404,6 +554,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc); debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove(fsc->debugfs_subvolume_metrics);
debugfs_remove_recursive(fsc->debugfs_metrics_dir); debugfs_remove_recursive(fsc->debugfs_metrics_dir);
doutc(fsc->client, "done\n"); doutc(fsc->client, "done\n");
} }
@ -468,6 +619,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&metrics_size_fops); &metrics_size_fops);
debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
&metrics_caps_fops); &metrics_caps_fops);
debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir,
fsc, &metric_features_fops);
fsc->debugfs_subvolume_metrics =
debugfs_create_file("subvolumes", 0400,
fsc->debugfs_metrics_dir, fsc,
&subvolume_metrics_fops);
doutc(fsc->client, "done\n"); doutc(fsc->client, "done\n");
} }

View File

@ -769,7 +769,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
d_drop(dentry); d_drop(dentry);
err = -ENOENT; err = -ENOENT;
} else { } else {
d_add(dentry, NULL); if (d_unhashed(dentry))
d_add(dentry, NULL);
} }
} }
} }
@ -840,7 +841,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
doutc(cl, " dir %llx.%llx complete, -ENOENT\n", doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
ceph_vinop(dir)); ceph_vinop(dir));
d_add(dentry, NULL); if (d_unhashed(dentry))
d_add(dentry, NULL);
di->lease_shared_gen = atomic_read(&ci->i_shared_gen); di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
return NULL; return NULL;
} }

View File

@ -19,6 +19,25 @@
#include "cache.h" #include "cache.h"
#include "io.h" #include "io.h"
#include "metric.h" #include "metric.h"
#include "subvolume_metrics.h"
/*
* Record I/O for subvolume metrics tracking.
*
* Callers must ensure bytes > 0 for reads (ret > 0 check) to avoid counting
* EOF as an I/O operation. For writes, the condition is (ret >= 0 && len > 0).
*/
static inline void ceph_record_subvolume_io(struct inode *inode, bool is_write,
ktime_t start, ktime_t end,
size_t bytes)
{
if (!bytes)
return;
ceph_subvolume_metrics_record_io(ceph_sb_to_mdsc(inode->i_sb),
ceph_inode(inode),
is_write, bytes, start, end);
}
static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags) static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{ {
@ -1140,6 +1159,15 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
req->r_start_latency, req->r_start_latency,
req->r_end_latency, req->r_end_latency,
read_len, ret); read_len, ret);
/*
* Only record subvolume metrics for actual bytes read.
* ret == 0 means EOF (no data), not an I/O operation.
*/
if (ret > 0)
ceph_record_subvolume_io(inode, false,
req->r_start_latency,
req->r_end_latency,
ret);
if (ret > 0) if (ret > 0)
objver = req->r_version; objver = req->r_version;
@ -1385,12 +1413,23 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
/* r_start_latency == 0 means the request was not submitted */ /* r_start_latency == 0 means the request was not submitted */
if (req->r_start_latency) { if (req->r_start_latency) {
if (aio_req->write) if (aio_req->write) {
ceph_update_write_metrics(metric, req->r_start_latency, ceph_update_write_metrics(metric, req->r_start_latency,
req->r_end_latency, len, rc); req->r_end_latency, len, rc);
else if (rc >= 0 && len)
ceph_record_subvolume_io(inode, true,
req->r_start_latency,
req->r_end_latency,
len);
} else {
ceph_update_read_metrics(metric, req->r_start_latency, ceph_update_read_metrics(metric, req->r_start_latency,
req->r_end_latency, len, rc); req->r_end_latency, len, rc);
if (rc > 0)
ceph_record_subvolume_io(inode, false,
req->r_start_latency,
req->r_end_latency,
rc);
}
} }
put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
@ -1614,12 +1653,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ceph_osdc_start_request(req->r_osdc, req); ceph_osdc_start_request(req->r_osdc, req);
ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (write) if (write) {
ceph_update_write_metrics(metric, req->r_start_latency, ceph_update_write_metrics(metric, req->r_start_latency,
req->r_end_latency, len, ret); req->r_end_latency, len, ret);
else if (ret >= 0 && len)
ceph_record_subvolume_io(inode, true,
req->r_start_latency,
req->r_end_latency,
len);
} else {
ceph_update_read_metrics(metric, req->r_start_latency, ceph_update_read_metrics(metric, req->r_start_latency,
req->r_end_latency, len, ret); req->r_end_latency, len, ret);
if (ret > 0)
ceph_record_subvolume_io(inode, false,
req->r_start_latency,
req->r_end_latency,
ret);
}
size = i_size_read(inode); size = i_size_read(inode);
if (!write) { if (!write) {
@ -1872,6 +1922,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
req->r_start_latency, req->r_start_latency,
req->r_end_latency, req->r_end_latency,
read_len, ret); read_len, ret);
if (ret > 0)
ceph_record_subvolume_io(inode, false,
req->r_start_latency,
req->r_end_latency,
ret);
/* Ok if object is not already present */ /* Ok if object is not already present */
if (ret == -ENOENT) { if (ret == -ENOENT) {
@ -2036,6 +2091,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, ret); req->r_end_latency, len, ret);
if (ret >= 0 && write_len)
ceph_record_subvolume_io(inode, true,
req->r_start_latency,
req->r_end_latency,
write_len);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (ret != 0) { if (ret != 0) {
doutc(cl, "osd write returned %d\n", ret); doutc(cl, "osd write returned %d\n", ret);

View File

@ -638,6 +638,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_bytes = 0; ci->i_max_bytes = 0;
ci->i_max_files = 0; ci->i_max_files = 0;
ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@ -742,6 +743,8 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes); percpu_counter_dec(&mdsc->metric.total_inodes);
ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE;
netfs_wait_for_outstanding_io(inode); netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data); truncate_inode_pages_final(&inode->i_data);
if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
@ -873,6 +876,40 @@ int ceph_fill_file_size(struct inode *inode, int issued,
return queue_trunc; return queue_trunc;
} }
/*
* Set the subvolume ID for an inode.
*
* The subvolume_id identifies which CephFS subvolume this inode belongs to.
* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset - the MDS only sends
* non-zero IDs for inodes within subvolumes.
*
* An inode's subvolume membership is immutable - once an inode is created
* in a subvolume, it stays there. Therefore, if we already have a valid
* (non-zero) subvolume_id and receive a different one, that indicates a bug.
*/
void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id)
{
struct ceph_inode_info *ci;
u64 old;
if (!inode || subvolume_id == CEPH_SUBVOLUME_ID_NONE)
return;
ci = ceph_inode(inode);
old = READ_ONCE(ci->i_subvolume_id);
if (old == subvolume_id)
return;
if (old != CEPH_SUBVOLUME_ID_NONE) {
/* subvolume_id should not change once set */
WARN_ON_ONCE(1);
return;
}
WRITE_ONCE(ci->i_subvolume_id, subvolume_id);
}
void ceph_fill_file_time(struct inode *inode, int issued, void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime, u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime, struct timespec64 *atime) struct timespec64 *mtime, struct timespec64 *atime)
@ -1076,6 +1113,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
new_issued = ~issued & info_caps; new_issued = ~issued & info_caps;
__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
ceph_inode_set_subvolume(inode, iinfo->subvolume_id);
#ifdef CONFIG_FS_ENCRYPTION #ifdef CONFIG_FS_ENCRYPTION
if (iinfo->fscrypt_auth_len && if (iinfo->fscrypt_auth_len &&
@ -1583,6 +1621,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
goto done; goto done;
} }
if (parent_dir) { if (parent_dir) {
ceph_inode_set_subvolume(parent_dir,
rinfo->diri.subvolume_id);
err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1, rinfo->dirfrag, session, -1,
&req->r_caps_reservation); &req->r_caps_reservation);
@ -1671,6 +1711,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!req->r_target_inode); BUG_ON(!req->r_target_inode);
in = req->r_target_inode; in = req->r_target_inode;
ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id);
err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
NULL, session, NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&

View File

@ -68,6 +68,21 @@ static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops; static const struct ceph_connection_operations mds_con_ops;
static void ceph_metric_bind_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_session *old;
if (!mdsc || !session || disable_send_metrics)
return;
old = mdsc->metric.session;
mdsc->metric.session = ceph_get_mds_session(session);
if (old)
ceph_put_mds_session(old);
metric_schedule_delayed(&mdsc->metric);
}
/* /*
* mds reply parsing * mds reply parsing
@ -96,19 +111,19 @@ static int parse_reply_info_quota(void **p, void *end,
return -EIO; return -EIO;
} }
/*
* parse individual inode info
*/
static int parse_reply_info_in(void **p, void *end, static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info, struct ceph_mds_reply_info_in *info,
u64 features) u64 features,
struct ceph_mds_client *mdsc)
{ {
int err = 0; int err = 0;
u8 struct_v = 0; u8 struct_v = 0;
u8 struct_compat = 0;
u32 struct_len = 0;
info->subvolume_id = CEPH_SUBVOLUME_ID_NONE;
if (features == (u64)-1) { if (features == (u64)-1) {
u32 struct_len;
u8 struct_compat;
ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad); ceph_decode_8_safe(p, end, struct_compat, bad);
/* struct_v is expected to be >= 1. we only understand /* struct_v is expected to be >= 1. we only understand
@ -232,6 +247,30 @@ static int parse_reply_info_in(void **p, void *end,
info->fscrypt_file_len, bad); info->fscrypt_file_len, bad);
} }
} }
/*
* InodeStat encoding versions:
* v1-v7: various fields added over time
* v8: added optmetadata (versioned sub-structure containing
* optional inode metadata like charmap for case-insensitive
* filesystems). The kernel client doesn't support
* case-insensitive lookups, so we skip this field.
* v9: added subvolume_id (parsed below)
*/
if (struct_v >= 8) {
u32 v8_struct_len;
/* skip optmetadata versioned sub-structure */
ceph_decode_skip_8(p, end, bad); /* struct_v */
ceph_decode_skip_8(p, end, bad); /* struct_compat */
ceph_decode_32_safe(p, end, v8_struct_len, bad);
ceph_decode_skip_n(p, end, v8_struct_len, bad);
}
/* struct_v 9 added subvolume_id */
if (struct_v >= 9)
ceph_decode_64_safe(p, end, info->subvolume_id, bad);
*p = end; *p = end;
} else { } else {
/* legacy (unversioned) struct */ /* legacy (unversioned) struct */
@ -364,12 +403,13 @@ static int parse_reply_info_lease(void **p, void *end,
*/ */
static int parse_reply_info_trace(void **p, void *end, static int parse_reply_info_trace(void **p, void *end,
struct ceph_mds_reply_info_parsed *info, struct ceph_mds_reply_info_parsed *info,
u64 features) u64 features,
struct ceph_mds_client *mdsc)
{ {
int err; int err;
if (info->head->is_dentry) { if (info->head->is_dentry) {
err = parse_reply_info_in(p, end, &info->diri, features); err = parse_reply_info_in(p, end, &info->diri, features, mdsc);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
@ -389,7 +429,8 @@ static int parse_reply_info_trace(void **p, void *end,
} }
if (info->head->is_target) { if (info->head->is_target) {
err = parse_reply_info_in(p, end, &info->targeti, features); err = parse_reply_info_in(p, end, &info->targeti, features,
mdsc);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
} }
@ -410,7 +451,8 @@ static int parse_reply_info_trace(void **p, void *end,
*/ */
static int parse_reply_info_readdir(void **p, void *end, static int parse_reply_info_readdir(void **p, void *end,
struct ceph_mds_request *req, struct ceph_mds_request *req,
u64 features) u64 features,
struct ceph_mds_client *mdsc)
{ {
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
struct ceph_client *cl = req->r_mdsc->fsc->client; struct ceph_client *cl = req->r_mdsc->fsc->client;
@ -525,7 +567,7 @@ static int parse_reply_info_readdir(void **p, void *end,
rde->name_len = oname.len; rde->name_len = oname.len;
/* inode */ /* inode */
err = parse_reply_info_in(p, end, &rde->inode, features); err = parse_reply_info_in(p, end, &rde->inode, features, mdsc);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
/* ceph_readdir_prepopulate() will update it */ /* ceph_readdir_prepopulate() will update it */
@ -733,7 +775,8 @@ static int parse_reply_info_extra(void **p, void *end,
if (op == CEPH_MDS_OP_GETFILELOCK) if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features); return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, req, features); return parse_reply_info_readdir(p, end, req, features,
req->r_mdsc);
else if (op == CEPH_MDS_OP_CREATE) else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s); return parse_reply_info_create(p, end, info, features, s);
else if (op == CEPH_MDS_OP_GETVXATTR) else if (op == CEPH_MDS_OP_GETVXATTR)
@ -762,7 +805,8 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad); ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) { if (len > 0) {
ceph_decode_need(&p, end, len, bad); ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_trace(&p, p+len, info, features); err = parse_reply_info_trace(&p, p + len, info, features,
s->s_mdsc);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
} }
@ -771,7 +815,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad); ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) { if (len > 0) {
ceph_decode_need(&p, end, len, bad); ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, req, features, s); err = parse_reply_info_extra(&p, p + len, req, features, s);
if (err < 0) if (err < 0)
goto out_bad; goto out_bad;
} }
@ -3969,6 +4013,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
goto out_err; goto out_err;
} }
req->r_target_inode = in; req->r_target_inode = in;
ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id);
} }
mutex_lock(&session->s_mutex); mutex_lock(&session->s_mutex);
@ -4317,6 +4362,11 @@ static void handle_session(struct ceph_mds_session *session,
} }
mdsc->s_cap_auths_num = cap_auths_num; mdsc->s_cap_auths_num = cap_auths_num;
mdsc->s_cap_auths = cap_auths; mdsc->s_cap_auths = cap_auths;
session->s_features = features;
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session->s_features))
ceph_metric_bind_session(mdsc, session);
} }
if (op == CEPH_SESSION_CLOSE) { if (op == CEPH_SESSION_CLOSE) {
ceph_get_mds_session(session); ceph_get_mds_session(session);
@ -4343,7 +4393,11 @@ static void handle_session(struct ceph_mds_session *session,
pr_info_client(cl, "mds%d reconnect success\n", pr_info_client(cl, "mds%d reconnect success\n",
session->s_mds); session->s_mds);
session->s_features = features; if (test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
&session->s_features))
ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true);
else
ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, false);
if (session->s_state == CEPH_MDS_SESSION_OPEN) { if (session->s_state == CEPH_MDS_SESSION_OPEN) {
pr_notice_client(cl, "mds%d is already opened\n", pr_notice_client(cl, "mds%d is already opened\n",
session->s_mds); session->s_mds);
@ -4956,7 +5010,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* placeholder for nr_caps */ /* placeholder for nr_caps */
err = ceph_pagelist_encode_32(recon_state.pagelist, 0); err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err) if (err)
goto fail; goto fail_clear_cap_reconnect;
if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
recon_state.msg_version = 3; recon_state.msg_version = 3;
@ -5046,6 +5100,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
ceph_pagelist_release(recon_state.pagelist); ceph_pagelist_release(recon_state.pagelist);
return; return;
fail_clear_cap_reconnect:
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
spin_unlock(&session->s_cap_lock);
fail: fail:
ceph_msg_put(reply); ceph_msg_put(reply);
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
@ -5582,6 +5640,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
err = ceph_metric_init(&mdsc->metric); err = ceph_metric_init(&mdsc->metric);
if (err) if (err)
goto err_mdsmap; goto err_mdsmap;
ceph_subvolume_metrics_init(&mdsc->subvol_metrics);
mutex_init(&mdsc->subvol_metrics_last_mutex);
mdsc->subvol_metrics_last = NULL;
mdsc->subvol_metrics_last_nr = 0;
mdsc->subvol_metrics_sent = 0;
mdsc->subvol_metrics_nonzero_sends = 0;
spin_lock_init(&mdsc->dentry_list_lock); spin_lock_init(&mdsc->dentry_list_lock);
INIT_LIST_HEAD(&mdsc->dentry_leases); INIT_LIST_HEAD(&mdsc->dentry_leases);
@ -6115,6 +6179,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
ceph_mdsc_stop(mdsc); ceph_mdsc_stop(mdsc);
ceph_metric_destroy(&mdsc->metric); ceph_metric_destroy(&mdsc->metric);
ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics);
kfree(mdsc->subvol_metrics_last);
fsc->mdsc = NULL; fsc->mdsc = NULL;
kfree(mdsc); kfree(mdsc);

View File

@ -18,6 +18,7 @@
#include "mdsmap.h" #include "mdsmap.h"
#include "metric.h" #include "metric.h"
#include "subvolume_metrics.h"
#include "super.h" #include "super.h"
/* The first 8 bits are reserved for old ceph releases */ /* The first 8 bits are reserved for old ceph releases */
@ -36,8 +37,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NEW_SNAPREALM_INFO, CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
CEPHFS_FEATURE_HAS_OWNER_UIDGID, CEPHFS_FEATURE_HAS_OWNER_UIDGID,
CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
CEPHFS_FEATURE_SUBVOLUME_METRICS,
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_SUBVOLUME_METRICS,
}; };
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@ -54,6 +56,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_32BITS_RETRY_FWD, \
CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \
CEPHFS_FEATURE_SUBVOLUME_METRICS, \
} }
/* /*
@ -118,6 +121,7 @@ struct ceph_mds_reply_info_in {
u32 fscrypt_file_len; u32 fscrypt_file_len;
u64 rsnaps; u64 rsnaps;
u64 change_attr; u64 change_attr;
u64 subvolume_id;
}; };
struct ceph_mds_reply_dir_entry { struct ceph_mds_reply_dir_entry {
@ -536,6 +540,14 @@ struct ceph_mds_client {
struct list_head dentry_dir_leases; /* lru list */ struct list_head dentry_dir_leases; /* lru list */
struct ceph_client_metric metric; struct ceph_client_metric metric;
struct ceph_subvolume_metrics_tracker subvol_metrics;
/* Subvolume metrics send tracking */
struct mutex subvol_metrics_last_mutex;
struct ceph_subvol_metric_snapshot *subvol_metrics_last;
u32 subvol_metrics_last_nr;
u64 subvol_metrics_sent;
u64 subvol_metrics_nonzero_sends;
spinlock_t snapid_map_lock; spinlock_t snapid_map_lock;
struct rb_root snapid_map_tree; struct rb_root snapid_map_tree;

View File

@ -4,10 +4,84 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/percpu_counter.h> #include <linux/percpu_counter.h>
#include <linux/math64.h> #include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/ceph/decode.h>
#include "metric.h" #include "metric.h"
#include "mds_client.h" #include "mds_client.h"
static bool metrics_disable_warned;
static inline u32 ceph_subvolume_entry_payload_len(void)
{
return sizeof(struct ceph_subvolume_metric_entry_wire);
}
/*
 * Full on-wire size of one entry: the generic ceph ENCODE_START
 * header (version, compat, payload length) followed by the
 * fixed-size entry payload.
 */
static inline u32 ceph_subvolume_entry_encoded_len(void)
{
	return CEPH_ENCODING_START_BLK_LEN +
	       ceph_subvolume_entry_payload_len();
}
/*
 * Payload of the outer encoding block: an le64 entry count followed
 * by @nr_subvols individually ENCODE_START-framed entries.
 */
static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols)
{
	/* count is encoded as le64 (size_t on wire) to match FUSE client */
	u32 len = sizeof(__le64);

	len += nr_subvols * ceph_subvolume_entry_encoded_len();
	return len;
}
/*
 * Total encoded size of the subvolume metric section for @nr_subvols
 * entries: outer ENCODE_START header plus the outer payload.
 */
static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols)
{
	return CEPH_ENCODING_START_BLK_LEN +
	       ceph_subvolume_outer_payload_len(nr_subvols);
}
/* Saturate a 64-bit internal counter to the 32-bit wire field width. */
static inline u32 ceph_subvolume_clamp_u32(u64 val)
{
	if (val > U32_MAX)
		return U32_MAX;
	return (u32)val;
}
/*
 * Convert an internal snapshot entry into the little-endian wire
 * layout expected by the MDS. The 64-bit op counters are saturated
 * to 32 bits because the wire format only carries u32 op counts.
 */
static void ceph_init_subvolume_wire_entry(
		struct ceph_subvolume_metric_entry_wire *dst,
		const struct ceph_subvol_metric_snapshot *src)
{
	dst->subvolume_id = cpu_to_le64(src->subvolume_id);
	dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops));
	dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops));
	dst->read_bytes = cpu_to_le64(src->read_bytes);
	dst->write_bytes = cpu_to_le64(src->write_bytes);
	dst->read_latency_us = cpu_to_le64(src->read_latency_us);
	dst->write_latency_us = cpu_to_le64(src->write_latency_us);
	/* timestamp is currently unused by the MDS; always send 0 */
	dst->time_stamp = 0;
}
/*
 * Encode @nr_subvols snapshot entries into the message buffer at *p.
 *
 * Layout: outer ENCODE_START(v1, compat 1) / le64 count / per-entry
 * ENCODE_START(v1, compat 1) + fixed-size wire payload.
 *
 * Returns 0 on success or -ERANGE if the buffer would overflow.
 * NOTE(review): ceph_start_encoding() itself is not bounds-checked
 * against @end here — presumably safe because the caller sizes the
 * message with ceph_subvolume_metric_data_len(); confirm at call site.
 */
static int ceph_encode_subvolume_metrics(void **p, void *end,
		struct ceph_subvol_metric_snapshot *subvols,
		u32 nr_subvols)
{
	u32 i;

	ceph_start_encoding(p, 1, 1,
			    ceph_subvolume_outer_payload_len(nr_subvols));

	/* count is encoded as le64 (size_t on wire) to match FUSE client */
	ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err);

	for (i = 0; i < nr_subvols; i++) {
		struct ceph_subvolume_metric_entry_wire wire_entry;

		/* stage in a local so the copy is one bounded memcpy */
		ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]);

		ceph_start_encoding(p, 1, 1,
				    ceph_subvolume_entry_payload_len());
		ceph_encode_copy_safe(p, end, &wire_entry,
				      sizeof(wire_entry), enc_err);
	}

	return 0;

enc_err:
	return -ERANGE;
}
static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
{ {
struct timespec64 t = ktime_to_timespec64(val); struct timespec64 t = ktime_to_timespec64(val);
@ -29,10 +103,14 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_read_io_size *rsize; struct ceph_read_io_size *rsize;
struct ceph_write_io_size *wsize; struct ceph_write_io_size *wsize;
struct ceph_client_metric *m = &mdsc->metric; struct ceph_client_metric *m = &mdsc->metric;
struct ceph_subvol_metric_snapshot *subvols = NULL;
u64 nr_caps = atomic64_read(&m->total_caps); u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header); u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_client *cl = mdsc->fsc->client; struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg; struct ceph_msg *msg;
u32 nr_subvols = 0;
size_t subvol_len = 0;
void *cursor;
s64 sum; s64 sum;
s32 items = 0; s32 items = 0;
s32 len; s32 len;
@ -45,15 +123,42 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
} }
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) {
int ret;
ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
&subvols, &nr_subvols,
true);
if (ret) {
pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n",
ret);
/*
* On error, ceph_subvolume_metrics_snapshot() guarantees
* *out = NULL and *nr = 0 at function entry, so subvols
* is already NULL here - no cleanup needed.
*/
nr_subvols = 0;
subvols = NULL;
}
}
if (nr_subvols) {
/* type (le32) + ENCODE_START payload - no metric header */
subvol_len = sizeof(__le32) +
ceph_subvolume_metric_data_len(nr_subvols);
}
len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+ sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
+ sizeof(*wsize); + sizeof(*wsize) + subvol_len;
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) { if (!msg) {
pr_err_client(cl, "to mds%d, failed to allocate message\n", pr_err_client(cl, "to mds%d, failed to allocate message\n",
s->s_mds); s->s_mds);
kfree(subvols);
return false; return false;
} }
@ -172,13 +277,56 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
items++; items++;
cursor = wsize + 1;
if (nr_subvols) {
void *payload;
void *payload_end;
int ret;
/* Emit only the type (le32), no ver/compat/data_len */
ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS);
items++;
payload = cursor;
payload_end = (char *)payload +
ceph_subvolume_metric_data_len(nr_subvols);
ret = ceph_encode_subvolume_metrics(&payload, payload_end,
subvols, nr_subvols);
if (ret) {
pr_warn_client(cl,
"failed to encode subvolume metrics\n");
kfree(subvols);
ceph_msg_put(msg);
return false;
}
WARN_ON(payload != payload_end);
cursor = payload;
}
put_unaligned_le32(items, &head->num); put_unaligned_le32(items, &head->num);
msg->front.iov_len = len; msg->front.iov_len = (char *)cursor - (char *)head;
msg->hdr.version = cpu_to_le16(1); msg->hdr.version = cpu_to_le16(1);
msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_con_send(&s->s_con, msg); ceph_con_send(&s->s_con, msg);
if (nr_subvols) {
mutex_lock(&mdsc->subvol_metrics_last_mutex);
kfree(mdsc->subvol_metrics_last);
mdsc->subvol_metrics_last = subvols;
mdsc->subvol_metrics_last_nr = nr_subvols;
mdsc->subvol_metrics_sent += nr_subvols;
mdsc->subvol_metrics_nonzero_sends++;
mutex_unlock(&mdsc->subvol_metrics_last_mutex);
subvols = NULL;
}
kfree(subvols);
return true; return true;
} }
@ -198,9 +346,20 @@ static void metric_get_session(struct ceph_mds_client *mdsc)
* Skip it if MDS doesn't support the metric collection, * Skip it if MDS doesn't support the metric collection,
* or the MDS will close the session's socket connection * or the MDS will close the session's socket connection
* directly when it get this message. * directly when it get this message.
*
* Also skip sessions that don't support SUBVOLUME_METRICS
* when subvolume metrics collection is enabled. This ensures
* we only send subvolume metrics to MDSs that understand them.
* If no session supports the feature, metrics won't be sent.
*/ */
if (check_session_state(s) && if (check_session_state(s) &&
test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
!test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
&s->s_features)) {
ceph_put_mds_session(s);
continue;
}
mdsc->metric.session = s; mdsc->metric.session = s;
break; break;
} }
@ -217,9 +376,18 @@ static void metric_delayed_work(struct work_struct *work)
struct ceph_mds_client *mdsc = struct ceph_mds_client *mdsc =
container_of(m, struct ceph_mds_client, metric); container_of(m, struct ceph_mds_client, metric);
if (mdsc->stopping || disable_send_metrics) if (mdsc->stopping)
return; return;
if (disable_send_metrics) {
if (!metrics_disable_warned) {
pr_info("ceph: metrics sending disabled via module parameter\n");
metrics_disable_warned = true;
}
return;
}
metrics_disable_warned = false;
if (!m->session || !check_session_state(m->session)) { if (!m->session || !check_session_state(m->session)) {
if (m->session) { if (m->session) {
ceph_put_mds_session(m->session); ceph_put_mds_session(m->session);
@ -227,10 +395,13 @@ static void metric_delayed_work(struct work_struct *work)
} }
metric_get_session(mdsc); metric_get_session(mdsc);
} }
if (m->session) {
if (m->session)
ceph_mdsc_send_metrics(mdsc, m->session); ceph_mdsc_send_metrics(mdsc, m->session);
metric_schedule_delayed(m); else
} pr_warn_ratelimited("ceph: metrics worker has no MDS session\n");
metric_schedule_delayed(m);
} }
int ceph_metric_init(struct ceph_client_metric *m) int ceph_metric_init(struct ceph_client_metric *m)

View File

@ -25,8 +25,9 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
CLIENT_METRIC_TYPE_SUBVOLUME_METRICS,
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_SUBVOLUME_METRICS,
}; };
/* /*
@ -50,6 +51,7 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, \
\ \
CLIENT_METRIC_TYPE_MAX, \ CLIENT_METRIC_TYPE_MAX, \
} }
@ -139,6 +141,41 @@ struct ceph_write_io_size {
__le64 total_size; __le64 total_size;
} __packed; } __packed;
/**
 * struct ceph_subvolume_metric_entry_wire - On-wire format sent to MDS
 * @subvolume_id: Subvolume identifier
 * @read_ops: Read operation count (32-bit, clamped from 64-bit internal)
 * @write_ops: Write operation count (32-bit, clamped from 64-bit internal)
 * @read_bytes: Total bytes read
 * @write_bytes: Total bytes written
 * @read_latency_us: Cumulative read latency in microseconds
 * @write_latency_us: Cumulative write latency in microseconds
 * @time_stamp: Collection timestamp (currently unused, set to 0)
 *
 * Wire format must match C++ AggregatedIOMetrics struct in MDS.
 * All fields are little-endian and the struct is __packed, so its
 * sizeof() is exactly the number of payload bytes on the wire.
 */
struct ceph_subvolume_metric_entry_wire {
	__le64 subvolume_id;
	__le32 read_ops;
	__le32 write_ops;
	__le64 read_bytes;
	__le64 write_bytes;
	__le64 read_latency_us;
	__le64 write_latency_us;
	__le64 time_stamp;
} __packed;
/* Old struct kept for internal tracking, not used on wire */
struct ceph_subvolume_metric_entry {
__le64 subvolume_id;
__le64 read_ops;
__le64 write_ops;
__le64 read_bytes;
__le64 write_bytes;
__le64 read_latency_us;
__le64 write_latency_us;
} __packed;
struct ceph_metric_head { struct ceph_metric_head {
__le32 num; /* the number of metrics that will be sent */ __le32 num; /* the number of metrics that will be sent */
} __packed; } __packed;

416
fs/ceph/subvolume_metrics.c Normal file
View File

@ -0,0 +1,416 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/math64.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include "subvolume_metrics.h"
#include "mds_client.h"
#include "super.h"
/**
 * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
 * @node: Red-black tree linkage for tracker->tree
 * @subvolume_id: Subvolume identifier (key for rb-tree lookup)
 * @read_ops: Accumulated read operation count since last snapshot
 * @write_ops: Accumulated write operation count since last snapshot
 * @read_bytes: Accumulated bytes read since last snapshot
 * @write_bytes: Accumulated bytes written since last snapshot
 * @read_latency_us: Sum of read latencies in microseconds
 * @write_latency_us: Sum of write latencies in microseconds
 *
 * Entries are allocated from ceph_subvol_metric_entry_cachep, live in
 * tracker->tree under tracker->lock, and are freed when a consuming
 * snapshot drains them or the tracker is cleared/destroyed.
 */
struct ceph_subvol_metric_rb_entry {
	struct rb_node node;
	u64 subvolume_id;
	u64 read_ops;
	u64 write_ops;
	u64 read_bytes;
	u64 write_bytes;
	u64 read_latency_us;
	u64 write_latency_us;
};
static struct kmem_cache *ceph_subvol_metric_entry_cachep;
void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
{
spin_lock_init(&tracker->lock);
tracker->tree = RB_ROOT_CACHED;
tracker->nr_entries = 0;
tracker->enabled = false;
atomic64_set(&tracker->snapshot_attempts, 0);
atomic64_set(&tracker->snapshot_empty, 0);
atomic64_set(&tracker->snapshot_failures, 0);
atomic64_set(&tracker->record_calls, 0);
atomic64_set(&tracker->record_disabled, 0);
atomic64_set(&tracker->record_no_subvol, 0);
atomic64_set(&tracker->total_read_ops, 0);
atomic64_set(&tracker->total_read_bytes, 0);
atomic64_set(&tracker->total_write_ops, 0);
atomic64_set(&tracker->total_write_bytes, 0);
}
/*
 * Find the tree entry for @subvol_id, or NULL if absent.
 * Caller must hold tracker->lock.
 */
static struct ceph_subvol_metric_rb_entry *
__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
{
	struct rb_node *n = tracker->tree.rb_root.rb_node;

	while (n) {
		struct ceph_subvol_metric_rb_entry *e =
			rb_entry(n, struct ceph_subvol_metric_rb_entry, node);

		if (subvol_id == e->subvolume_id)
			return e;
		n = (subvol_id < e->subvolume_id) ? n->rb_left : n->rb_right;
	}

	return NULL;
}
/*
 * Insert @entry keyed by subvolume_id. If an entry with the same key
 * already exists, the tree is left unchanged and the existing entry is
 * returned (caller frees its own allocation). Otherwise @entry is
 * linked in and returned. Caller must hold tracker->lock.
 */
static struct ceph_subvol_metric_rb_entry *
__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
	       struct ceph_subvol_metric_rb_entry *entry)
{
	struct rb_node **p = &tracker->tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*p) {
		struct ceph_subvol_metric_rb_entry *cur;

		parent = *p;
		cur = rb_entry(parent, struct ceph_subvol_metric_rb_entry,
			       node);
		if (entry->subvolume_id == cur->subvolume_id)
			return cur;	/* key collision: keep existing */
		if (entry->subvolume_id < cur->subvolume_id) {
			p = &parent->rb_left;
		} else {
			p = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&entry->node, parent, p);
	rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
	tracker->nr_entries++;
	return entry;
}
static void ceph_subvolume_metrics_clear_locked(
struct ceph_subvolume_metrics_tracker *tracker)
{
struct rb_node *node = rb_first_cached(&tracker->tree);
while (node) {
struct ceph_subvol_metric_rb_entry *entry =
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
struct rb_node *next = rb_next(node);
rb_erase_cached(&entry->node, &tracker->tree);
tracker->nr_entries--;
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
node = next;
}
tracker->tree = RB_ROOT_CACHED;
}
/*
 * Tear down the tracker: drop every entry and mark collection
 * disabled under the lock, so a concurrent record() either sees
 * enabled == false or serializes behind us and finds an empty tree.
 */
void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
{
	spin_lock(&tracker->lock);
	ceph_subvolume_metrics_clear_locked(tracker);
	tracker->enabled = false;
	spin_unlock(&tracker->lock);
}
/*
 * Turn collection on or off. Disabling also discards any metrics
 * accumulated so far, so a later re-enable starts from a clean slate.
 */
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
				   bool enable)
{
	spin_lock(&tracker->lock);
	tracker->enabled = enable;
	if (!enable)
		ceph_subvolume_metrics_clear_locked(tracker);
	spin_unlock(&tracker->lock);
}
/**
 * ceph_subvolume_metrics_record - fold one I/O into a subvolume's counters
 * @tracker: per-mdsc metrics tracker
 * @subvol_id: subvolume the I/O belongs to (CEPH_SUBVOLUME_ID_NONE ignored)
 * @is_write: true for a write, false for a read
 * @size: bytes transferred (zero-byte ops are ignored)
 * @latency_us: operation latency in microseconds (zero is ignored)
 *
 * Looks up (or allocates and inserts) the rb-tree entry for @subvol_id
 * and accumulates this operation into it. Allocation failures are
 * silently dropped — a lost sample is preferable to failing the I/O path.
 */
void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
				   u64 subvol_id, bool is_write,
				   size_t size, u64 latency_us)
{
	struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
	bool retry = false;

	/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
	if (!READ_ONCE(tracker->enabled) ||
	    subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
		return;

	/*
	 * Retry loop for lock-free allocation pattern:
	 * 1. First iteration: lookup under lock, if miss -> drop lock, alloc, retry
	 * 2. Second iteration: lookup again (may have been inserted), insert if still missing
	 * 3. On race (another thread inserted same key): free our alloc, retry
	 * All successful paths exit via return, so retry flag doesn't need reset.
	 */
	do {
		spin_lock(&tracker->lock);
		/* recheck under lock: disable() may have raced with us */
		if (!tracker->enabled) {
			spin_unlock(&tracker->lock);
			if (new_entry)
				kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
			return;
		}

		entry = __lookup_entry(tracker, subvol_id);
		if (!entry) {
			if (!new_entry) {
				/* can't allocate under spinlock; drop it first */
				spin_unlock(&tracker->lock);
				new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
							      GFP_NOFS);
				if (!new_entry)
					return;
				new_entry->subvolume_id = subvol_id;
				retry = true;
				continue;
			}
			entry = __insert_entry(tracker, new_entry);
			if (entry != new_entry) {
				/* raced with another insert */
				spin_unlock(&tracker->lock);
				kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
				new_entry = NULL;
				retry = true;
				continue;
			}
			new_entry = NULL;
		}

		/* entry is valid and we hold the lock: accumulate */
		if (is_write) {
			entry->write_ops++;
			entry->write_bytes += size;
			entry->write_latency_us += latency_us;
			atomic64_inc(&tracker->total_write_ops);
			atomic64_add(size, &tracker->total_write_bytes);
		} else {
			entry->read_ops++;
			entry->read_bytes += size;
			entry->read_latency_us += latency_us;
			atomic64_inc(&tracker->total_read_ops);
			atomic64_add(size, &tracker->total_read_bytes);
		}
		spin_unlock(&tracker->lock);

		if (new_entry)
			kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
		return;
	} while (retry);
}
/**
 * ceph_subvolume_metrics_snapshot - copy out current per-subvolume metrics
 * @tracker: per-mdsc metrics tracker
 * @out: on success with data, set to a kcalloc'd array the caller frees
 * @nr: number of entries in *@out
 * @consume: if true, entries are removed from the tree once copied
 *
 * Returns 0 on success (including the "nothing to report" case, where
 * *@out stays NULL and *@nr stays 0) or -ENOMEM on allocation failure.
 * *@out and *@nr are always initialized at entry, so callers need no
 * cleanup on error.
 *
 * Two-pass scheme: count active entries under the lock, drop the lock
 * to allocate (can't allocate under a spinlock), then re-lock and fill.
 * Entries added between the passes are deliberately left for the next
 * snapshot (the idx >= count guard below); entries that went idle are
 * pruned as we walk.
 */
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
				    struct ceph_subvol_metric_snapshot **out,
				    u32 *nr, bool consume)
{
	struct ceph_subvol_metric_snapshot *snap = NULL;
	struct rb_node *node;
	u32 count = 0, idx = 0;
	int ret = 0;

	*out = NULL;
	*nr = 0;

	if (!READ_ONCE(tracker->enabled))
		return 0;

	atomic64_inc(&tracker->snapshot_attempts);

	/* pass 1: size the allocation */
	spin_lock(&tracker->lock);
	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);

		/* Include entries with ANY I/O activity (read OR write) */
		if (entry->read_ops || entry->write_ops)
			count++;
	}
	spin_unlock(&tracker->lock);

	if (!count) {
		atomic64_inc(&tracker->snapshot_empty);
		return 0;
	}

	snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
	if (!snap) {
		atomic64_inc(&tracker->snapshot_failures);
		return -ENOMEM;
	}

	/* pass 2: copy (and optionally consume) under the lock */
	spin_lock(&tracker->lock);
	node = rb_first_cached(&tracker->tree);
	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		struct rb_node *next = rb_next(node);

		/* Skip entries with NO I/O activity at all */
		if (!entry->read_ops && !entry->write_ops) {
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
			node = next;
			continue;
		}

		/* tree grew while the lock was dropped; defer extras */
		if (idx >= count) {
			pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
				idx, count);
			break;
		}

		snap[idx].subvolume_id = entry->subvolume_id;
		snap[idx].read_ops = entry->read_ops;
		snap[idx].write_ops = entry->write_ops;
		snap[idx].read_bytes = entry->read_bytes;
		snap[idx].write_bytes = entry->write_bytes;
		snap[idx].read_latency_us = entry->read_latency_us;
		snap[idx].write_latency_us = entry->write_latency_us;
		idx++;

		if (consume) {
			entry->read_ops = 0;
			entry->write_ops = 0;
			entry->read_bytes = 0;
			entry->write_bytes = 0;
			entry->read_latency_us = 0;
			entry->write_latency_us = 0;
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
		}
		node = next;
	}
	spin_unlock(&tracker->lock);

	if (!idx) {
		/* everything went idle between the two passes */
		kfree(snap);
		snap = NULL;
		ret = 0;
	} else {
		*nr = idx;
		*out = snap;
	}

	return ret;
}
/*
 * Release a snapshot array returned by ceph_subvolume_metrics_snapshot().
 * Accepts NULL (kfree(NULL) is a no-op).
 */
void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
{
	kfree(snapshot);
}
/*
 * Dump subvolume metrics to a seq_file for debugfs.
 *
 * Iterates the rb-tree directly under spinlock to avoid allocation.
 * The lock hold time is minimal since we're only doing seq_printf calls.
 * Idle entries (no reads and no writes) are skipped but not pruned;
 * pruning is left to the snapshot path.
 */
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
				 struct seq_file *s)
{
	struct rb_node *node;
	bool found = false;	/* set once the header has been printed */

	spin_lock(&tracker->lock);
	if (!tracker->enabled) {
		spin_unlock(&tracker->lock);
		seq_puts(s, "subvolume metrics disabled\n");
		return;
	}

	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		u64 avg_rd_lat, avg_wr_lat;

		if (!entry->read_ops && !entry->write_ops)
			continue;

		/* print the column header lazily, only if there is data */
		if (!found) {
			seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n");
			seq_puts(s, "------------------------------------------------------------------------------------------------\n");
			found = true;
		}

		/* guard div-by-zero: an entry may be read-only or write-only */
		avg_rd_lat = entry->read_ops ?
			div64_u64(entry->read_latency_us, entry->read_ops) : 0;
		avg_wr_lat = entry->write_ops ?
			div64_u64(entry->write_latency_us, entry->write_ops) : 0;

		seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
			   entry->subvolume_id,
			   entry->read_ops,
			   entry->read_bytes,
			   avg_rd_lat,
			   entry->write_ops,
			   entry->write_bytes,
			   avg_wr_lat);
	}
	spin_unlock(&tracker->lock);

	if (!found)
		seq_puts(s, "(no subvolume metrics collected)\n");
}
/*
 * Convenience wrapper for the I/O paths: derive the subvolume id from
 * the inode and the latency from a start/end timestamp pair, then
 * forward to ceph_subvolume_metrics_record(). Cheap no-op when
 * collection is disabled or the inode has no subvolume id.
 */
void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci,
				      bool is_write, size_t bytes,
				      ktime_t start, ktime_t end)
{
	struct ceph_subvolume_metrics_tracker *tracker;
	u64 subvol_id;
	s64 lat_us;

	if (!mdsc || !ci || !bytes)
		return;

	tracker = &mdsc->subvol_metrics;
	atomic64_inc(&tracker->record_calls);

	if (!ceph_subvolume_metrics_enabled(tracker)) {
		atomic64_inc(&tracker->record_disabled);
		return;
	}

	subvol_id = READ_ONCE(ci->i_subvolume_id);
	if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
		atomic64_inc(&tracker->record_no_subvol);
		return;
	}

	/* clamp non-positive deltas (clock adjustment) so the op still counts */
	lat_us = ktime_to_us(ktime_sub(end, start));
	if (lat_us <= 0)
		lat_us = 1;

	ceph_subvolume_metrics_record(tracker, subvol_id, is_write,
				      bytes, (u64)lat_us);
}
/*
 * Create the slab cache for rb-tree entries. Called once from module
 * init; returns 0 on success or -ENOMEM.
 */
int __init ceph_subvolume_metrics_cache_init(void)
{
	struct kmem_cache *cache;

	cache = KMEM_CACHE(ceph_subvol_metric_rb_entry, SLAB_RECLAIM_ACCOUNT);
	if (!cache)
		return -ENOMEM;

	ceph_subvol_metric_entry_cachep = cache;
	return 0;
}
/*
 * Destroy the entry slab cache at module exit.
 * kmem_cache_destroy(NULL) is a no-op, so this is safe even if
 * ceph_subvolume_metrics_cache_init() never ran or failed.
 */
void ceph_subvolume_metrics_cache_destroy(void)
{
	kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
}

View File

@ -0,0 +1,97 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_SUBVOLUME_METRICS_H
#define _FS_CEPH_SUBVOLUME_METRICS_H
#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/atomic.h>
struct seq_file;
struct ceph_mds_client;
struct ceph_inode_info;
/**
* struct ceph_subvol_metric_snapshot - Point-in-time snapshot of subvolume metrics
* @subvolume_id: Subvolume identifier (inode number of subvolume root)
* @read_ops: Number of read operations since last snapshot
* @write_ops: Number of write operations since last snapshot
* @read_bytes: Total bytes read since last snapshot
* @write_bytes: Total bytes written since last snapshot
* @read_latency_us: Sum of read latencies in microseconds (for avg calculation)
* @write_latency_us: Sum of write latencies in microseconds (for avg calculation)
*/
struct ceph_subvol_metric_snapshot {
u64 subvolume_id;
u64 read_ops;
u64 write_ops;
u64 read_bytes;
u64 write_bytes;
u64 read_latency_us;
u64 write_latency_us;
};
/**
* struct ceph_subvolume_metrics_tracker - Tracks per-subvolume I/O metrics
* @lock: Protects @tree and @nr_entries during concurrent access
* @tree: Red-black tree of per-subvolume entries, keyed by subvolume_id
* @nr_entries: Number of entries currently in @tree
* @enabled: Whether collection is enabled (requires MDS feature support)
* @snapshot_attempts: Debug counter: total ceph_subvolume_metrics_snapshot() calls
* @snapshot_empty: Debug counter: snapshots that found no data to report
* @snapshot_failures: Debug counter: snapshots that failed to allocate memory
* @record_calls: Debug counter: total ceph_subvolume_metrics_record() calls
* @record_disabled: Debug counter: record calls skipped because disabled
* @record_no_subvol: Debug counter: record calls skipped (no subvolume_id)
* @total_read_ops: Cumulative read ops across all snapshots (never reset)
* @total_read_bytes: Cumulative bytes read across all snapshots (never reset)
* @total_write_ops: Cumulative write ops across all snapshots (never reset)
* @total_write_bytes: Cumulative bytes written across all snapshots (never reset)
*/
struct ceph_subvolume_metrics_tracker {
	spinlock_t lock;
	struct rb_root_cached tree;
	u32 nr_entries;
	/* read locklessly via READ_ONCE(); written only under @lock */
	bool enabled;
	atomic64_t snapshot_attempts;
	atomic64_t snapshot_empty;
	atomic64_t snapshot_failures;
	atomic64_t record_calls;
	atomic64_t record_disabled;
	atomic64_t record_no_subvol;
	atomic64_t total_read_ops;
	atomic64_t total_read_bytes;
	atomic64_t total_write_ops;
	atomic64_t total_write_bytes;
};
void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker);
void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker);
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
bool enable);
void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
u64 subvol_id, bool is_write,
size_t size, u64 latency_us);
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
struct ceph_subvol_metric_snapshot **out,
u32 *nr, bool consume);
void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot);
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
struct seq_file *s);
void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci,
bool is_write, size_t bytes,
ktime_t start, ktime_t end);
/*
 * Lockless check of the collection flag; writers flip it under
 * tracker->lock and lock-holding paths recheck it, so a stale
 * "true" here is harmless.
 */
static inline bool ceph_subvolume_metrics_enabled(
		const struct ceph_subvolume_metrics_tracker *tracker)
{
	return READ_ONCE(tracker->enabled);
}
int __init ceph_subvolume_metrics_cache_init(void);
void ceph_subvolume_metrics_cache_destroy(void);
#endif /* _FS_CEPH_SUBVOLUME_METRICS_H */

View File

@ -21,6 +21,7 @@
#include "mds_client.h" #include "mds_client.h"
#include "cache.h" #include "cache.h"
#include "crypto.h" #include "crypto.h"
#include "subvolume_metrics.h"
#include <linux/ceph/ceph_features.h> #include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h> #include <linux/ceph/decode.h>
@ -966,8 +967,14 @@ static int __init init_caches(void)
if (!ceph_wb_pagevec_pool) if (!ceph_wb_pagevec_pool)
goto bad_pagevec_pool; goto bad_pagevec_pool;
error = ceph_subvolume_metrics_cache_init();
if (error)
goto bad_subvol_metrics;
return 0; return 0;
bad_subvol_metrics:
mempool_destroy(ceph_wb_pagevec_pool);
bad_pagevec_pool: bad_pagevec_pool:
kmem_cache_destroy(ceph_mds_request_cachep); kmem_cache_destroy(ceph_mds_request_cachep);
bad_mds_req: bad_mds_req:
@ -1004,6 +1011,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep); kmem_cache_destroy(ceph_mds_request_cachep);
mempool_destroy(ceph_wb_pagevec_pool); mempool_destroy(ceph_wb_pagevec_pool);
ceph_subvolume_metrics_cache_destroy();
} }
static void __ceph_umount_begin(struct ceph_fs_client *fsc) static void __ceph_umount_begin(struct ceph_fs_client *fsc)

View File

@ -179,6 +179,7 @@ struct ceph_fs_client {
struct dentry *debugfs_status; struct dentry *debugfs_status;
struct dentry *debugfs_mds_sessions; struct dentry *debugfs_mds_sessions;
struct dentry *debugfs_metrics_dir; struct dentry *debugfs_metrics_dir;
struct dentry *debugfs_subvolume_metrics;
#endif #endif
#ifdef CONFIG_CEPH_FSCACHE #ifdef CONFIG_CEPH_FSCACHE
@ -398,6 +399,15 @@ struct ceph_inode_info {
/* quotas */ /* quotas */
u64 i_max_bytes, i_max_files; u64 i_max_bytes, i_max_files;
/*
* Subvolume ID this inode belongs to. CEPH_SUBVOLUME_ID_NONE (0)
* means unknown/unset, matching the FUSE client convention.
* Once set to a valid (non-zero) value, it should not change
* during the inode's lifetime.
*/
#define CEPH_SUBVOLUME_ID_NONE 0
u64 i_subvolume_id;
s32 i_dir_pin; s32 i_dir_pin;
struct rb_root i_fragtree; struct rb_root i_fragtree;
@ -1069,6 +1079,7 @@ extern struct inode *ceph_get_inode(struct super_block *sb,
extern struct inode *ceph_get_snapdir(struct inode *parent); extern struct inode *ceph_get_snapdir(struct inode *parent);
extern int ceph_fill_file_size(struct inode *inode, int issued, extern int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size); u32 truncate_seq, u64 truncate_size, u64 size);
extern void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id);
extern void ceph_fill_file_time(struct inode *inode, int issued, extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime, u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime, struct timespec64 *mtime,

View File

@ -245,7 +245,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->protocol = 0; ac->protocol = 0;
ac->ops = NULL; ac->ops = NULL;
} }
if (ac->protocol != protocol) { if (!ac->protocol) {
ret = init_protocol(ac, protocol); ret = init_protocol(ac, protocol);
if (ret) { if (ret) {
pr_err("auth protocol '%s' init failed: %d\n", pr_err("auth protocol '%s' init failed: %d\n",
@ -257,7 +257,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->negotiating = false; ac->negotiating = false;
} }
if (result) { if (result < 0) {
pr_err("auth protocol '%s' mauth authentication failed: %d\n", pr_err("auth protocol '%s' mauth authentication failed: %d\n",
ceph_auth_proto_name(ac->protocol), result); ceph_auth_proto_name(ac->protocol), result);
ret = result; ret = result;

View File

@ -911,7 +911,7 @@ int crush_do_rule(const struct crush_map *map,
int osize; int osize;
const struct crush_rule *rule; const struct crush_rule *rule;
__u32 step; __u32 step;
int i, j; int i;
int numrep; int numrep;
int out_size; int out_size;
/* /*
@ -1012,7 +1012,6 @@ int crush_do_rule(const struct crush_map *map,
if (numrep <= 0) if (numrep <= 0)
continue; continue;
} }
j = 0;
/* make sure bucket id is valid */ /* make sure bucket id is valid */
bno = -1 - w[i]; bno = -1 - w[i];
if (bno < 0 || bno >= map->max_buckets) { if (bno < 0 || bno >= map->max_buckets) {
@ -1036,7 +1035,7 @@ int crush_do_rule(const struct crush_map *map,
weight, weight_max, weight, weight_max,
x, numrep, x, numrep,
curstep->arg2, curstep->arg2,
o+osize, j, o+osize, 0,
result_max-osize, result_max-osize,
choose_tries, choose_tries,
recurse_tries, recurse_tries,
@ -1058,7 +1057,7 @@ int crush_do_rule(const struct crush_map *map,
weight, weight_max, weight, weight_max,
x, out_size, numrep, x, out_size, numrep,
curstep->arg2, curstep->arg2,
o+osize, j, o+osize, 0,
choose_tries, choose_tries,
choose_leaf_tries ? choose_leaf_tries ?
choose_leaf_tries : 1, choose_leaf_tries : 1,

View File

@ -368,8 +368,8 @@ static void ceph_sock_write_space(struct sock *sk)
/* only queue to workqueue if there is data we want to write, /* only queue to workqueue if there is data we want to write,
* and there is sufficient space in the socket buffer to accept * and there is sufficient space in the socket buffer to accept
* more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
* doesn't get called again until try_write() fills the socket * doesn't get called again until ceph_con_v[12]_try_write() fills
* buffer. See net/ipv4/tcp_input.c:tcp_check_space() * the socket buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space(). * and net/core/stream.c:sk_stream_write_space().
*/ */
if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {

View File

@ -8,7 +8,6 @@
#include <linux/ceph/ceph_debug.h> #include <linux/ceph/ceph_debug.h>
#include <crypto/aead.h> #include <crypto/aead.h>
#include <crypto/hash.h>
#include <crypto/sha2.h> #include <crypto/sha2.h>
#include <crypto/utils.h> #include <crypto/utils.h>
#include <linux/bvec.h> #include <linux/bvec.h>
@ -2352,16 +2351,14 @@ static int process_auth_reply_more(struct ceph_connection *con,
} }
/* /*
* Align session_key and con_secret to avoid GFP_ATOMIC allocation * Align con_secret to avoid GFP_ATOMIC allocation inside
* inside crypto_shash_setkey() and crypto_aead_setkey() called from * crypto_aead_setkey() called from setup_crypto(). __aligned(16)
* setup_crypto(). __aligned(16) isn't guaranteed to work for stack * isn't guaranteed to work for stack objects, so do it by hand.
* objects, so do it by hand.
*/ */
static int process_auth_done(struct ceph_connection *con, void *p, void *end) static int process_auth_done(struct ceph_connection *con, void *p, void *end)
{ {
u8 session_key_buf[CEPH_MAX_KEY_LEN + 16]; u8 session_key[CEPH_MAX_KEY_LEN];
u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16];
u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16);
u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16);
int session_key_len, con_secret_len; int session_key_len, con_secret_len;
int payload_len; int payload_len;
@ -2415,7 +2412,7 @@ static int process_auth_done(struct ceph_connection *con, void *p, void *end)
con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
out: out:
memzero_explicit(session_key_buf, sizeof(session_key_buf)); memzero_explicit(session_key, sizeof(session_key));
memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); memzero_explicit(con_secret_buf, sizeof(con_secret_buf));
return ret; return ret;

View File

@ -174,6 +174,8 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
*/ */
static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
{ {
BUG_ON(len > monc->m_auth->front_alloc_len);
monc->pending_auth = 1; monc->pending_auth = 1;
monc->m_auth->front.iov_len = len; monc->m_auth->front.iov_len = len;
monc->m_auth->hdr.front_len = cpu_to_le32(len); monc->m_auth->hdr.front_len = cpu_to_le32(len);