mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
ceph: add subvolume metrics collection and reporting
Add complete infrastructure for per-subvolume I/O metrics collection and reporting to the MDS. This enables administrators to monitor I/O patterns at the subvolume granularity, which is useful for multi-tenant CephFS deployments. This patch adds: - CEPHFS_FEATURE_SUBVOLUME_METRICS feature flag for MDS negotiation - CEPH_SUBVOLUME_ID_NONE constant (0) for unknown/unset state - Red-black tree based metrics tracker for efficient per-subvolume aggregation with kmem_cache for entry allocations - Wire format encoding matching the MDS C++ AggregatedIOMetrics struct - Integration with the existing CLIENT_METRICS message - Recording of I/O operations from file read/write and writeback paths - Debugfs interfaces for monitoring (metrics/subvolumes, metrics/metric_features) Metrics tracked per subvolume include: - Read/write operation counts - Read/write byte counts - Read/write latency sums (for average calculation) The metrics are periodically sent to the MDS as part of the existing metrics reporting infrastructure when the MDS advertises support for the SUBVOLUME_METRICS feature. CEPH_SUBVOLUME_ID_NONE enforces subvolume_id immutability. Following the FUSE client convention, 0 means unknown/unset. Once an inode has a valid (non-zero) subvolume_id, it should not change during the inode's lifetime. Signed-off-by: Alex Markuze <amarkuze@redhat.com> Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
parent
4a1c543479
commit
b1137e0b3d
|
|
@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
|
|||
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
|
||||
export.o caps.o snap.o xattr.o quota.o io.o \
|
||||
mds_client.o mdsmap.o strings.o ceph_frag.o \
|
||||
debugfs.o util.o metric.o
|
||||
debugfs.o util.o metric.o subvolume_metrics.o
|
||||
|
||||
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
|
||||
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@
|
|||
#include "mds_client.h"
|
||||
#include "cache.h"
|
||||
#include "metric.h"
|
||||
#include "subvolume_metrics.h"
|
||||
#include "crypto.h"
|
||||
#include <linux/ceph/osd_client.h>
|
||||
#include <linux/ceph/striper.h>
|
||||
|
|
@ -259,6 +260,10 @@ static void finish_netfs_read(struct ceph_osd_request *req)
|
|||
osd_data->length), false);
|
||||
}
|
||||
if (err > 0) {
|
||||
ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode),
|
||||
false, err,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency);
|
||||
subreq->transferred = err;
|
||||
err = 0;
|
||||
}
|
||||
|
|
@ -823,6 +828,10 @@ static int write_folio_nounlock(struct folio *folio,
|
|||
|
||||
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, len, err);
|
||||
if (err >= 0 && len > 0)
|
||||
ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency);
|
||||
fscrypt_free_bounce_page(bounce_page);
|
||||
ceph_osdc_put_request(req);
|
||||
if (err == 0)
|
||||
|
|
@ -963,6 +972,11 @@ static void writepages_finish(struct ceph_osd_request *req)
|
|||
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, len, rc);
|
||||
|
||||
if (rc >= 0 && len > 0)
|
||||
ceph_subvolume_metrics_record_io(mdsc, ci, true, len,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency);
|
||||
|
||||
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
|
||||
|
||||
osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||
|
|
|
|||
|
|
@ -9,11 +9,13 @@
|
|||
#include <linux/seq_file.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/atomic.h>
|
||||
|
||||
#include <linux/ceph/libceph.h>
|
||||
#include <linux/ceph/mon_client.h>
|
||||
#include <linux/ceph/auth.h>
|
||||
#include <linux/ceph/debugfs.h>
|
||||
#include <linux/ceph/decode.h>
|
||||
|
||||
#include "super.h"
|
||||
|
||||
|
|
@ -21,6 +23,36 @@
|
|||
|
||||
#include "mds_client.h"
|
||||
#include "metric.h"
|
||||
#include "subvolume_metrics.h"
|
||||
|
||||
/**
|
||||
* struct ceph_session_feature_desc - Maps feature bits to names for debugfs
|
||||
* @bit: Feature bit number from enum ceph_feature_type (see mds_client.h)
|
||||
* @name: Human-readable feature name for debugfs output
|
||||
*
|
||||
* Used by metric_features_show() to display negotiated session features.
|
||||
*/
|
||||
struct ceph_session_feature_desc {
|
||||
unsigned int bit;
|
||||
const char *name;
|
||||
};
|
||||
|
||||
static const struct ceph_session_feature_desc ceph_session_feature_table[] = {
|
||||
{ CEPHFS_FEATURE_METRIC_COLLECT, "METRIC_COLLECT" },
|
||||
{ CEPHFS_FEATURE_REPLY_ENCODING, "REPLY_ENCODING" },
|
||||
{ CEPHFS_FEATURE_RECLAIM_CLIENT, "RECLAIM_CLIENT" },
|
||||
{ CEPHFS_FEATURE_LAZY_CAP_WANTED, "LAZY_CAP_WANTED" },
|
||||
{ CEPHFS_FEATURE_MULTI_RECONNECT, "MULTI_RECONNECT" },
|
||||
{ CEPHFS_FEATURE_DELEG_INO, "DELEG_INO" },
|
||||
{ CEPHFS_FEATURE_ALTERNATE_NAME, "ALTERNATE_NAME" },
|
||||
{ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, "NOTIFY_SESSION_STATE" },
|
||||
{ CEPHFS_FEATURE_OP_GETVXATTR, "OP_GETVXATTR" },
|
||||
{ CEPHFS_FEATURE_32BITS_RETRY_FWD, "32BITS_RETRY_FWD" },
|
||||
{ CEPHFS_FEATURE_NEW_SNAPREALM_INFO, "NEW_SNAPREALM_INFO" },
|
||||
{ CEPHFS_FEATURE_HAS_OWNER_UIDGID, "HAS_OWNER_UIDGID" },
|
||||
{ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, "MDS_AUTH_CAPS_CHECK" },
|
||||
{ CEPHFS_FEATURE_SUBVOLUME_METRICS, "SUBVOLUME_METRICS" },
|
||||
};
|
||||
|
||||
static int mdsmap_show(struct seq_file *s, void *p)
|
||||
{
|
||||
|
|
@ -360,6 +392,59 @@ static int status_show(struct seq_file *s, void *p)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int subvolume_metrics_show(struct seq_file *s, void *p)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_subvol_metric_snapshot *snapshot = NULL;
|
||||
u32 nr = 0;
|
||||
u64 total_sent = 0;
|
||||
u64 nonzero_sends = 0;
|
||||
u32 i;
|
||||
|
||||
if (!mdsc) {
|
||||
seq_puts(s, "mds client unavailable\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
mutex_lock(&mdsc->subvol_metrics_last_mutex);
|
||||
if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) {
|
||||
nr = mdsc->subvol_metrics_last_nr;
|
||||
snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr,
|
||||
sizeof(*snapshot), GFP_KERNEL);
|
||||
if (!snapshot)
|
||||
nr = 0;
|
||||
}
|
||||
total_sent = mdsc->subvol_metrics_sent;
|
||||
nonzero_sends = mdsc->subvol_metrics_nonzero_sends;
|
||||
mutex_unlock(&mdsc->subvol_metrics_last_mutex);
|
||||
|
||||
seq_puts(s, "Last sent subvolume metrics:\n");
|
||||
if (!nr) {
|
||||
seq_puts(s, " (none)\n");
|
||||
} else {
|
||||
seq_puts(s, " subvol_id rd_ops wr_ops rd_bytes wr_bytes rd_lat_us wr_lat_us\n");
|
||||
for (i = 0; i < nr; i++) {
|
||||
const struct ceph_subvol_metric_snapshot *e = &snapshot[i];
|
||||
|
||||
seq_printf(s, " %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n",
|
||||
e->subvolume_id,
|
||||
e->read_ops, e->write_ops,
|
||||
e->read_bytes, e->write_bytes,
|
||||
e->read_latency_us, e->write_latency_us);
|
||||
}
|
||||
}
|
||||
kfree(snapshot);
|
||||
|
||||
seq_puts(s, "\nStatistics:\n");
|
||||
seq_printf(s, " entries_sent: %llu\n", total_sent);
|
||||
seq_printf(s, " non_zero_sends: %llu\n", nonzero_sends);
|
||||
|
||||
seq_puts(s, "\nPending (unsent) subvolume metrics:\n");
|
||||
ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s);
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_SHOW_ATTRIBUTE(mdsmap);
|
||||
DEFINE_SHOW_ATTRIBUTE(mdsc);
|
||||
DEFINE_SHOW_ATTRIBUTE(caps);
|
||||
|
|
@ -369,7 +454,72 @@ DEFINE_SHOW_ATTRIBUTE(metrics_file);
|
|||
DEFINE_SHOW_ATTRIBUTE(metrics_latency);
|
||||
DEFINE_SHOW_ATTRIBUTE(metrics_size);
|
||||
DEFINE_SHOW_ATTRIBUTE(metrics_caps);
|
||||
DEFINE_SHOW_ATTRIBUTE(subvolume_metrics);
|
||||
|
||||
static int metric_features_show(struct seq_file *s, void *p)
|
||||
{
|
||||
struct ceph_fs_client *fsc = s->private;
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
unsigned long session_features = 0;
|
||||
bool have_session = false;
|
||||
bool metric_collect = false;
|
||||
bool subvol_support = false;
|
||||
bool metrics_enabled = false;
|
||||
bool subvol_enabled = false;
|
||||
int i;
|
||||
|
||||
if (!mdsc) {
|
||||
seq_puts(s, "mds client unavailable\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
if (mdsc->metric.session) {
|
||||
have_session = true;
|
||||
session_features = mdsc->metric.session->s_features;
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
if (have_session) {
|
||||
metric_collect =
|
||||
test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
|
||||
&session_features);
|
||||
subvol_support =
|
||||
test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
|
||||
&session_features);
|
||||
}
|
||||
|
||||
metrics_enabled = !disable_send_metrics && have_session && metric_collect;
|
||||
subvol_enabled = metrics_enabled && subvol_support;
|
||||
|
||||
seq_printf(s,
|
||||
"metrics_enabled: %s (disable_send_metrics=%d, session=%s, metric_collect=%s)\n",
|
||||
metrics_enabled ? "yes" : "no",
|
||||
disable_send_metrics ? 1 : 0,
|
||||
have_session ? "yes" : "no",
|
||||
metric_collect ? "yes" : "no");
|
||||
seq_printf(s, "subvolume_metrics_enabled: %s\n",
|
||||
subvol_enabled ? "yes" : "no");
|
||||
seq_printf(s, "session_feature_bits: 0x%lx\n", session_features);
|
||||
|
||||
if (!have_session) {
|
||||
seq_puts(s, "(no active MDS session for metrics)\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) {
|
||||
const struct ceph_session_feature_desc *desc =
|
||||
&ceph_session_feature_table[i];
|
||||
bool set = test_bit(desc->bit, &session_features);
|
||||
|
||||
seq_printf(s, " %-24s : %s\n", desc->name,
|
||||
set ? "yes" : "no");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_SHOW_ATTRIBUTE(metric_features);
|
||||
|
||||
/*
|
||||
* debugfs
|
||||
|
|
@ -404,6 +554,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
|
|||
debugfs_remove(fsc->debugfs_caps);
|
||||
debugfs_remove(fsc->debugfs_status);
|
||||
debugfs_remove(fsc->debugfs_mdsc);
|
||||
debugfs_remove(fsc->debugfs_subvolume_metrics);
|
||||
debugfs_remove_recursive(fsc->debugfs_metrics_dir);
|
||||
doutc(fsc->client, "done\n");
|
||||
}
|
||||
|
|
@ -468,6 +619,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
|
|||
&metrics_size_fops);
|
||||
debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
|
||||
&metrics_caps_fops);
|
||||
debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir,
|
||||
fsc, &metric_features_fops);
|
||||
fsc->debugfs_subvolume_metrics =
|
||||
debugfs_create_file("subvolumes", 0400,
|
||||
fsc->debugfs_metrics_dir, fsc,
|
||||
&subvolume_metrics_fops);
|
||||
doutc(fsc->client, "done\n");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,25 @@
|
|||
#include "cache.h"
|
||||
#include "io.h"
|
||||
#include "metric.h"
|
||||
#include "subvolume_metrics.h"
|
||||
|
||||
/*
|
||||
* Record I/O for subvolume metrics tracking.
|
||||
*
|
||||
* Callers must ensure bytes > 0 for reads (ret > 0 check) to avoid counting
|
||||
* EOF as an I/O operation. For writes, the condition is (ret >= 0 && len > 0).
|
||||
*/
|
||||
static inline void ceph_record_subvolume_io(struct inode *inode, bool is_write,
|
||||
ktime_t start, ktime_t end,
|
||||
size_t bytes)
|
||||
{
|
||||
if (!bytes)
|
||||
return;
|
||||
|
||||
ceph_subvolume_metrics_record_io(ceph_sb_to_mdsc(inode->i_sb),
|
||||
ceph_inode(inode),
|
||||
is_write, bytes, start, end);
|
||||
}
|
||||
|
||||
static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
|
||||
{
|
||||
|
|
@ -1140,6 +1159,15 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
|
|||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
read_len, ret);
|
||||
/*
|
||||
* Only record subvolume metrics for actual bytes read.
|
||||
* ret == 0 means EOF (no data), not an I/O operation.
|
||||
*/
|
||||
if (ret > 0)
|
||||
ceph_record_subvolume_io(inode, false,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
ret);
|
||||
|
||||
if (ret > 0)
|
||||
objver = req->r_version;
|
||||
|
|
@ -1385,12 +1413,23 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
|
|||
|
||||
/* r_start_latency == 0 means the request was not submitted */
|
||||
if (req->r_start_latency) {
|
||||
if (aio_req->write)
|
||||
if (aio_req->write) {
|
||||
ceph_update_write_metrics(metric, req->r_start_latency,
|
||||
req->r_end_latency, len, rc);
|
||||
else
|
||||
if (rc >= 0 && len)
|
||||
ceph_record_subvolume_io(inode, true,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
len);
|
||||
} else {
|
||||
ceph_update_read_metrics(metric, req->r_start_latency,
|
||||
req->r_end_latency, len, rc);
|
||||
if (rc > 0)
|
||||
ceph_record_subvolume_io(inode, false,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
rc);
|
||||
}
|
||||
}
|
||||
|
||||
put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
|
||||
|
|
@ -1614,12 +1653,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||
ceph_osdc_start_request(req->r_osdc, req);
|
||||
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
|
||||
|
||||
if (write)
|
||||
if (write) {
|
||||
ceph_update_write_metrics(metric, req->r_start_latency,
|
||||
req->r_end_latency, len, ret);
|
||||
else
|
||||
if (ret >= 0 && len)
|
||||
ceph_record_subvolume_io(inode, true,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
len);
|
||||
} else {
|
||||
ceph_update_read_metrics(metric, req->r_start_latency,
|
||||
req->r_end_latency, len, ret);
|
||||
if (ret > 0)
|
||||
ceph_record_subvolume_io(inode, false,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
ret);
|
||||
}
|
||||
|
||||
size = i_size_read(inode);
|
||||
if (!write) {
|
||||
|
|
@ -1872,6 +1922,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
|
|||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
read_len, ret);
|
||||
if (ret > 0)
|
||||
ceph_record_subvolume_io(inode, false,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
ret);
|
||||
|
||||
/* Ok if object is not already present */
|
||||
if (ret == -ENOENT) {
|
||||
|
|
@ -2036,6 +2091,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
|
|||
|
||||
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, len, ret);
|
||||
if (ret >= 0 && write_len)
|
||||
ceph_record_subvolume_io(inode, true,
|
||||
req->r_start_latency,
|
||||
req->r_end_latency,
|
||||
write_len);
|
||||
ceph_osdc_put_request(req);
|
||||
if (ret != 0) {
|
||||
doutc(cl, "osd write returned %d\n", ret);
|
||||
|
|
|
|||
|
|
@ -68,6 +68,21 @@ static void ceph_cap_reclaim_work(struct work_struct *work);
|
|||
|
||||
static const struct ceph_connection_operations mds_con_ops;
|
||||
|
||||
static void ceph_metric_bind_session(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session)
|
||||
{
|
||||
struct ceph_mds_session *old;
|
||||
|
||||
if (!mdsc || !session || disable_send_metrics)
|
||||
return;
|
||||
|
||||
old = mdsc->metric.session;
|
||||
mdsc->metric.session = ceph_get_mds_session(session);
|
||||
if (old)
|
||||
ceph_put_mds_session(old);
|
||||
|
||||
metric_schedule_delayed(&mdsc->metric);
|
||||
}
|
||||
|
||||
/*
|
||||
* mds reply parsing
|
||||
|
|
@ -4347,6 +4362,11 @@ static void handle_session(struct ceph_mds_session *session,
|
|||
}
|
||||
mdsc->s_cap_auths_num = cap_auths_num;
|
||||
mdsc->s_cap_auths = cap_auths;
|
||||
|
||||
session->s_features = features;
|
||||
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
|
||||
&session->s_features))
|
||||
ceph_metric_bind_session(mdsc, session);
|
||||
}
|
||||
if (op == CEPH_SESSION_CLOSE) {
|
||||
ceph_get_mds_session(session);
|
||||
|
|
@ -4373,7 +4393,11 @@ static void handle_session(struct ceph_mds_session *session,
|
|||
pr_info_client(cl, "mds%d reconnect success\n",
|
||||
session->s_mds);
|
||||
|
||||
session->s_features = features;
|
||||
if (test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
|
||||
&session->s_features))
|
||||
ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true);
|
||||
else
|
||||
ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, false);
|
||||
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
|
||||
pr_notice_client(cl, "mds%d is already opened\n",
|
||||
session->s_mds);
|
||||
|
|
@ -5616,6 +5640,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
|||
err = ceph_metric_init(&mdsc->metric);
|
||||
if (err)
|
||||
goto err_mdsmap;
|
||||
ceph_subvolume_metrics_init(&mdsc->subvol_metrics);
|
||||
mutex_init(&mdsc->subvol_metrics_last_mutex);
|
||||
mdsc->subvol_metrics_last = NULL;
|
||||
mdsc->subvol_metrics_last_nr = 0;
|
||||
mdsc->subvol_metrics_sent = 0;
|
||||
mdsc->subvol_metrics_nonzero_sends = 0;
|
||||
|
||||
spin_lock_init(&mdsc->dentry_list_lock);
|
||||
INIT_LIST_HEAD(&mdsc->dentry_leases);
|
||||
|
|
@ -6149,6 +6179,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
|
|||
ceph_mdsc_stop(mdsc);
|
||||
|
||||
ceph_metric_destroy(&mdsc->metric);
|
||||
ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics);
|
||||
kfree(mdsc->subvol_metrics_last);
|
||||
|
||||
fsc->mdsc = NULL;
|
||||
kfree(mdsc);
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
#include "mdsmap.h"
|
||||
#include "metric.h"
|
||||
#include "subvolume_metrics.h"
|
||||
#include "super.h"
|
||||
|
||||
/* The first 8 bits are reserved for old ceph releases */
|
||||
|
|
@ -36,8 +37,9 @@ enum ceph_feature_type {
|
|||
CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
|
||||
CEPHFS_FEATURE_HAS_OWNER_UIDGID,
|
||||
CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
|
||||
CEPHFS_FEATURE_SUBVOLUME_METRICS,
|
||||
|
||||
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
|
||||
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_SUBVOLUME_METRICS,
|
||||
};
|
||||
|
||||
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
|
||||
|
|
@ -54,6 +56,7 @@ enum ceph_feature_type {
|
|||
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
|
||||
CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
|
||||
CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \
|
||||
CEPHFS_FEATURE_SUBVOLUME_METRICS, \
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -537,6 +540,14 @@ struct ceph_mds_client {
|
|||
struct list_head dentry_dir_leases; /* lru list */
|
||||
|
||||
struct ceph_client_metric metric;
|
||||
struct ceph_subvolume_metrics_tracker subvol_metrics;
|
||||
|
||||
/* Subvolume metrics send tracking */
|
||||
struct mutex subvol_metrics_last_mutex;
|
||||
struct ceph_subvol_metric_snapshot *subvol_metrics_last;
|
||||
u32 subvol_metrics_last_nr;
|
||||
u64 subvol_metrics_sent;
|
||||
u64 subvol_metrics_nonzero_sends;
|
||||
|
||||
spinlock_t snapid_map_lock;
|
||||
struct rb_root snapid_map_tree;
|
||||
|
|
|
|||
183
fs/ceph/metric.c
183
fs/ceph/metric.c
|
|
@ -4,10 +4,84 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/percpu_counter.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/ratelimit.h>
|
||||
|
||||
#include <linux/ceph/decode.h>
|
||||
|
||||
#include "metric.h"
|
||||
#include "mds_client.h"
|
||||
|
||||
static bool metrics_disable_warned;
|
||||
|
||||
static inline u32 ceph_subvolume_entry_payload_len(void)
|
||||
{
|
||||
return sizeof(struct ceph_subvolume_metric_entry_wire);
|
||||
}
|
||||
|
||||
static inline u32 ceph_subvolume_entry_encoded_len(void)
|
||||
{
|
||||
return CEPH_ENCODING_START_BLK_LEN +
|
||||
ceph_subvolume_entry_payload_len();
|
||||
}
|
||||
|
||||
/*
 * Payload bytes inside the outer ENCODE_START block: the entry count
 * followed by @nr_subvols fully-encoded entries.
 */
static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols)
{
	/* count is encoded as le64 (size_t on wire) to match FUSE client */
	return sizeof(__le64) +
	       nr_subvols * ceph_subvolume_entry_encoded_len();
}
|
||||
|
||||
/* Total encoded size of the subvolume metrics section for @nr_subvols. */
static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols)
{
	return CEPH_ENCODING_START_BLK_LEN +
	       ceph_subvolume_outer_payload_len(nr_subvols);
}
|
||||
|
||||
/* Saturate a 64-bit counter into the 32-bit wire field. */
static inline u32 ceph_subvolume_clamp_u32(u64 val)
{
	if (val > U32_MAX)
		return U32_MAX;
	return (u32)val;
}
|
||||
|
||||
static void ceph_init_subvolume_wire_entry(
|
||||
struct ceph_subvolume_metric_entry_wire *dst,
|
||||
const struct ceph_subvol_metric_snapshot *src)
|
||||
{
|
||||
dst->subvolume_id = cpu_to_le64(src->subvolume_id);
|
||||
dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops));
|
||||
dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops));
|
||||
dst->read_bytes = cpu_to_le64(src->read_bytes);
|
||||
dst->write_bytes = cpu_to_le64(src->write_bytes);
|
||||
dst->read_latency_us = cpu_to_le64(src->read_latency_us);
|
||||
dst->write_latency_us = cpu_to_le64(src->write_latency_us);
|
||||
dst->time_stamp = 0;
|
||||
}
|
||||
|
||||
static int ceph_encode_subvolume_metrics(void **p, void *end,
|
||||
struct ceph_subvol_metric_snapshot *subvols,
|
||||
u32 nr_subvols)
|
||||
{
|
||||
u32 i;
|
||||
|
||||
ceph_start_encoding(p, 1, 1,
|
||||
ceph_subvolume_outer_payload_len(nr_subvols));
|
||||
/* count is encoded as le64 (size_t on wire) to match FUSE client */
|
||||
ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err);
|
||||
|
||||
for (i = 0; i < nr_subvols; i++) {
|
||||
struct ceph_subvolume_metric_entry_wire wire_entry;
|
||||
|
||||
ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]);
|
||||
ceph_start_encoding(p, 1, 1,
|
||||
ceph_subvolume_entry_payload_len());
|
||||
ceph_encode_copy_safe(p, end, &wire_entry,
|
||||
sizeof(wire_entry), enc_err);
|
||||
}
|
||||
|
||||
return 0;
|
||||
enc_err:
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
|
||||
{
|
||||
struct timespec64 t = ktime_to_timespec64(val);
|
||||
|
|
@ -29,10 +103,14 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
|
|||
struct ceph_read_io_size *rsize;
|
||||
struct ceph_write_io_size *wsize;
|
||||
struct ceph_client_metric *m = &mdsc->metric;
|
||||
struct ceph_subvol_metric_snapshot *subvols = NULL;
|
||||
u64 nr_caps = atomic64_read(&m->total_caps);
|
||||
u32 header_len = sizeof(struct ceph_metric_header);
|
||||
struct ceph_client *cl = mdsc->fsc->client;
|
||||
struct ceph_msg *msg;
|
||||
u32 nr_subvols = 0;
|
||||
size_t subvol_len = 0;
|
||||
void *cursor;
|
||||
s64 sum;
|
||||
s32 items = 0;
|
||||
s32 len;
|
||||
|
|
@ -45,15 +123,42 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
|
|||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
|
||||
test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) {
|
||||
int ret;
|
||||
|
||||
ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
|
||||
&subvols, &nr_subvols,
|
||||
true);
|
||||
if (ret) {
|
||||
pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n",
|
||||
ret);
|
||||
/*
|
||||
* On error, ceph_subvolume_metrics_snapshot() guarantees
|
||||
* *out = NULL and *nr = 0 at function entry, so subvols
|
||||
* is already NULL here - no cleanup needed.
|
||||
*/
|
||||
nr_subvols = 0;
|
||||
subvols = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (nr_subvols) {
|
||||
/* type (le32) + ENCODE_START payload - no metric header */
|
||||
subvol_len = sizeof(__le32) +
|
||||
ceph_subvolume_metric_data_len(nr_subvols);
|
||||
}
|
||||
|
||||
len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
|
||||
+ sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
|
||||
+ sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
|
||||
+ sizeof(*wsize);
|
||||
+ sizeof(*wsize) + subvol_len;
|
||||
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
|
||||
if (!msg) {
|
||||
pr_err_client(cl, "to mds%d, failed to allocate message\n",
|
||||
s->s_mds);
|
||||
kfree(subvols);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -172,13 +277,56 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
|
|||
wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
|
||||
items++;
|
||||
|
||||
cursor = wsize + 1;
|
||||
|
||||
if (nr_subvols) {
|
||||
void *payload;
|
||||
void *payload_end;
|
||||
int ret;
|
||||
|
||||
/* Emit only the type (le32), no ver/compat/data_len */
|
||||
ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS);
|
||||
items++;
|
||||
|
||||
payload = cursor;
|
||||
payload_end = (char *)payload +
|
||||
ceph_subvolume_metric_data_len(nr_subvols);
|
||||
|
||||
ret = ceph_encode_subvolume_metrics(&payload, payload_end,
|
||||
subvols, nr_subvols);
|
||||
if (ret) {
|
||||
pr_warn_client(cl,
|
||||
"failed to encode subvolume metrics\n");
|
||||
kfree(subvols);
|
||||
ceph_msg_put(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
WARN_ON(payload != payload_end);
|
||||
cursor = payload;
|
||||
}
|
||||
|
||||
put_unaligned_le32(items, &head->num);
|
||||
msg->front.iov_len = len;
|
||||
msg->front.iov_len = (char *)cursor - (char *)head;
|
||||
msg->hdr.version = cpu_to_le16(1);
|
||||
msg->hdr.compat_version = cpu_to_le16(1);
|
||||
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
|
||||
|
||||
ceph_con_send(&s->s_con, msg);
|
||||
|
||||
if (nr_subvols) {
|
||||
mutex_lock(&mdsc->subvol_metrics_last_mutex);
|
||||
kfree(mdsc->subvol_metrics_last);
|
||||
mdsc->subvol_metrics_last = subvols;
|
||||
mdsc->subvol_metrics_last_nr = nr_subvols;
|
||||
mdsc->subvol_metrics_sent += nr_subvols;
|
||||
mdsc->subvol_metrics_nonzero_sends++;
|
||||
mutex_unlock(&mdsc->subvol_metrics_last_mutex);
|
||||
|
||||
subvols = NULL;
|
||||
}
|
||||
kfree(subvols);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -198,9 +346,20 @@ static void metric_get_session(struct ceph_mds_client *mdsc)
|
|||
* Skip it if MDS doesn't support the metric collection,
|
||||
* or the MDS will close the session's socket connection
|
||||
* directly when it get this message.
|
||||
*
|
||||
* Also skip sessions that don't support SUBVOLUME_METRICS
|
||||
* when subvolume metrics collection is enabled. This ensures
|
||||
* we only send subvolume metrics to MDSs that understand them.
|
||||
* If no session supports the feature, metrics won't be sent.
|
||||
*/
|
||||
if (check_session_state(s) &&
|
||||
test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
|
||||
if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) &&
|
||||
!test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
|
||||
&s->s_features)) {
|
||||
ceph_put_mds_session(s);
|
||||
continue;
|
||||
}
|
||||
mdsc->metric.session = s;
|
||||
break;
|
||||
}
|
||||
|
|
@ -217,9 +376,18 @@ static void metric_delayed_work(struct work_struct *work)
|
|||
struct ceph_mds_client *mdsc =
|
||||
container_of(m, struct ceph_mds_client, metric);
|
||||
|
||||
if (mdsc->stopping || disable_send_metrics)
|
||||
if (mdsc->stopping)
|
||||
return;
|
||||
|
||||
if (disable_send_metrics) {
|
||||
if (!metrics_disable_warned) {
|
||||
pr_info("ceph: metrics sending disabled via module parameter\n");
|
||||
metrics_disable_warned = true;
|
||||
}
|
||||
return;
|
||||
}
|
||||
metrics_disable_warned = false;
|
||||
|
||||
if (!m->session || !check_session_state(m->session)) {
|
||||
if (m->session) {
|
||||
ceph_put_mds_session(m->session);
|
||||
|
|
@ -227,10 +395,13 @@ static void metric_delayed_work(struct work_struct *work)
|
|||
}
|
||||
metric_get_session(mdsc);
|
||||
}
|
||||
if (m->session) {
|
||||
|
||||
if (m->session)
|
||||
ceph_mdsc_send_metrics(mdsc, m->session);
|
||||
metric_schedule_delayed(m);
|
||||
}
|
||||
else
|
||||
pr_warn_ratelimited("ceph: metrics worker has no MDS session\n");
|
||||
|
||||
metric_schedule_delayed(m);
|
||||
}
|
||||
|
||||
int ceph_metric_init(struct ceph_client_metric *m)
|
||||
|
|
|
|||
|
|
@ -25,8 +25,9 @@ enum ceph_metric_type {
|
|||
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
|
||||
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
|
||||
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
|
||||
CLIENT_METRIC_TYPE_SUBVOLUME_METRICS,
|
||||
|
||||
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
|
||||
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_SUBVOLUME_METRICS,
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -50,6 +51,7 @@ enum ceph_metric_type {
|
|||
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
|
||||
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
|
||||
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
|
||||
CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, \
|
||||
\
|
||||
CLIENT_METRIC_TYPE_MAX, \
|
||||
}
|
||||
|
|
@ -139,6 +141,41 @@ struct ceph_write_io_size {
|
|||
__le64 total_size;
|
||||
} __packed;
|
||||
|
||||
/**
|
||||
* struct ceph_subvolume_metric_entry_wire - On-wire format sent to MDS
|
||||
* @subvolume_id: Subvolume identifier
|
||||
* @read_ops: Read operation count (32-bit, clamped from 64-bit internal)
|
||||
* @write_ops: Write operation count (32-bit, clamped from 64-bit internal)
|
||||
* @read_bytes: Total bytes read
|
||||
* @write_bytes: Total bytes written
|
||||
* @read_latency_us: Cumulative read latency in microseconds
|
||||
* @write_latency_us: Cumulative write latency in microseconds
|
||||
* @time_stamp: Collection timestamp (currently unused, set to 0)
|
||||
*
|
||||
* Wire format must match C++ AggregatedIOMetrics struct in MDS.
|
||||
*/
|
||||
struct ceph_subvolume_metric_entry_wire {
|
||||
__le64 subvolume_id;
|
||||
__le32 read_ops;
|
||||
__le32 write_ops;
|
||||
__le64 read_bytes;
|
||||
__le64 write_bytes;
|
||||
__le64 read_latency_us;
|
||||
__le64 write_latency_us;
|
||||
__le64 time_stamp;
|
||||
} __packed;
|
||||
|
||||
/* Old struct kept for internal tracking, not used on wire */
|
||||
struct ceph_subvolume_metric_entry {
|
||||
__le64 subvolume_id;
|
||||
__le64 read_ops;
|
||||
__le64 write_ops;
|
||||
__le64 read_bytes;
|
||||
__le64 write_bytes;
|
||||
__le64 read_latency_us;
|
||||
__le64 write_latency_us;
|
||||
} __packed;
|
||||
|
||||
struct ceph_metric_head {
|
||||
__le32 num; /* the number of metrics that will be sent */
|
||||
} __packed;
|
||||
|
|
|
|||
416
fs/ceph/subvolume_metrics.c
Normal file
416
fs/ceph/subvolume_metrics.c
Normal file
|
|
@ -0,0 +1,416 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/ceph/ceph_debug.h>
|
||||
|
||||
#include <linux/math64.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
#include "subvolume_metrics.h"
|
||||
#include "mds_client.h"
|
||||
#include "super.h"
|
||||
|
||||
/**
|
||||
* struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
|
||||
* @node: Red-black tree linkage for tracker->tree
|
||||
* @subvolume_id: Subvolume identifier (key for rb-tree lookup)
|
||||
* @read_ops: Accumulated read operation count since last snapshot
|
||||
* @write_ops: Accumulated write operation count since last snapshot
|
||||
* @read_bytes: Accumulated bytes read since last snapshot
|
||||
* @write_bytes: Accumulated bytes written since last snapshot
|
||||
* @read_latency_us: Sum of read latencies in microseconds
|
||||
* @write_latency_us: Sum of write latencies in microseconds
|
||||
*/
|
||||
struct ceph_subvol_metric_rb_entry {
|
||||
struct rb_node node;
|
||||
u64 subvolume_id;
|
||||
u64 read_ops;
|
||||
u64 write_ops;
|
||||
u64 read_bytes;
|
||||
u64 write_bytes;
|
||||
u64 read_latency_us;
|
||||
u64 write_latency_us;
|
||||
};
|
||||
|
||||
static struct kmem_cache *ceph_subvol_metric_entry_cachep;
|
||||
|
||||
void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
|
||||
{
|
||||
spin_lock_init(&tracker->lock);
|
||||
tracker->tree = RB_ROOT_CACHED;
|
||||
tracker->nr_entries = 0;
|
||||
tracker->enabled = false;
|
||||
atomic64_set(&tracker->snapshot_attempts, 0);
|
||||
atomic64_set(&tracker->snapshot_empty, 0);
|
||||
atomic64_set(&tracker->snapshot_failures, 0);
|
||||
atomic64_set(&tracker->record_calls, 0);
|
||||
atomic64_set(&tracker->record_disabled, 0);
|
||||
atomic64_set(&tracker->record_no_subvol, 0);
|
||||
atomic64_set(&tracker->total_read_ops, 0);
|
||||
atomic64_set(&tracker->total_read_bytes, 0);
|
||||
atomic64_set(&tracker->total_write_ops, 0);
|
||||
atomic64_set(&tracker->total_write_bytes, 0);
|
||||
}
|
||||
|
||||
static struct ceph_subvol_metric_rb_entry *
|
||||
__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
|
||||
{
|
||||
struct rb_node *node;
|
||||
|
||||
node = tracker->tree.rb_root.rb_node;
|
||||
while (node) {
|
||||
struct ceph_subvol_metric_rb_entry *entry =
|
||||
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
||||
|
||||
if (subvol_id < entry->subvolume_id)
|
||||
node = node->rb_left;
|
||||
else if (subvol_id > entry->subvolume_id)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return entry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ceph_subvol_metric_rb_entry *
|
||||
__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
|
||||
struct ceph_subvol_metric_rb_entry *entry)
|
||||
{
|
||||
struct rb_node **link = &tracker->tree.rb_root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
bool leftmost = true;
|
||||
|
||||
while (*link) {
|
||||
struct ceph_subvol_metric_rb_entry *cur =
|
||||
rb_entry(*link, struct ceph_subvol_metric_rb_entry, node);
|
||||
|
||||
parent = *link;
|
||||
if (entry->subvolume_id < cur->subvolume_id)
|
||||
link = &(*link)->rb_left;
|
||||
else if (entry->subvolume_id > cur->subvolume_id) {
|
||||
link = &(*link)->rb_right;
|
||||
leftmost = false;
|
||||
} else
|
||||
return cur;
|
||||
}
|
||||
|
||||
rb_link_node(&entry->node, parent, link);
|
||||
rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
|
||||
tracker->nr_entries++;
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void ceph_subvolume_metrics_clear_locked(
|
||||
struct ceph_subvolume_metrics_tracker *tracker)
|
||||
{
|
||||
struct rb_node *node = rb_first_cached(&tracker->tree);
|
||||
|
||||
while (node) {
|
||||
struct ceph_subvol_metric_rb_entry *entry =
|
||||
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
||||
struct rb_node *next = rb_next(node);
|
||||
|
||||
rb_erase_cached(&entry->node, &tracker->tree);
|
||||
tracker->nr_entries--;
|
||||
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
||||
node = next;
|
||||
}
|
||||
|
||||
tracker->tree = RB_ROOT_CACHED;
|
||||
}
|
||||
|
||||
void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
|
||||
{
|
||||
spin_lock(&tracker->lock);
|
||||
ceph_subvolume_metrics_clear_locked(tracker);
|
||||
tracker->enabled = false;
|
||||
spin_unlock(&tracker->lock);
|
||||
}
|
||||
|
||||
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
|
||||
bool enable)
|
||||
{
|
||||
spin_lock(&tracker->lock);
|
||||
if (enable) {
|
||||
tracker->enabled = true;
|
||||
} else {
|
||||
tracker->enabled = false;
|
||||
ceph_subvolume_metrics_clear_locked(tracker);
|
||||
}
|
||||
spin_unlock(&tracker->lock);
|
||||
}
|
||||
|
||||
/**
 * ceph_subvolume_metrics_record - account one I/O against a subvolume
 * @tracker: tracker to update
 * @subvol_id: subvolume the I/O belongs to (CEPH_SUBVOLUME_ID_NONE skips)
 * @is_write: true for a write, false for a read
 * @size: number of bytes transferred (zero is ignored)
 * @latency_us: operation latency in microseconds (zero is ignored)
 *
 * Looks up (or lazily creates) the per-subvolume rb-tree entry and adds
 * the op count, byte count and latency to it, plus the tracker-wide
 * atomic totals.  Allocation happens with the lock dropped, hence the
 * retry loop below.  Allocation failure silently drops the sample.
 */
void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
				   u64 subvol_id, bool is_write,
				   size_t size, u64 latency_us)
{
	struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
	bool retry = false;

	/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
	if (!READ_ONCE(tracker->enabled) ||
	    subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
		return;

	/*
	 * Retry loop for lock-free allocation pattern:
	 * 1. First iteration: lookup under lock, if miss -> drop lock, alloc, retry
	 * 2. Second iteration: lookup again (may have been inserted), insert if still missing
	 * 3. On race (another thread inserted same key): free our alloc, retry
	 * All successful paths exit via return, so retry flag doesn't need reset.
	 */
	do {
		spin_lock(&tracker->lock);
		/* Re-check under the lock: disable may have raced with us. */
		if (!tracker->enabled) {
			spin_unlock(&tracker->lock);
			if (new_entry)
				kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
			return;
		}

		entry = __lookup_entry(tracker, subvol_id);
		if (!entry) {
			if (!new_entry) {
				/* Drop the lock to allocate, then retry. */
				spin_unlock(&tracker->lock);
				new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
							      GFP_NOFS);
				if (!new_entry)
					return;
				new_entry->subvolume_id = subvol_id;
				retry = true;
				continue;
			}
			entry = __insert_entry(tracker, new_entry);
			if (entry != new_entry) {
				/* raced with another insert */
				spin_unlock(&tracker->lock);
				kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
				new_entry = NULL;
				retry = true;
				continue;
			}
			/* Ownership transferred to the tree. */
			new_entry = NULL;
		}

		if (is_write) {
			entry->write_ops++;
			entry->write_bytes += size;
			entry->write_latency_us += latency_us;
			atomic64_inc(&tracker->total_write_ops);
			atomic64_add(size, &tracker->total_write_bytes);
		} else {
			entry->read_ops++;
			entry->read_bytes += size;
			entry->read_latency_us += latency_us;
			atomic64_inc(&tracker->total_read_ops);
			atomic64_add(size, &tracker->total_read_bytes);
		}
		spin_unlock(&tracker->lock);
		/*
		 * If the retry's lookup hit an entry inserted by a racer,
		 * our speculative allocation is still unused - free it.
		 */
		if (new_entry)
			kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
		return;
	} while (retry);
}
|
||||
|
||||
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
|
||||
struct ceph_subvol_metric_snapshot **out,
|
||||
u32 *nr, bool consume)
|
||||
{
|
||||
struct ceph_subvol_metric_snapshot *snap = NULL;
|
||||
struct rb_node *node;
|
||||
u32 count = 0, idx = 0;
|
||||
int ret = 0;
|
||||
|
||||
*out = NULL;
|
||||
*nr = 0;
|
||||
|
||||
if (!READ_ONCE(tracker->enabled))
|
||||
return 0;
|
||||
|
||||
atomic64_inc(&tracker->snapshot_attempts);
|
||||
|
||||
spin_lock(&tracker->lock);
|
||||
for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
|
||||
struct ceph_subvol_metric_rb_entry *entry =
|
||||
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
||||
|
||||
/* Include entries with ANY I/O activity (read OR write) */
|
||||
if (entry->read_ops || entry->write_ops)
|
||||
count++;
|
||||
}
|
||||
spin_unlock(&tracker->lock);
|
||||
|
||||
if (!count) {
|
||||
atomic64_inc(&tracker->snapshot_empty);
|
||||
return 0;
|
||||
}
|
||||
|
||||
snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
|
||||
if (!snap) {
|
||||
atomic64_inc(&tracker->snapshot_failures);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
spin_lock(&tracker->lock);
|
||||
node = rb_first_cached(&tracker->tree);
|
||||
while (node) {
|
||||
struct ceph_subvol_metric_rb_entry *entry =
|
||||
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
||||
struct rb_node *next = rb_next(node);
|
||||
|
||||
/* Skip entries with NO I/O activity at all */
|
||||
if (!entry->read_ops && !entry->write_ops) {
|
||||
rb_erase_cached(&entry->node, &tracker->tree);
|
||||
tracker->nr_entries--;
|
||||
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
||||
node = next;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (idx >= count) {
|
||||
pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
|
||||
idx, count);
|
||||
break;
|
||||
}
|
||||
|
||||
snap[idx].subvolume_id = entry->subvolume_id;
|
||||
snap[idx].read_ops = entry->read_ops;
|
||||
snap[idx].write_ops = entry->write_ops;
|
||||
snap[idx].read_bytes = entry->read_bytes;
|
||||
snap[idx].write_bytes = entry->write_bytes;
|
||||
snap[idx].read_latency_us = entry->read_latency_us;
|
||||
snap[idx].write_latency_us = entry->write_latency_us;
|
||||
idx++;
|
||||
|
||||
if (consume) {
|
||||
entry->read_ops = 0;
|
||||
entry->write_ops = 0;
|
||||
entry->read_bytes = 0;
|
||||
entry->write_bytes = 0;
|
||||
entry->read_latency_us = 0;
|
||||
entry->write_latency_us = 0;
|
||||
rb_erase_cached(&entry->node, &tracker->tree);
|
||||
tracker->nr_entries--;
|
||||
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
||||
}
|
||||
node = next;
|
||||
}
|
||||
spin_unlock(&tracker->lock);
|
||||
|
||||
if (!idx) {
|
||||
kfree(snap);
|
||||
snap = NULL;
|
||||
ret = 0;
|
||||
} else {
|
||||
*nr = idx;
|
||||
*out = snap;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Free an array returned by ceph_subvolume_metrics_snapshot().
 * Accepts NULL (kfree is a no-op on NULL).
 */
void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
{
	kfree(snapshot);
}
|
||||
|
||||
/*
|
||||
* Dump subvolume metrics to a seq_file for debugfs.
|
||||
*
|
||||
* Iterates the rb-tree directly under spinlock to avoid allocation.
|
||||
* The lock hold time is minimal since we're only doing seq_printf calls.
|
||||
*/
|
||||
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
|
||||
struct seq_file *s)
|
||||
{
|
||||
struct rb_node *node;
|
||||
bool found = false;
|
||||
|
||||
spin_lock(&tracker->lock);
|
||||
if (!tracker->enabled) {
|
||||
spin_unlock(&tracker->lock);
|
||||
seq_puts(s, "subvolume metrics disabled\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
|
||||
struct ceph_subvol_metric_rb_entry *entry =
|
||||
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
||||
u64 avg_rd_lat, avg_wr_lat;
|
||||
|
||||
if (!entry->read_ops && !entry->write_ops)
|
||||
continue;
|
||||
|
||||
if (!found) {
|
||||
seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n");
|
||||
seq_puts(s, "------------------------------------------------------------------------------------------------\n");
|
||||
found = true;
|
||||
}
|
||||
|
||||
avg_rd_lat = entry->read_ops ?
|
||||
div64_u64(entry->read_latency_us, entry->read_ops) : 0;
|
||||
avg_wr_lat = entry->write_ops ?
|
||||
div64_u64(entry->write_latency_us, entry->write_ops) : 0;
|
||||
|
||||
seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
|
||||
entry->subvolume_id,
|
||||
entry->read_ops,
|
||||
entry->read_bytes,
|
||||
avg_rd_lat,
|
||||
entry->write_ops,
|
||||
entry->write_bytes,
|
||||
avg_wr_lat);
|
||||
}
|
||||
spin_unlock(&tracker->lock);
|
||||
|
||||
if (!found)
|
||||
seq_puts(s, "(no subvolume metrics collected)\n");
|
||||
}
|
||||
|
||||
void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
|
||||
struct ceph_inode_info *ci,
|
||||
bool is_write, size_t bytes,
|
||||
ktime_t start, ktime_t end)
|
||||
{
|
||||
struct ceph_subvolume_metrics_tracker *tracker;
|
||||
u64 subvol_id;
|
||||
s64 delta_us;
|
||||
|
||||
if (!mdsc || !ci || !bytes)
|
||||
return;
|
||||
|
||||
tracker = &mdsc->subvol_metrics;
|
||||
atomic64_inc(&tracker->record_calls);
|
||||
|
||||
if (!ceph_subvolume_metrics_enabled(tracker)) {
|
||||
atomic64_inc(&tracker->record_disabled);
|
||||
return;
|
||||
}
|
||||
|
||||
subvol_id = READ_ONCE(ci->i_subvolume_id);
|
||||
if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
|
||||
atomic64_inc(&tracker->record_no_subvol);
|
||||
return;
|
||||
}
|
||||
|
||||
delta_us = ktime_to_us(ktime_sub(end, start));
|
||||
if (delta_us <= 0)
|
||||
delta_us = 1;
|
||||
|
||||
ceph_subvolume_metrics_record(tracker, subvol_id, is_write,
|
||||
bytes, (u64)delta_us);
|
||||
}
|
||||
|
||||
/*
 * Create the slab cache for per-subvolume metric entries.
 * Called once at module init; returns -ENOMEM on failure.
 */
int __init ceph_subvolume_metrics_cache_init(void)
{
	ceph_subvol_metric_entry_cachep =
		KMEM_CACHE(ceph_subvol_metric_rb_entry, SLAB_RECLAIM_ACCOUNT);
	return ceph_subvol_metric_entry_cachep ? 0 : -ENOMEM;
}
|
||||
|
||||
/* Destroy the entry slab cache (kmem_cache_destroy accepts NULL). */
void ceph_subvolume_metrics_cache_destroy(void)
{
	kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
}
|
||||
97
fs/ceph/subvolume_metrics.h
Normal file
97
fs/ceph/subvolume_metrics.h
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _FS_CEPH_SUBVOLUME_METRICS_H
|
||||
#define _FS_CEPH_SUBVOLUME_METRICS_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/atomic.h>
|
||||
|
||||
struct seq_file;
|
||||
struct ceph_mds_client;
|
||||
struct ceph_inode_info;
|
||||
|
||||
/**
 * struct ceph_subvol_metric_snapshot - Point-in-time snapshot of subvolume metrics
 * @subvolume_id: Subvolume identifier (inode number of subvolume root)
 * @read_ops: Number of read operations since last snapshot
 * @write_ops: Number of write operations since last snapshot
 * @read_bytes: Total bytes read since last snapshot
 * @write_bytes: Total bytes written since last snapshot
 * @read_latency_us: Sum of read latencies in microseconds (for avg calculation)
 * @write_latency_us: Sum of write latencies in microseconds (for avg calculation)
 *
 * Arrays of these are produced by ceph_subvolume_metrics_snapshot() and
 * must be released with ceph_subvolume_metrics_free_snapshot().
 */
struct ceph_subvol_metric_snapshot {
	u64 subvolume_id;
	u64 read_ops;
	u64 write_ops;
	u64 read_bytes;
	u64 write_bytes;
	u64 read_latency_us;
	u64 write_latency_us;
};
|
||||
|
||||
/**
 * struct ceph_subvolume_metrics_tracker - Tracks per-subvolume I/O metrics
 * @lock: Protects @tree and @nr_entries during concurrent access
 * @tree: Red-black tree of per-subvolume entries, keyed by subvolume_id
 * @nr_entries: Number of entries currently in @tree
 * @enabled: Whether collection is enabled (requires MDS feature support)
 * @snapshot_attempts: Debug counter: total ceph_subvolume_metrics_snapshot() calls
 * @snapshot_empty: Debug counter: snapshots that found no data to report
 * @snapshot_failures: Debug counter: snapshots that failed to allocate memory
 * @record_calls: Debug counter: total ceph_subvolume_metrics_record() calls
 * @record_disabled: Debug counter: record calls skipped because disabled
 * @record_no_subvol: Debug counter: record calls skipped (no subvolume_id)
 * @total_read_ops: Cumulative read ops across all snapshots (never reset)
 * @total_read_bytes: Cumulative bytes read across all snapshots (never reset)
 * @total_write_ops: Cumulative write ops across all snapshots (never reset)
 * @total_write_bytes: Cumulative bytes written across all snapshots (never reset)
 *
 * The atomic64_t counters are updated without taking @lock.  @enabled is
 * read locklessly with READ_ONCE() on the record/snapshot fast paths and
 * re-checked under @lock before entries are touched.
 */
struct ceph_subvolume_metrics_tracker {
	spinlock_t lock;
	struct rb_root_cached tree;
	u32 nr_entries;
	bool enabled;
	atomic64_t snapshot_attempts;
	atomic64_t snapshot_empty;
	atomic64_t snapshot_failures;
	atomic64_t record_calls;
	atomic64_t record_disabled;
	atomic64_t record_no_subvol;
	atomic64_t total_read_ops;
	atomic64_t total_read_bytes;
	atomic64_t total_write_ops;
	atomic64_t total_write_bytes;
};
|
||||
|
||||
/* Tracker lifecycle. */
void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker);
void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker);
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
				   bool enable);

/* Sample recording, snapshotting for the MDS report, and debugfs dump. */
void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
				   u64 subvol_id, bool is_write,
				   size_t size, u64 latency_us);
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
				    struct ceph_subvol_metric_snapshot **out,
				    u32 *nr, bool consume);
void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot);
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
				 struct seq_file *s);

/* Convenience wrapper used by the file read/write and writeback paths. */
void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci,
				      bool is_write, size_t bytes,
				      ktime_t start, ktime_t end);
|
||||
|
||||
/* Lockless check of whether metrics collection is currently enabled. */
static inline bool ceph_subvolume_metrics_enabled(
		const struct ceph_subvolume_metrics_tracker *tracker)
{
	return READ_ONCE(tracker->enabled);
}
|
||||
|
||||
int __init ceph_subvolume_metrics_cache_init(void);
|
||||
void ceph_subvolume_metrics_cache_destroy(void);
|
||||
|
||||
#endif /* _FS_CEPH_SUBVOLUME_METRICS_H */
|
||||
|
|
@ -21,6 +21,7 @@
|
|||
#include "mds_client.h"
|
||||
#include "cache.h"
|
||||
#include "crypto.h"
|
||||
#include "subvolume_metrics.h"
|
||||
|
||||
#include <linux/ceph/ceph_features.h>
|
||||
#include <linux/ceph/decode.h>
|
||||
|
|
@ -966,8 +967,14 @@ static int __init init_caches(void)
|
|||
if (!ceph_wb_pagevec_pool)
|
||||
goto bad_pagevec_pool;
|
||||
|
||||
error = ceph_subvolume_metrics_cache_init();
|
||||
if (error)
|
||||
goto bad_subvol_metrics;
|
||||
|
||||
return 0;
|
||||
|
||||
bad_subvol_metrics:
|
||||
mempool_destroy(ceph_wb_pagevec_pool);
|
||||
bad_pagevec_pool:
|
||||
kmem_cache_destroy(ceph_mds_request_cachep);
|
||||
bad_mds_req:
|
||||
|
|
@ -1004,6 +1011,7 @@ static void destroy_caches(void)
|
|||
kmem_cache_destroy(ceph_dir_file_cachep);
|
||||
kmem_cache_destroy(ceph_mds_request_cachep);
|
||||
mempool_destroy(ceph_wb_pagevec_pool);
|
||||
ceph_subvolume_metrics_cache_destroy();
|
||||
}
|
||||
|
||||
static void __ceph_umount_begin(struct ceph_fs_client *fsc)
|
||||
|
|
|
|||
|
|
@ -179,6 +179,7 @@ struct ceph_fs_client {
|
|||
struct dentry *debugfs_status;
|
||||
struct dentry *debugfs_mds_sessions;
|
||||
struct dentry *debugfs_metrics_dir;
|
||||
struct dentry *debugfs_subvolume_metrics;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CEPH_FSCACHE
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user