mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 01:53:29 +02:00
Add complete infrastructure for per-subvolume I/O metrics collection and reporting to the MDS. This enables administrators to monitor I/O patterns at the subvolume granularity, which is useful for multi-tenant CephFS deployments. This patch adds: - CEPHFS_FEATURE_SUBVOLUME_METRICS feature flag for MDS negotiation - CEPH_SUBVOLUME_ID_NONE constant (0) for unknown/unset state - Red-black tree based metrics tracker for efficient per-subvolume aggregation with kmem_cache for entry allocations - Wire format encoding matching the MDS C++ AggregatedIOMetrics struct - Integration with the existing CLIENT_METRICS message - Recording of I/O operations from file read/write and writeback paths - Debugfs interfaces for monitoring (metrics/subvolumes, metrics/metric_features) Metrics tracked per subvolume include: - Read/write operation counts - Read/write byte counts - Read/write latency sums (for average calculation) The metrics are periodically sent to the MDS as part of the existing metrics reporting infrastructure when the MDS advertises support for the SUBVOLUME_METRICS feature. CEPH_SUBVOLUME_ID_NONE enforces subvolume_id immutability. Following the FUSE client convention, 0 means unknown/unset. Once an inode has a valid (non-zero) subvolume_id, it should not change during the inode's lifetime. Signed-off-by: Alex Markuze <amarkuze@redhat.com> Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
417 lines
11 KiB
C
417 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/ceph/ceph_debug.h>
|
|
|
|
#include <linux/math64.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/seq_file.h>
|
|
|
|
#include "subvolume_metrics.h"
|
|
#include "mds_client.h"
|
|
#include "super.h"
|
|
|
|
/**
|
|
* struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
|
|
* @node: Red-black tree linkage for tracker->tree
|
|
* @subvolume_id: Subvolume identifier (key for rb-tree lookup)
|
|
* @read_ops: Accumulated read operation count since last snapshot
|
|
* @write_ops: Accumulated write operation count since last snapshot
|
|
* @read_bytes: Accumulated bytes read since last snapshot
|
|
* @write_bytes: Accumulated bytes written since last snapshot
|
|
* @read_latency_us: Sum of read latencies in microseconds
|
|
* @write_latency_us: Sum of write latencies in microseconds
|
|
*/
|
|
struct ceph_subvol_metric_rb_entry {
|
|
struct rb_node node;
|
|
u64 subvolume_id;
|
|
u64 read_ops;
|
|
u64 write_ops;
|
|
u64 read_bytes;
|
|
u64 write_bytes;
|
|
u64 read_latency_us;
|
|
u64 write_latency_us;
|
|
};
|
|
|
|
/*
 * Slab cache from which every struct ceph_subvol_metric_rb_entry in this
 * file is allocated and to which it is returned. Created by
 * ceph_subvolume_metrics_cache_init() and torn down by
 * ceph_subvolume_metrics_cache_destroy().
 */
static struct kmem_cache *ceph_subvol_metric_entry_cachep;
|
|
|
|
void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
|
|
{
|
|
spin_lock_init(&tracker->lock);
|
|
tracker->tree = RB_ROOT_CACHED;
|
|
tracker->nr_entries = 0;
|
|
tracker->enabled = false;
|
|
atomic64_set(&tracker->snapshot_attempts, 0);
|
|
atomic64_set(&tracker->snapshot_empty, 0);
|
|
atomic64_set(&tracker->snapshot_failures, 0);
|
|
atomic64_set(&tracker->record_calls, 0);
|
|
atomic64_set(&tracker->record_disabled, 0);
|
|
atomic64_set(&tracker->record_no_subvol, 0);
|
|
atomic64_set(&tracker->total_read_ops, 0);
|
|
atomic64_set(&tracker->total_read_bytes, 0);
|
|
atomic64_set(&tracker->total_write_ops, 0);
|
|
atomic64_set(&tracker->total_write_bytes, 0);
|
|
}
|
|
|
|
static struct ceph_subvol_metric_rb_entry *
|
|
__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
|
|
{
|
|
struct rb_node *node;
|
|
|
|
node = tracker->tree.rb_root.rb_node;
|
|
while (node) {
|
|
struct ceph_subvol_metric_rb_entry *entry =
|
|
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
|
|
|
if (subvol_id < entry->subvolume_id)
|
|
node = node->rb_left;
|
|
else if (subvol_id > entry->subvolume_id)
|
|
node = node->rb_right;
|
|
else
|
|
return entry;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct ceph_subvol_metric_rb_entry *
|
|
__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
|
|
struct ceph_subvol_metric_rb_entry *entry)
|
|
{
|
|
struct rb_node **link = &tracker->tree.rb_root.rb_node;
|
|
struct rb_node *parent = NULL;
|
|
bool leftmost = true;
|
|
|
|
while (*link) {
|
|
struct ceph_subvol_metric_rb_entry *cur =
|
|
rb_entry(*link, struct ceph_subvol_metric_rb_entry, node);
|
|
|
|
parent = *link;
|
|
if (entry->subvolume_id < cur->subvolume_id)
|
|
link = &(*link)->rb_left;
|
|
else if (entry->subvolume_id > cur->subvolume_id) {
|
|
link = &(*link)->rb_right;
|
|
leftmost = false;
|
|
} else
|
|
return cur;
|
|
}
|
|
|
|
rb_link_node(&entry->node, parent, link);
|
|
rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
|
|
tracker->nr_entries++;
|
|
return entry;
|
|
}
|
|
|
|
static void ceph_subvolume_metrics_clear_locked(
|
|
struct ceph_subvolume_metrics_tracker *tracker)
|
|
{
|
|
struct rb_node *node = rb_first_cached(&tracker->tree);
|
|
|
|
while (node) {
|
|
struct ceph_subvol_metric_rb_entry *entry =
|
|
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
|
struct rb_node *next = rb_next(node);
|
|
|
|
rb_erase_cached(&entry->node, &tracker->tree);
|
|
tracker->nr_entries--;
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
|
node = next;
|
|
}
|
|
|
|
tracker->tree = RB_ROOT_CACHED;
|
|
}
|
|
|
|
void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
|
|
{
|
|
spin_lock(&tracker->lock);
|
|
ceph_subvolume_metrics_clear_locked(tracker);
|
|
tracker->enabled = false;
|
|
spin_unlock(&tracker->lock);
|
|
}
|
|
|
|
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
|
|
bool enable)
|
|
{
|
|
spin_lock(&tracker->lock);
|
|
if (enable) {
|
|
tracker->enabled = true;
|
|
} else {
|
|
tracker->enabled = false;
|
|
ceph_subvolume_metrics_clear_locked(tracker);
|
|
}
|
|
spin_unlock(&tracker->lock);
|
|
}
|
|
|
|
void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
|
|
u64 subvol_id, bool is_write,
|
|
size_t size, u64 latency_us)
|
|
{
|
|
struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
|
|
bool retry = false;
|
|
|
|
/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
|
|
if (!READ_ONCE(tracker->enabled) ||
|
|
subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
|
|
return;
|
|
|
|
/*
|
|
* Retry loop for lock-free allocation pattern:
|
|
* 1. First iteration: lookup under lock, if miss -> drop lock, alloc, retry
|
|
* 2. Second iteration: lookup again (may have been inserted), insert if still missing
|
|
* 3. On race (another thread inserted same key): free our alloc, retry
|
|
* All successful paths exit via return, so retry flag doesn't need reset.
|
|
*/
|
|
do {
|
|
spin_lock(&tracker->lock);
|
|
if (!tracker->enabled) {
|
|
spin_unlock(&tracker->lock);
|
|
if (new_entry)
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
|
|
return;
|
|
}
|
|
|
|
entry = __lookup_entry(tracker, subvol_id);
|
|
if (!entry) {
|
|
if (!new_entry) {
|
|
spin_unlock(&tracker->lock);
|
|
new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
|
|
GFP_NOFS);
|
|
if (!new_entry)
|
|
return;
|
|
new_entry->subvolume_id = subvol_id;
|
|
retry = true;
|
|
continue;
|
|
}
|
|
entry = __insert_entry(tracker, new_entry);
|
|
if (entry != new_entry) {
|
|
/* raced with another insert */
|
|
spin_unlock(&tracker->lock);
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
|
|
new_entry = NULL;
|
|
retry = true;
|
|
continue;
|
|
}
|
|
new_entry = NULL;
|
|
}
|
|
|
|
if (is_write) {
|
|
entry->write_ops++;
|
|
entry->write_bytes += size;
|
|
entry->write_latency_us += latency_us;
|
|
atomic64_inc(&tracker->total_write_ops);
|
|
atomic64_add(size, &tracker->total_write_bytes);
|
|
} else {
|
|
entry->read_ops++;
|
|
entry->read_bytes += size;
|
|
entry->read_latency_us += latency_us;
|
|
atomic64_inc(&tracker->total_read_ops);
|
|
atomic64_add(size, &tracker->total_read_bytes);
|
|
}
|
|
spin_unlock(&tracker->lock);
|
|
if (new_entry)
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
|
|
return;
|
|
} while (retry);
|
|
}
|
|
|
|
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
|
|
struct ceph_subvol_metric_snapshot **out,
|
|
u32 *nr, bool consume)
|
|
{
|
|
struct ceph_subvol_metric_snapshot *snap = NULL;
|
|
struct rb_node *node;
|
|
u32 count = 0, idx = 0;
|
|
int ret = 0;
|
|
|
|
*out = NULL;
|
|
*nr = 0;
|
|
|
|
if (!READ_ONCE(tracker->enabled))
|
|
return 0;
|
|
|
|
atomic64_inc(&tracker->snapshot_attempts);
|
|
|
|
spin_lock(&tracker->lock);
|
|
for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
|
|
struct ceph_subvol_metric_rb_entry *entry =
|
|
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
|
|
|
/* Include entries with ANY I/O activity (read OR write) */
|
|
if (entry->read_ops || entry->write_ops)
|
|
count++;
|
|
}
|
|
spin_unlock(&tracker->lock);
|
|
|
|
if (!count) {
|
|
atomic64_inc(&tracker->snapshot_empty);
|
|
return 0;
|
|
}
|
|
|
|
snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
|
|
if (!snap) {
|
|
atomic64_inc(&tracker->snapshot_failures);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
spin_lock(&tracker->lock);
|
|
node = rb_first_cached(&tracker->tree);
|
|
while (node) {
|
|
struct ceph_subvol_metric_rb_entry *entry =
|
|
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
|
struct rb_node *next = rb_next(node);
|
|
|
|
/* Skip entries with NO I/O activity at all */
|
|
if (!entry->read_ops && !entry->write_ops) {
|
|
rb_erase_cached(&entry->node, &tracker->tree);
|
|
tracker->nr_entries--;
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
|
node = next;
|
|
continue;
|
|
}
|
|
|
|
if (idx >= count) {
|
|
pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
|
|
idx, count);
|
|
break;
|
|
}
|
|
|
|
snap[idx].subvolume_id = entry->subvolume_id;
|
|
snap[idx].read_ops = entry->read_ops;
|
|
snap[idx].write_ops = entry->write_ops;
|
|
snap[idx].read_bytes = entry->read_bytes;
|
|
snap[idx].write_bytes = entry->write_bytes;
|
|
snap[idx].read_latency_us = entry->read_latency_us;
|
|
snap[idx].write_latency_us = entry->write_latency_us;
|
|
idx++;
|
|
|
|
if (consume) {
|
|
entry->read_ops = 0;
|
|
entry->write_ops = 0;
|
|
entry->read_bytes = 0;
|
|
entry->write_bytes = 0;
|
|
entry->read_latency_us = 0;
|
|
entry->write_latency_us = 0;
|
|
rb_erase_cached(&entry->node, &tracker->tree);
|
|
tracker->nr_entries--;
|
|
kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
|
|
}
|
|
node = next;
|
|
}
|
|
spin_unlock(&tracker->lock);
|
|
|
|
if (!idx) {
|
|
kfree(snap);
|
|
snap = NULL;
|
|
ret = 0;
|
|
} else {
|
|
*nr = idx;
|
|
*out = snap;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Release a snapshot array with kfree(). Passing NULL is harmless, as
 * kfree() accepts it.
 */
void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
{
	kfree(snapshot);
}
|
|
|
|
/*
|
|
* Dump subvolume metrics to a seq_file for debugfs.
|
|
*
|
|
* Iterates the rb-tree directly under spinlock to avoid allocation.
|
|
* The lock hold time is minimal since we're only doing seq_printf calls.
|
|
*/
|
|
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
|
|
struct seq_file *s)
|
|
{
|
|
struct rb_node *node;
|
|
bool found = false;
|
|
|
|
spin_lock(&tracker->lock);
|
|
if (!tracker->enabled) {
|
|
spin_unlock(&tracker->lock);
|
|
seq_puts(s, "subvolume metrics disabled\n");
|
|
return;
|
|
}
|
|
|
|
for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
|
|
struct ceph_subvol_metric_rb_entry *entry =
|
|
rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
|
|
u64 avg_rd_lat, avg_wr_lat;
|
|
|
|
if (!entry->read_ops && !entry->write_ops)
|
|
continue;
|
|
|
|
if (!found) {
|
|
seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n");
|
|
seq_puts(s, "------------------------------------------------------------------------------------------------\n");
|
|
found = true;
|
|
}
|
|
|
|
avg_rd_lat = entry->read_ops ?
|
|
div64_u64(entry->read_latency_us, entry->read_ops) : 0;
|
|
avg_wr_lat = entry->write_ops ?
|
|
div64_u64(entry->write_latency_us, entry->write_ops) : 0;
|
|
|
|
seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
|
|
entry->subvolume_id,
|
|
entry->read_ops,
|
|
entry->read_bytes,
|
|
avg_rd_lat,
|
|
entry->write_ops,
|
|
entry->write_bytes,
|
|
avg_wr_lat);
|
|
}
|
|
spin_unlock(&tracker->lock);
|
|
|
|
if (!found)
|
|
seq_puts(s, "(no subvolume metrics collected)\n");
|
|
}
|
|
|
|
void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
|
|
struct ceph_inode_info *ci,
|
|
bool is_write, size_t bytes,
|
|
ktime_t start, ktime_t end)
|
|
{
|
|
struct ceph_subvolume_metrics_tracker *tracker;
|
|
u64 subvol_id;
|
|
s64 delta_us;
|
|
|
|
if (!mdsc || !ci || !bytes)
|
|
return;
|
|
|
|
tracker = &mdsc->subvol_metrics;
|
|
atomic64_inc(&tracker->record_calls);
|
|
|
|
if (!ceph_subvolume_metrics_enabled(tracker)) {
|
|
atomic64_inc(&tracker->record_disabled);
|
|
return;
|
|
}
|
|
|
|
subvol_id = READ_ONCE(ci->i_subvolume_id);
|
|
if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
|
|
atomic64_inc(&tracker->record_no_subvol);
|
|
return;
|
|
}
|
|
|
|
delta_us = ktime_to_us(ktime_sub(end, start));
|
|
if (delta_us <= 0)
|
|
delta_us = 1;
|
|
|
|
ceph_subvolume_metrics_record(tracker, subvol_id, is_write,
|
|
bytes, (u64)delta_us);
|
|
}
|
|
|
|
/*
 * Create the slab cache used for per-subvolume metric entries.
 *
 * Returns 0 on success or -ENOMEM if the cache cannot be created.
 */
int __init ceph_subvolume_metrics_cache_init(void)
{
	ceph_subvol_metric_entry_cachep =
		KMEM_CACHE(ceph_subvol_metric_rb_entry, SLAB_RECLAIM_ACCOUNT);

	return ceph_subvol_metric_entry_cachep ? 0 : -ENOMEM;
}
|
|
|
|
void ceph_subvolume_metrics_cache_destroy(void)
|
|
{
|
|
kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
|
|
}
|