mirror of
https://github.com/torvalds/linux.git
synced 2026-05-28 00:53:34 +02:00
perf lock contention: Symbolize zone->lock using BTF
The struct zone is embedded in struct pglist_data, which can be allocated
for each NUMA node early in the boot process. As it's neither a slab object
nor a global lock, it was not symbolized.
Since the zone->lock is often contended, it'd be nice if we could
symbolize it. On NUMA systems, the node_data array holds pointers to
struct pglist_data. By following each pointer, perf can calculate the
address of each zone and its lock using BTF. On UMA, it can just use
contig_page_data and its zones.
The following example shows the zone lock contention at the end.
$ sudo ./perf lock con -abl -E 5 -- ./perf bench sched messaging
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.038 [sec]
contended total wait max wait avg wait address symbol
5167 18.17 ms 10.27 us 3.52 us ffff953340052d00 &kmem_cache_node (spinlock)
38 11.75 ms 465.49 us 309.13 us ffff95334060c480 &sock_inode_cache (spinlock)
3916 10.13 ms 10.43 us 2.59 us ffff953342aecb40 &kmem_cache_node (spinlock)
2963 10.02 ms 13.75 us 3.38 us ffff9533d2344098 &kmalloc-rnd-08-2k (spinlock)
216 5.05 ms 99.49 us 23.39 us ffff9542bf7d65d0 zone_lock (spinlock)
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: bpf@vger.kernel.org
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250401063055.7431-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
2d099ccaad
commit
13f35928a4
|
|
@ -12,6 +12,7 @@
|
|||
#include "util/lock-contention.h"
|
||||
#include <linux/zalloc.h>
|
||||
#include <linux/string.h>
|
||||
#include <api/fs/fs.h>
|
||||
#include <bpf/bpf.h>
|
||||
#include <bpf/btf.h>
|
||||
#include <inttypes.h>
|
||||
|
|
@ -35,28 +36,26 @@ static bool slab_cache_equal(long key1, long key2, void *ctx __maybe_unused)
|
|||
|
||||
static void check_slab_cache_iter(struct lock_contention *con)
|
||||
{
|
||||
struct btf *btf = btf__load_vmlinux_btf();
|
||||
s32 ret;
|
||||
|
||||
hashmap__init(&slab_hash, slab_cache_hash, slab_cache_equal, /*ctx=*/NULL);
|
||||
|
||||
if (btf == NULL) {
|
||||
con->btf = btf__load_vmlinux_btf();
|
||||
if (con->btf == NULL) {
|
||||
pr_debug("BTF loading failed: %s\n", strerror(errno));
|
||||
return;
|
||||
}
|
||||
|
||||
ret = btf__find_by_name_kind(btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
|
||||
ret = btf__find_by_name_kind(con->btf, "bpf_iter__kmem_cache", BTF_KIND_STRUCT);
|
||||
if (ret < 0) {
|
||||
bpf_program__set_autoload(skel->progs.slab_cache_iter, false);
|
||||
pr_debug("slab cache iterator is not available: %d\n", ret);
|
||||
goto out;
|
||||
return;
|
||||
}
|
||||
|
||||
has_slab_iter = true;
|
||||
|
||||
bpf_map__set_max_entries(skel->maps.slab_caches, con->map_nr_entries);
|
||||
out:
|
||||
btf__free(btf);
|
||||
}
|
||||
|
||||
static void run_slab_cache_iter(void)
|
||||
|
|
@ -109,6 +108,75 @@ static void exit_slab_cache_iter(void)
|
|||
hashmap__clear(&slab_hash);
|
||||
}
|
||||
|
||||
static void init_numa_data(struct lock_contention *con)
|
||||
{
|
||||
struct symbol *sym;
|
||||
struct map *kmap;
|
||||
char *buf = NULL, *p;
|
||||
size_t len;
|
||||
long last = -1;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* 'struct zone' is embedded in 'struct pglist_data' as an array.
|
||||
* As we may not have full information of the struct zone in the
|
||||
* (fake) vmlinux.h, let's get the actual size from BTF.
|
||||
*/
|
||||
ret = btf__find_by_name_kind(con->btf, "zone", BTF_KIND_STRUCT);
|
||||
if (ret < 0) {
|
||||
pr_debug("cannot get type of struct zone: %d\n", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = btf__resolve_size(con->btf, ret);
|
||||
if (ret < 0) {
|
||||
pr_debug("cannot get size of struct zone: %d\n", ret);
|
||||
return;
|
||||
}
|
||||
skel->rodata->sizeof_zone = ret;
|
||||
|
||||
/* UMA system doesn't have 'node_data[]' - just use contig_page_data. */
|
||||
sym = machine__find_kernel_symbol_by_name(con->machine,
|
||||
"contig_page_data",
|
||||
&kmap);
|
||||
if (sym) {
|
||||
skel->rodata->contig_page_data_addr = map__unmap_ip(kmap, sym->start);
|
||||
map__put(kmap);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The 'node_data' is an array of pointers to struct pglist_data.
|
||||
* It needs to follow the pointer for each node in BPF to get the
|
||||
* address of struct pglist_data and its zones.
|
||||
*/
|
||||
sym = machine__find_kernel_symbol_by_name(con->machine,
|
||||
"node_data",
|
||||
&kmap);
|
||||
if (sym == NULL)
|
||||
return;
|
||||
|
||||
skel->rodata->node_data_addr = map__unmap_ip(kmap, sym->start);
|
||||
map__put(kmap);
|
||||
|
||||
/* get the number of online nodes using the last node number + 1 */
|
||||
ret = sysfs__read_str("devices/system/node/online", &buf, &len);
|
||||
if (ret < 0) {
|
||||
pr_debug("failed to read online node: %d\n", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
p = buf;
|
||||
while (p && *p) {
|
||||
last = strtol(p, &p, 0);
|
||||
|
||||
if (p && (*p == ',' || *p == '-' || *p == '\n'))
|
||||
p++;
|
||||
}
|
||||
skel->rodata->nr_nodes = last + 1;
|
||||
free(buf);
|
||||
}
|
||||
|
||||
int lock_contention_prepare(struct lock_contention *con)
|
||||
{
|
||||
int i, fd;
|
||||
|
|
@ -218,6 +286,8 @@ int lock_contention_prepare(struct lock_contention *con)
|
|||
|
||||
bpf_map__set_max_entries(skel->maps.slab_filter, nslabs);
|
||||
|
||||
init_numa_data(con);
|
||||
|
||||
if (lock_contention_bpf__load(skel) < 0) {
|
||||
pr_err("Failed to load lock-contention BPF skeleton\n");
|
||||
return -1;
|
||||
|
|
@ -505,6 +575,11 @@ static const char *lock_contention_get_name(struct lock_contention *con,
|
|||
return "rq_lock";
|
||||
}
|
||||
|
||||
if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
|
||||
if (flags == LOCK_CLASS_ZONE_LOCK)
|
||||
return "zone_lock";
|
||||
}
|
||||
|
||||
/* look slab_hash for dynamic locks in a slab object */
|
||||
if (hashmap__find(&slab_hash, flags & LCB_F_SLAB_ID_MASK, &slab_data)) {
|
||||
snprintf(name_buf, sizeof(name_buf), "&%s", slab_data->name);
|
||||
|
|
@ -743,6 +818,7 @@ int lock_contention_finish(struct lock_contention *con)
|
|||
}
|
||||
|
||||
exit_slab_cache_iter();
|
||||
btf__free(con->btf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@
|
|||
/* for collect_lock_syms(). 4096 was rejected by the verifier */
|
||||
#define MAX_CPUS 1024
|
||||
|
||||
/* for collect_zone_lock(). It should be more than the actual zones. */
|
||||
#define MAX_ZONES 10
|
||||
|
||||
/* lock contention flags from include/trace/events/lock.h */
|
||||
#define LCB_F_SPIN (1U << 0)
|
||||
#define LCB_F_READ (1U << 1)
|
||||
|
|
@ -801,6 +804,11 @@ int contention_end(u64 *ctx)
|
|||
|
||||
extern struct rq runqueues __ksym;
|
||||
|
||||
const volatile __u64 contig_page_data_addr;
|
||||
const volatile __u64 node_data_addr;
|
||||
const volatile int nr_nodes;
|
||||
const volatile int sizeof_zone;
|
||||
|
||||
struct rq___old {
|
||||
raw_spinlock_t lock;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
|
@ -809,6 +817,59 @@ struct rq___new {
|
|||
raw_spinlock_t __lock;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
static void collect_zone_lock(void)
|
||||
{
|
||||
__u64 nr_zones, zone_off;
|
||||
__u64 lock_addr, lock_off;
|
||||
__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
|
||||
|
||||
zone_off = offsetof(struct pglist_data, node_zones);
|
||||
lock_off = offsetof(struct zone, lock);
|
||||
|
||||
if (contig_page_data_addr) {
|
||||
struct pglist_data *contig_page_data;
|
||||
|
||||
contig_page_data = (void *)(long)contig_page_data_addr;
|
||||
nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
|
||||
|
||||
for (int i = 0; i < MAX_ZONES; i++) {
|
||||
__u64 zone_addr;
|
||||
|
||||
if (i >= nr_zones)
|
||||
break;
|
||||
|
||||
zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
|
||||
lock_addr = zone_addr + lock_off;
|
||||
|
||||
bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
|
||||
}
|
||||
} else if (nr_nodes > 0) {
|
||||
struct pglist_data **node_data = (void *)(long)node_data_addr;
|
||||
|
||||
for (int i = 0; i < nr_nodes; i++) {
|
||||
struct pglist_data *pgdat = NULL;
|
||||
int err;
|
||||
|
||||
err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
|
||||
if (err < 0 || pgdat == NULL)
|
||||
break;
|
||||
|
||||
nr_zones = BPF_CORE_READ(pgdat, nr_zones);
|
||||
for (int k = 0; k < MAX_ZONES; k++) {
|
||||
__u64 zone_addr;
|
||||
|
||||
if (k >= nr_zones)
|
||||
break;
|
||||
|
||||
zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
|
||||
lock_addr = zone_addr + lock_off;
|
||||
|
||||
bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SEC("raw_tp/bpf_test_finish")
|
||||
int BPF_PROG(collect_lock_syms)
|
||||
{
|
||||
|
|
@ -830,6 +891,9 @@ int BPF_PROG(collect_lock_syms)
|
|||
lock_flag = LOCK_CLASS_RQLOCK;
|
||||
bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
|
||||
}
|
||||
|
||||
collect_zone_lock();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ enum lock_aggr_mode {
|
|||
/*
 * Class of a lock whose address was recorded in the lock_syms BPF map.
 * The BPF side stores one of these as the map value; user space looks the
 * lock address up and prints a fixed name for the class.
 */
enum lock_class_sym {
|
||||
LOCK_CLASS_NONE,
|
||||
/* stored by collect_lock_syms() for run queue lock addresses */
LOCK_CLASS_RQLOCK,
|
||||
/* stored by collect_zone_lock(); shown as "zone_lock" */
LOCK_CLASS_ZONE_LOCK,
|
||||
};
|
||||
|
||||
struct slab_cache_data {
|
||||
|
|
|
|||
|
|
@ -203,4 +203,13 @@ struct bpf_iter__kmem_cache {
|
|||
struct kmem_cache *s;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * Minimal definition of struct zone: only the 'lock' member is declared,
 * which is all that is needed to take its offset.  The size of this
 * definition is therefore not the real size of the kernel's struct zone.
 */
struct zone {
|
||||
spinlock_t lock;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * Minimal definition of struct pglist_data: only the zone array and the
 * number of zones are declared.
 */
struct pglist_data {
|
||||
struct zone node_zones[6]; /* value for all possible config */
|
||||
int nr_zones;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
#endif // __VMLINUX_H
|
||||
|
|
|
|||
|
|
@ -142,6 +142,7 @@ struct lock_contention {
|
|||
struct lock_filter *filters;
|
||||
struct lock_contention_fails fails;
|
||||
struct rb_root cgroups;
|
||||
void *btf;
|
||||
unsigned long map_nr_entries;
|
||||
int max_stack;
|
||||
int stack_skip;
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user