// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

// Compile-time upper bounds for the loops in this file, so that every loop
// has a constant limit.
#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data below are modified by userspace
//       (the perf tool) before loading, using the skeleton helpers.

1562306a36Sopenharmony_ci// single set of global perf events to measure
1662306a36Sopenharmony_cistruct {
1762306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
1862306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
1962306a36Sopenharmony_ci	__uint(value_size, sizeof(int));
2062306a36Sopenharmony_ci	__uint(max_entries, 1);
2162306a36Sopenharmony_ci} events SEC(".maps");
2262306a36Sopenharmony_ci
2362306a36Sopenharmony_ci// from cgroup id to event index
2462306a36Sopenharmony_cistruct {
2562306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_HASH);
2662306a36Sopenharmony_ci	__uint(key_size, sizeof(__u64));
2762306a36Sopenharmony_ci	__uint(value_size, sizeof(__u32));
2862306a36Sopenharmony_ci	__uint(max_entries, 1);
2962306a36Sopenharmony_ci} cgrp_idx SEC(".maps");
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci// per-cpu event snapshots to calculate delta
3262306a36Sopenharmony_cistruct {
3362306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
3462306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
3562306a36Sopenharmony_ci	__uint(value_size, sizeof(struct bpf_perf_event_value));
3662306a36Sopenharmony_ci} prev_readings SEC(".maps");
3762306a36Sopenharmony_ci
3862306a36Sopenharmony_ci// aggregated event values for each cgroup (per-cpu)
3962306a36Sopenharmony_ci// will be read from the user-space
4062306a36Sopenharmony_cistruct {
4162306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
4262306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
4362306a36Sopenharmony_ci	__uint(value_size, sizeof(struct bpf_perf_event_value));
4462306a36Sopenharmony_ci} cgrp_readings SEC(".maps");
4562306a36Sopenharmony_ci
/*
 * new kernel cgroup definition
 *
 * Minimal view of struct cgroup for kernels that keep an array of ancestor
 * cgroup pointers.  Only the fields read by this file are declared;
 * preserve_access_index makes accesses to them relocatable.
 */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

5262306a36Sopenharmony_ci/* old kernel cgroup definition */
5362306a36Sopenharmony_cistruct cgroup___old {
5462306a36Sopenharmony_ci	int level;
5562306a36Sopenharmony_ci	u64 ancestor_ids[];
5662306a36Sopenharmony_ci} __attribute__((preserve_access_index));
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ciconst volatile __u32 num_events = 1;
5962306a36Sopenharmony_ciconst volatile __u32 num_cpus = 1;
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ciint enabled = 0;
6262306a36Sopenharmony_ciint use_cgroup_v2 = 0;
6362306a36Sopenharmony_ciint perf_subsys_id = -1;
6462306a36Sopenharmony_ci
6562306a36Sopenharmony_cistatic inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
6662306a36Sopenharmony_ci{
6762306a36Sopenharmony_ci	/* recast pointer to capture new type for compiler */
6862306a36Sopenharmony_ci	struct cgroup___new *cgrp_new = (void *)cgrp;
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci	if (bpf_core_field_exists(cgrp_new->ancestors)) {
7162306a36Sopenharmony_ci		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
7262306a36Sopenharmony_ci	} else {
7362306a36Sopenharmony_ci		/* recast pointer to capture old type for compiler */
7462306a36Sopenharmony_ci		struct cgroup___old *cgrp_old = (void *)cgrp;
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
7762306a36Sopenharmony_ci	}
7862306a36Sopenharmony_ci}
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_cistatic inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
8162306a36Sopenharmony_ci{
8262306a36Sopenharmony_ci	struct task_struct *p = (void *)bpf_get_current_task();
8362306a36Sopenharmony_ci	struct cgroup *cgrp;
8462306a36Sopenharmony_ci	register int i = 0;
8562306a36Sopenharmony_ci	__u32 *elem;
8662306a36Sopenharmony_ci	int level;
8762306a36Sopenharmony_ci	int cnt;
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci	if (perf_subsys_id == -1) {
9062306a36Sopenharmony_ci#if __has_builtin(__builtin_preserve_enum_value)
9162306a36Sopenharmony_ci		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
9262306a36Sopenharmony_ci						     perf_event_cgrp_id);
9362306a36Sopenharmony_ci#else
9462306a36Sopenharmony_ci		perf_subsys_id = perf_event_cgrp_id;
9562306a36Sopenharmony_ci#endif
9662306a36Sopenharmony_ci	}
9762306a36Sopenharmony_ci	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
9862306a36Sopenharmony_ci	level = BPF_CORE_READ(cgrp, level);
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	for (cnt = 0; i < MAX_LEVELS; i++) {
10162306a36Sopenharmony_ci		__u64 cgrp_id;
10262306a36Sopenharmony_ci
10362306a36Sopenharmony_ci		if (i > level)
10462306a36Sopenharmony_ci			break;
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci		// convert cgroup-id to a map index
10762306a36Sopenharmony_ci		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
10862306a36Sopenharmony_ci		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
10962306a36Sopenharmony_ci		if (!elem)
11062306a36Sopenharmony_ci			continue;
11162306a36Sopenharmony_ci
11262306a36Sopenharmony_ci		cgrps[cnt++] = *elem;
11362306a36Sopenharmony_ci		if (cnt == size)
11462306a36Sopenharmony_ci			break;
11562306a36Sopenharmony_ci	}
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_ci	return cnt;
11862306a36Sopenharmony_ci}
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_cistatic inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
12162306a36Sopenharmony_ci{
12262306a36Sopenharmony_ci	register int i = 0;
12362306a36Sopenharmony_ci	__u32 *elem;
12462306a36Sopenharmony_ci	int cnt;
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	for (cnt = 0; i < MAX_LEVELS; i++) {
12762306a36Sopenharmony_ci		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
12862306a36Sopenharmony_ci
12962306a36Sopenharmony_ci		if (cgrp_id == 0)
13062306a36Sopenharmony_ci			break;
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci		// convert cgroup-id to a map index
13362306a36Sopenharmony_ci		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
13462306a36Sopenharmony_ci		if (!elem)
13562306a36Sopenharmony_ci			continue;
13662306a36Sopenharmony_ci
13762306a36Sopenharmony_ci		cgrps[cnt++] = *elem;
13862306a36Sopenharmony_ci		if (cnt == size)
13962306a36Sopenharmony_ci			break;
14062306a36Sopenharmony_ci	}
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci	return cnt;
14362306a36Sopenharmony_ci}
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_cistatic int bperf_cgroup_count(void)
14662306a36Sopenharmony_ci{
14762306a36Sopenharmony_ci	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
14862306a36Sopenharmony_ci	register int c = 0;
14962306a36Sopenharmony_ci	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
15062306a36Sopenharmony_ci	__u32 cpu = bpf_get_smp_processor_id();
15162306a36Sopenharmony_ci	__u32 cgrp_idx[MAX_LEVELS];
15262306a36Sopenharmony_ci	int cgrp_cnt;
15362306a36Sopenharmony_ci	__u32 key, cgrp;
15462306a36Sopenharmony_ci	long err;
15562306a36Sopenharmony_ci
15662306a36Sopenharmony_ci	if (use_cgroup_v2)
15762306a36Sopenharmony_ci		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
15862306a36Sopenharmony_ci	else
15962306a36Sopenharmony_ci		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	for ( ; idx < MAX_EVENTS; idx++) {
16262306a36Sopenharmony_ci		if (idx == num_events)
16362306a36Sopenharmony_ci			break;
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ci		// XXX: do not pass idx directly (for verifier)
16662306a36Sopenharmony_ci		key = idx;
16762306a36Sopenharmony_ci		// this is per-cpu array for diff
16862306a36Sopenharmony_ci		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
16962306a36Sopenharmony_ci		if (!prev_val) {
17062306a36Sopenharmony_ci			val.counter = val.enabled = val.running = 0;
17162306a36Sopenharmony_ci			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
17462306a36Sopenharmony_ci			if (!prev_val)
17562306a36Sopenharmony_ci				continue;
17662306a36Sopenharmony_ci		}
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci		// read from global perf_event array
17962306a36Sopenharmony_ci		key = idx * num_cpus + cpu;
18062306a36Sopenharmony_ci		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
18162306a36Sopenharmony_ci		if (err)
18262306a36Sopenharmony_ci			continue;
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci		if (enabled) {
18562306a36Sopenharmony_ci			delta.counter = val.counter - prev_val->counter;
18662306a36Sopenharmony_ci			delta.enabled = val.enabled - prev_val->enabled;
18762306a36Sopenharmony_ci			delta.running = val.running - prev_val->running;
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci			for (c = 0; c < MAX_LEVELS; c++) {
19062306a36Sopenharmony_ci				if (c == cgrp_cnt)
19162306a36Sopenharmony_ci					break;
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci				cgrp = cgrp_idx[c];
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci				// aggregate the result by cgroup
19662306a36Sopenharmony_ci				key = cgrp * num_events + idx;
19762306a36Sopenharmony_ci				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
19862306a36Sopenharmony_ci				if (cgrp_val) {
19962306a36Sopenharmony_ci					cgrp_val->counter += delta.counter;
20062306a36Sopenharmony_ci					cgrp_val->enabled += delta.enabled;
20162306a36Sopenharmony_ci					cgrp_val->running += delta.running;
20262306a36Sopenharmony_ci				} else {
20362306a36Sopenharmony_ci					bpf_map_update_elem(&cgrp_readings, &key,
20462306a36Sopenharmony_ci							    &delta, BPF_ANY);
20562306a36Sopenharmony_ci				}
20662306a36Sopenharmony_ci			}
20762306a36Sopenharmony_ci		}
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci		*prev_val = val;
21062306a36Sopenharmony_ci	}
21162306a36Sopenharmony_ci	return 0;
21262306a36Sopenharmony_ci}
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci// This will be attached to cgroup-switches event for each cpu
21562306a36Sopenharmony_ciSEC("perf_event")
21662306a36Sopenharmony_ciint BPF_PROG(on_cgrp_switch)
21762306a36Sopenharmony_ci{
21862306a36Sopenharmony_ci	return bperf_cgroup_count();
21962306a36Sopenharmony_ci}
22062306a36Sopenharmony_ci
22162306a36Sopenharmony_ciSEC("raw_tp/sched_switch")
22262306a36Sopenharmony_ciint BPF_PROG(trigger_read)
22362306a36Sopenharmony_ci{
22462306a36Sopenharmony_ci	return bperf_cgroup_count();
22562306a36Sopenharmony_ci}
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_cichar LICENSE[] SEC("license") = "Dual BSD/GPL";
228