162306a36Sopenharmony_ci// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 262306a36Sopenharmony_ci// Copyright (c) 2021 Facebook 362306a36Sopenharmony_ci// Copyright (c) 2021 Google 462306a36Sopenharmony_ci#include "vmlinux.h" 562306a36Sopenharmony_ci#include <bpf/bpf_helpers.h> 662306a36Sopenharmony_ci#include <bpf/bpf_tracing.h> 762306a36Sopenharmony_ci#include <bpf/bpf_core_read.h> 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary 1062306a36Sopenharmony_ci#define MAX_EVENTS 32 // max events per cgroup: arbitrary 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci// NOTE: many of map and global data will be modified before loading 1362306a36Sopenharmony_ci// from the userspace (perf tool) using the skeleton helpers. 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci// single set of global perf events to measure 1662306a36Sopenharmony_cistruct { 1762306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 1862306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 1962306a36Sopenharmony_ci __uint(value_size, sizeof(int)); 2062306a36Sopenharmony_ci __uint(max_entries, 1); 2162306a36Sopenharmony_ci} events SEC(".maps"); 2262306a36Sopenharmony_ci 2362306a36Sopenharmony_ci// from cgroup id to event index 2462306a36Sopenharmony_cistruct { 2562306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_HASH); 2662306a36Sopenharmony_ci __uint(key_size, sizeof(__u64)); 2762306a36Sopenharmony_ci __uint(value_size, sizeof(__u32)); 2862306a36Sopenharmony_ci __uint(max_entries, 1); 2962306a36Sopenharmony_ci} cgrp_idx SEC(".maps"); 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci// per-cpu event snapshots to calculate delta 3262306a36Sopenharmony_cistruct { 3362306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 3462306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 3562306a36Sopenharmony_ci __uint(value_size, sizeof(struct bpf_perf_event_value)); 3662306a36Sopenharmony_ci} prev_readings SEC(".maps"); 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci// aggregated event values for each cgroup (per-cpu) 3962306a36Sopenharmony_ci// will be read from the user-space 4062306a36Sopenharmony_cistruct { 4162306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 4262306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 4362306a36Sopenharmony_ci __uint(value_size, sizeof(struct bpf_perf_event_value)); 4462306a36Sopenharmony_ci} cgrp_readings SEC(".maps"); 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci/* new kernel cgroup definition */ 4762306a36Sopenharmony_cistruct cgroup___new { 4862306a36Sopenharmony_ci int level; 4962306a36Sopenharmony_ci struct cgroup *ancestors[]; 5062306a36Sopenharmony_ci} __attribute__((preserve_access_index)); 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci/* old kernel cgroup definition */ 5362306a36Sopenharmony_cistruct cgroup___old { 5462306a36Sopenharmony_ci int level; 5562306a36Sopenharmony_ci u64 ancestor_ids[]; 5662306a36Sopenharmony_ci} __attribute__((preserve_access_index)); 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ciconst volatile __u32 num_events = 1; 5962306a36Sopenharmony_ciconst volatile __u32 num_cpus = 1; 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ciint enabled = 0; 6262306a36Sopenharmony_ciint use_cgroup_v2 = 0; 6362306a36Sopenharmony_ciint perf_subsys_id = -1; 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_cistatic inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) 6662306a36Sopenharmony_ci{ 6762306a36Sopenharmony_ci /* recast pointer to capture new type for compiler */ 6862306a36Sopenharmony_ci struct cgroup___new *cgrp_new = (void *)cgrp; 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_ci if (bpf_core_field_exists(cgrp_new->ancestors)) { 7162306a36Sopenharmony_ci return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); 7262306a36Sopenharmony_ci } else { 7362306a36Sopenharmony_ci /* recast pointer to capture old type for compiler */ 7462306a36Sopenharmony_ci struct cgroup___old *cgrp_old = (void *)cgrp; 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_ci return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); 7762306a36Sopenharmony_ci } 7862306a36Sopenharmony_ci} 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_cistatic inline int get_cgroup_v1_idx(__u32 *cgrps, int size) 8162306a36Sopenharmony_ci{ 8262306a36Sopenharmony_ci struct task_struct *p = (void *)bpf_get_current_task(); 8362306a36Sopenharmony_ci struct cgroup *cgrp; 8462306a36Sopenharmony_ci register int i = 0; 8562306a36Sopenharmony_ci __u32 *elem; 8662306a36Sopenharmony_ci int level; 8762306a36Sopenharmony_ci int cnt; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci if (perf_subsys_id == -1) { 9062306a36Sopenharmony_ci#if __has_builtin(__builtin_preserve_enum_value) 9162306a36Sopenharmony_ci perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 9262306a36Sopenharmony_ci perf_event_cgrp_id); 9362306a36Sopenharmony_ci#else 9462306a36Sopenharmony_ci perf_subsys_id = perf_event_cgrp_id; 9562306a36Sopenharmony_ci#endif 9662306a36Sopenharmony_ci } 9762306a36Sopenharmony_ci cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup); 9862306a36Sopenharmony_ci level = BPF_CORE_READ(cgrp, level); 9962306a36Sopenharmony_ci 10062306a36Sopenharmony_ci for (cnt = 0; i < MAX_LEVELS; i++) { 10162306a36Sopenharmony_ci __u64 cgrp_id; 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_ci if (i > level) 10462306a36Sopenharmony_ci break; 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci // convert cgroup-id to a map index 10762306a36Sopenharmony_ci cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); 10862306a36Sopenharmony_ci elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); 10962306a36Sopenharmony_ci if (!elem) 11062306a36Sopenharmony_ci continue; 11162306a36Sopenharmony_ci 11262306a36Sopenharmony_ci cgrps[cnt++] = *elem; 11362306a36Sopenharmony_ci if (cnt == size) 11462306a36Sopenharmony_ci break; 11562306a36Sopenharmony_ci } 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_ci return cnt; 11862306a36Sopenharmony_ci} 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_cistatic inline int get_cgroup_v2_idx(__u32 *cgrps, int size) 12162306a36Sopenharmony_ci{ 12262306a36Sopenharmony_ci register int i = 0; 12362306a36Sopenharmony_ci __u32 *elem; 12462306a36Sopenharmony_ci int cnt; 12562306a36Sopenharmony_ci 12662306a36Sopenharmony_ci for (cnt = 0; i < MAX_LEVELS; i++) { 12762306a36Sopenharmony_ci __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_ci if (cgrp_id == 0) 13062306a36Sopenharmony_ci break; 13162306a36Sopenharmony_ci 13262306a36Sopenharmony_ci // convert cgroup-id to a map index 13362306a36Sopenharmony_ci elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); 13462306a36Sopenharmony_ci if (!elem) 13562306a36Sopenharmony_ci continue; 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_ci cgrps[cnt++] = *elem; 13862306a36Sopenharmony_ci if (cnt == size) 13962306a36Sopenharmony_ci break; 14062306a36Sopenharmony_ci } 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci return cnt; 14362306a36Sopenharmony_ci} 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_cistatic int bperf_cgroup_count(void) 14662306a36Sopenharmony_ci{ 14762306a36Sopenharmony_ci register __u32 idx = 0; // to have it in a register to pass BPF verifier 14862306a36Sopenharmony_ci register int c = 0; 14962306a36Sopenharmony_ci struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; 15062306a36Sopenharmony_ci __u32 cpu = bpf_get_smp_processor_id(); 15162306a36Sopenharmony_ci __u32 cgrp_idx[MAX_LEVELS]; 15262306a36Sopenharmony_ci int cgrp_cnt; 15362306a36Sopenharmony_ci __u32 key, cgrp; 15462306a36Sopenharmony_ci long err; 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci if (use_cgroup_v2) 15762306a36Sopenharmony_ci cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS); 15862306a36Sopenharmony_ci else 15962306a36Sopenharmony_ci cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci for ( ; idx < MAX_EVENTS; idx++) { 16262306a36Sopenharmony_ci if (idx == num_events) 16362306a36Sopenharmony_ci break; 16462306a36Sopenharmony_ci 16562306a36Sopenharmony_ci // XXX: do not pass idx directly (for verifier) 16662306a36Sopenharmony_ci key = idx; 16762306a36Sopenharmony_ci // this is per-cpu array for diff 16862306a36Sopenharmony_ci prev_val = bpf_map_lookup_elem(&prev_readings, &key); 16962306a36Sopenharmony_ci if (!prev_val) { 17062306a36Sopenharmony_ci val.counter = val.enabled = val.running = 0; 17162306a36Sopenharmony_ci bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci prev_val = bpf_map_lookup_elem(&prev_readings, &key); 17462306a36Sopenharmony_ci if (!prev_val) 17562306a36Sopenharmony_ci continue; 17662306a36Sopenharmony_ci } 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci // read from global perf_event array 17962306a36Sopenharmony_ci key = idx * num_cpus + cpu; 18062306a36Sopenharmony_ci err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); 18162306a36Sopenharmony_ci if (err) 18262306a36Sopenharmony_ci continue; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci if (enabled) { 18562306a36Sopenharmony_ci delta.counter = val.counter - prev_val->counter; 18662306a36Sopenharmony_ci delta.enabled = val.enabled - prev_val->enabled; 18762306a36Sopenharmony_ci delta.running = val.running - prev_val->running; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci for (c = 0; c < MAX_LEVELS; c++) { 19062306a36Sopenharmony_ci if (c == cgrp_cnt) 19162306a36Sopenharmony_ci break; 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci cgrp = cgrp_idx[c]; 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci // aggregate the result by cgroup 19662306a36Sopenharmony_ci key = cgrp * num_events + idx; 19762306a36Sopenharmony_ci cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); 19862306a36Sopenharmony_ci if (cgrp_val) { 19962306a36Sopenharmony_ci cgrp_val->counter += delta.counter; 20062306a36Sopenharmony_ci cgrp_val->enabled += delta.enabled; 20162306a36Sopenharmony_ci cgrp_val->running += delta.running; 20262306a36Sopenharmony_ci } else { 20362306a36Sopenharmony_ci bpf_map_update_elem(&cgrp_readings, &key, 20462306a36Sopenharmony_ci &delta, BPF_ANY); 20562306a36Sopenharmony_ci } 20662306a36Sopenharmony_ci } 20762306a36Sopenharmony_ci } 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci *prev_val = val; 21062306a36Sopenharmony_ci } 21162306a36Sopenharmony_ci return 0; 21262306a36Sopenharmony_ci} 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci// This will be attached to cgroup-switches event for each cpu 21562306a36Sopenharmony_ciSEC("perf_event") 21662306a36Sopenharmony_ciint BPF_PROG(on_cgrp_switch) 21762306a36Sopenharmony_ci{ 21862306a36Sopenharmony_ci return bperf_cgroup_count(); 21962306a36Sopenharmony_ci} 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_ciSEC("raw_tp/sched_switch") 22262306a36Sopenharmony_ciint BPF_PROG(trigger_read) 22362306a36Sopenharmony_ci{ 22462306a36Sopenharmony_ci return bperf_cgroup_count(); 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_cichar LICENSE[] SEC("license") = "Dual BSD/GPL"; 228