162306a36Sopenharmony_ci// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 262306a36Sopenharmony_ci// Copyright (c) 2022 Google 362306a36Sopenharmony_ci#include "vmlinux.h" 462306a36Sopenharmony_ci#include <bpf/bpf_helpers.h> 562306a36Sopenharmony_ci#include <bpf/bpf_tracing.h> 662306a36Sopenharmony_ci#include <bpf/bpf_core_read.h> 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci/* task->flags for off-cpu analysis */ 962306a36Sopenharmony_ci#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1062306a36Sopenharmony_ci 1162306a36Sopenharmony_ci/* task->state for off-cpu analysis */ 1262306a36Sopenharmony_ci#define TASK_INTERRUPTIBLE 0x0001 1362306a36Sopenharmony_ci#define TASK_UNINTERRUPTIBLE 0x0002 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci/* create a new thread */ 1662306a36Sopenharmony_ci#define CLONE_THREAD 0x10000 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ci#define MAX_STACKS 32 1962306a36Sopenharmony_ci#define MAX_ENTRIES 102400 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_cistruct tstamp_data { 2262306a36Sopenharmony_ci __u32 stack_id; 2362306a36Sopenharmony_ci __u32 state; 2462306a36Sopenharmony_ci __u64 timestamp; 2562306a36Sopenharmony_ci}; 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_cistruct offcpu_key { 2862306a36Sopenharmony_ci __u32 pid; 2962306a36Sopenharmony_ci __u32 tgid; 3062306a36Sopenharmony_ci __u32 stack_id; 3162306a36Sopenharmony_ci __u32 state; 3262306a36Sopenharmony_ci __u64 cgroup_id; 3362306a36Sopenharmony_ci}; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_cistruct { 3662306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_STACK_TRACE); 3762306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 3862306a36Sopenharmony_ci __uint(value_size, MAX_STACKS * sizeof(__u64)); 3962306a36Sopenharmony_ci __uint(max_entries, MAX_ENTRIES); 4062306a36Sopenharmony_ci} stacks SEC(".maps"); 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_cistruct { 4362306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 4462306a36Sopenharmony_ci __uint(map_flags, BPF_F_NO_PREALLOC); 4562306a36Sopenharmony_ci __type(key, int); 4662306a36Sopenharmony_ci __type(value, struct tstamp_data); 4762306a36Sopenharmony_ci} tstamp SEC(".maps"); 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistruct { 5062306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_HASH); 5162306a36Sopenharmony_ci __uint(key_size, sizeof(struct offcpu_key)); 5262306a36Sopenharmony_ci __uint(value_size, sizeof(__u64)); 5362306a36Sopenharmony_ci __uint(max_entries, MAX_ENTRIES); 5462306a36Sopenharmony_ci} off_cpu SEC(".maps"); 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_cistruct { 5762306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_HASH); 5862306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 5962306a36Sopenharmony_ci __uint(value_size, sizeof(__u8)); 6062306a36Sopenharmony_ci __uint(max_entries, 1); 6162306a36Sopenharmony_ci} cpu_filter SEC(".maps"); 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_cistruct { 6462306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_HASH); 6562306a36Sopenharmony_ci __uint(key_size, sizeof(__u32)); 6662306a36Sopenharmony_ci __uint(value_size, sizeof(__u8)); 6762306a36Sopenharmony_ci __uint(max_entries, 1); 6862306a36Sopenharmony_ci} task_filter SEC(".maps"); 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_cistruct { 7162306a36Sopenharmony_ci __uint(type, BPF_MAP_TYPE_HASH); 7262306a36Sopenharmony_ci __uint(key_size, sizeof(__u64)); 7362306a36Sopenharmony_ci __uint(value_size, sizeof(__u8)); 7462306a36Sopenharmony_ci __uint(max_entries, 1); 7562306a36Sopenharmony_ci} cgroup_filter SEC(".maps"); 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci/* new kernel task_struct definition */ 7862306a36Sopenharmony_cistruct task_struct___new { 7962306a36Sopenharmony_ci long __state; 8062306a36Sopenharmony_ci} __attribute__((preserve_access_index)); 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci/* old kernel task_struct definition */ 8362306a36Sopenharmony_cistruct task_struct___old { 8462306a36Sopenharmony_ci long state; 8562306a36Sopenharmony_ci} __attribute__((preserve_access_index)); 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ciint enabled = 0; 8862306a36Sopenharmony_ciint has_cpu = 0; 8962306a36Sopenharmony_ciint has_task = 0; 9062306a36Sopenharmony_ciint has_cgroup = 0; 9162306a36Sopenharmony_ciint uses_tgid = 0; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ciconst volatile bool has_prev_state = false; 9462306a36Sopenharmony_ciconst volatile bool needs_cgroup = false; 9562306a36Sopenharmony_ciconst volatile bool uses_cgroup_v1 = false; 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ciint perf_subsys_id = -1; 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci/* 10062306a36Sopenharmony_ci * Old kernel used to call it task_struct->state and now it's '__state'. 10162306a36Sopenharmony_ci * Use BPF CO-RE "ignored suffix rule" to deal with it like below: 10262306a36Sopenharmony_ci * 10362306a36Sopenharmony_ci * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_cistatic inline int get_task_state(struct task_struct *t) 10662306a36Sopenharmony_ci{ 10762306a36Sopenharmony_ci /* recast pointer to capture new type for compiler */ 10862306a36Sopenharmony_ci struct task_struct___new *t_new = (void *)t; 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci if (bpf_core_field_exists(t_new->__state)) { 11162306a36Sopenharmony_ci return BPF_CORE_READ(t_new, __state); 11262306a36Sopenharmony_ci } else { 11362306a36Sopenharmony_ci /* recast pointer to capture old type for compiler */ 11462306a36Sopenharmony_ci struct task_struct___old *t_old = (void *)t; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci return BPF_CORE_READ(t_old, state); 11762306a36Sopenharmony_ci } 11862306a36Sopenharmony_ci} 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_cistatic inline __u64 get_cgroup_id(struct task_struct *t) 12162306a36Sopenharmony_ci{ 12262306a36Sopenharmony_ci struct cgroup *cgrp; 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_ci if (!uses_cgroup_v1) 12562306a36Sopenharmony_ci return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); 12662306a36Sopenharmony_ci 12762306a36Sopenharmony_ci if (perf_subsys_id == -1) { 12862306a36Sopenharmony_ci#if __has_builtin(__builtin_preserve_enum_value) 12962306a36Sopenharmony_ci perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 13062306a36Sopenharmony_ci perf_event_cgrp_id); 13162306a36Sopenharmony_ci#else 13262306a36Sopenharmony_ci perf_subsys_id = perf_event_cgrp_id; 13362306a36Sopenharmony_ci#endif 13462306a36Sopenharmony_ci } 13562306a36Sopenharmony_ci 13662306a36Sopenharmony_ci cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); 13762306a36Sopenharmony_ci return BPF_CORE_READ(cgrp, kn, id); 13862306a36Sopenharmony_ci} 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_cistatic inline int can_record(struct task_struct *t, int state) 14162306a36Sopenharmony_ci{ 14262306a36Sopenharmony_ci /* kernel threads don't have user stack */ 14362306a36Sopenharmony_ci if (t->flags & PF_KTHREAD) 14462306a36Sopenharmony_ci return 0; 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci if (state != TASK_INTERRUPTIBLE && 14762306a36Sopenharmony_ci state != TASK_UNINTERRUPTIBLE) 14862306a36Sopenharmony_ci return 0; 14962306a36Sopenharmony_ci 15062306a36Sopenharmony_ci if (has_cpu) { 15162306a36Sopenharmony_ci __u32 cpu = bpf_get_smp_processor_id(); 15262306a36Sopenharmony_ci __u8 *ok; 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci ok = bpf_map_lookup_elem(&cpu_filter, &cpu); 15562306a36Sopenharmony_ci if (!ok) 15662306a36Sopenharmony_ci return 0; 15762306a36Sopenharmony_ci } 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci if (has_task) { 16062306a36Sopenharmony_ci __u8 *ok; 16162306a36Sopenharmony_ci __u32 pid; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci if (uses_tgid) 16462306a36Sopenharmony_ci pid = t->tgid; 16562306a36Sopenharmony_ci else 16662306a36Sopenharmony_ci pid = t->pid; 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_ci ok = bpf_map_lookup_elem(&task_filter, &pid); 16962306a36Sopenharmony_ci if (!ok) 17062306a36Sopenharmony_ci return 0; 17162306a36Sopenharmony_ci } 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci if (has_cgroup) { 17462306a36Sopenharmony_ci __u8 *ok; 17562306a36Sopenharmony_ci __u64 cgrp_id = get_cgroup_id(t); 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ci ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); 17862306a36Sopenharmony_ci if (!ok) 17962306a36Sopenharmony_ci return 0; 18062306a36Sopenharmony_ci } 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci return 1; 18362306a36Sopenharmony_ci} 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_cistatic int off_cpu_stat(u64 *ctx, struct task_struct *prev, 18662306a36Sopenharmony_ci struct task_struct *next, int state) 18762306a36Sopenharmony_ci{ 18862306a36Sopenharmony_ci __u64 ts; 18962306a36Sopenharmony_ci __u32 stack_id; 19062306a36Sopenharmony_ci struct tstamp_data *pelem; 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci ts = bpf_ktime_get_ns(); 19362306a36Sopenharmony_ci 19462306a36Sopenharmony_ci if (!can_record(prev, state)) 19562306a36Sopenharmony_ci goto next; 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci stack_id = bpf_get_stackid(ctx, &stacks, 19862306a36Sopenharmony_ci BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci pelem = bpf_task_storage_get(&tstamp, prev, NULL, 20162306a36Sopenharmony_ci BPF_LOCAL_STORAGE_GET_F_CREATE); 20262306a36Sopenharmony_ci if (!pelem) 20362306a36Sopenharmony_ci goto next; 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci pelem->timestamp = ts; 20662306a36Sopenharmony_ci pelem->state = state; 20762306a36Sopenharmony_ci pelem->stack_id = stack_id; 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_cinext: 21062306a36Sopenharmony_ci pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci if (pelem && pelem->timestamp) { 21362306a36Sopenharmony_ci struct offcpu_key key = { 21462306a36Sopenharmony_ci .pid = next->pid, 21562306a36Sopenharmony_ci .tgid = next->tgid, 21662306a36Sopenharmony_ci .stack_id = pelem->stack_id, 21762306a36Sopenharmony_ci .state = pelem->state, 21862306a36Sopenharmony_ci .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, 21962306a36Sopenharmony_ci }; 22062306a36Sopenharmony_ci __u64 delta = ts - pelem->timestamp; 22162306a36Sopenharmony_ci __u64 *total; 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci total = bpf_map_lookup_elem(&off_cpu, &key); 22462306a36Sopenharmony_ci if (total) 22562306a36Sopenharmony_ci *total += delta; 22662306a36Sopenharmony_ci else 22762306a36Sopenharmony_ci bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci /* prevent to reuse the timestamp later */ 23062306a36Sopenharmony_ci pelem->timestamp = 0; 23162306a36Sopenharmony_ci } 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_ci return 0; 23462306a36Sopenharmony_ci} 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ciSEC("tp_btf/task_newtask") 23762306a36Sopenharmony_ciint on_newtask(u64 *ctx) 23862306a36Sopenharmony_ci{ 23962306a36Sopenharmony_ci struct task_struct *task; 24062306a36Sopenharmony_ci u64 clone_flags; 24162306a36Sopenharmony_ci u32 pid; 24262306a36Sopenharmony_ci u8 val = 1; 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci if (!uses_tgid) 24562306a36Sopenharmony_ci return 0; 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci task = (struct task_struct *)bpf_get_current_task(); 24862306a36Sopenharmony_ci 24962306a36Sopenharmony_ci pid = BPF_CORE_READ(task, tgid); 25062306a36Sopenharmony_ci if (!bpf_map_lookup_elem(&task_filter, &pid)) 25162306a36Sopenharmony_ci return 0; 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci task = (struct task_struct *)ctx[0]; 25462306a36Sopenharmony_ci clone_flags = ctx[1]; 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci pid = task->tgid; 25762306a36Sopenharmony_ci if (!(clone_flags & CLONE_THREAD)) 25862306a36Sopenharmony_ci bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci return 0; 26162306a36Sopenharmony_ci} 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ciSEC("tp_btf/sched_switch") 26462306a36Sopenharmony_ciint on_switch(u64 *ctx) 26562306a36Sopenharmony_ci{ 26662306a36Sopenharmony_ci struct task_struct *prev, *next; 26762306a36Sopenharmony_ci int prev_state; 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci if (!enabled) 27062306a36Sopenharmony_ci return 0; 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci prev = (struct task_struct *)ctx[1]; 27362306a36Sopenharmony_ci next = (struct task_struct *)ctx[2]; 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci if (has_prev_state) 27662306a36Sopenharmony_ci prev_state = (int)ctx[3]; 27762306a36Sopenharmony_ci else 27862306a36Sopenharmony_ci prev_state = get_task_state(prev); 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci return off_cpu_stat(ctx, prev, next, prev_state & 0xff); 28162306a36Sopenharmony_ci} 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_cichar LICENSE[] SEC("license") = "Dual BSD/GPL"; 284