// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
362306a36Sopenharmony_ci#include "vmlinux.h"
462306a36Sopenharmony_ci#include <bpf/bpf_helpers.h>
562306a36Sopenharmony_ci#include <bpf/bpf_tracing.h>
662306a36Sopenharmony_ci#include <bpf/bpf_core_read.h>
762306a36Sopenharmony_ci
/* task->flags bit tested by can_record(): I am a kernel thread */
#define PF_KTHREAD		0x00200000

/* task->state values accepted by can_record() for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* clone flag tested by on_newtask(): create a new thread */
#define CLONE_THREAD		0x10000

/* number of 64-bit slots in one value of the 'stacks' map */
#define MAX_STACKS		32
/* max_entries of the 'stacks' and 'off_cpu' maps */
#define MAX_ENTRIES		102400
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_cistruct tstamp_data {
2262306a36Sopenharmony_ci	__u32 stack_id;
2362306a36Sopenharmony_ci	__u32 state;
2462306a36Sopenharmony_ci	__u64 timestamp;
2562306a36Sopenharmony_ci};
2662306a36Sopenharmony_ci
2762306a36Sopenharmony_cistruct offcpu_key {
2862306a36Sopenharmony_ci	__u32 pid;
2962306a36Sopenharmony_ci	__u32 tgid;
3062306a36Sopenharmony_ci	__u32 stack_id;
3162306a36Sopenharmony_ci	__u32 state;
3262306a36Sopenharmony_ci	__u64 cgroup_id;
3362306a36Sopenharmony_ci};
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_cistruct {
3662306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
3762306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
3862306a36Sopenharmony_ci	__uint(value_size, MAX_STACKS * sizeof(__u64));
3962306a36Sopenharmony_ci	__uint(max_entries, MAX_ENTRIES);
4062306a36Sopenharmony_ci} stacks SEC(".maps");
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_cistruct {
4362306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
4462306a36Sopenharmony_ci	__uint(map_flags, BPF_F_NO_PREALLOC);
4562306a36Sopenharmony_ci	__type(key, int);
4662306a36Sopenharmony_ci	__type(value, struct tstamp_data);
4762306a36Sopenharmony_ci} tstamp SEC(".maps");
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_cistruct {
5062306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_HASH);
5162306a36Sopenharmony_ci	__uint(key_size, sizeof(struct offcpu_key));
5262306a36Sopenharmony_ci	__uint(value_size, sizeof(__u64));
5362306a36Sopenharmony_ci	__uint(max_entries, MAX_ENTRIES);
5462306a36Sopenharmony_ci} off_cpu SEC(".maps");
5562306a36Sopenharmony_ci
5662306a36Sopenharmony_cistruct {
5762306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_HASH);
5862306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
5962306a36Sopenharmony_ci	__uint(value_size, sizeof(__u8));
6062306a36Sopenharmony_ci	__uint(max_entries, 1);
6162306a36Sopenharmony_ci} cpu_filter SEC(".maps");
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_cistruct {
6462306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_HASH);
6562306a36Sopenharmony_ci	__uint(key_size, sizeof(__u32));
6662306a36Sopenharmony_ci	__uint(value_size, sizeof(__u8));
6762306a36Sopenharmony_ci	__uint(max_entries, 1);
6862306a36Sopenharmony_ci} task_filter SEC(".maps");
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_cistruct {
7162306a36Sopenharmony_ci	__uint(type, BPF_MAP_TYPE_HASH);
7262306a36Sopenharmony_ci	__uint(key_size, sizeof(__u64));
7362306a36Sopenharmony_ci	__uint(value_size, sizeof(__u8));
7462306a36Sopenharmony_ci	__uint(max_entries, 1);
7562306a36Sopenharmony_ci} cgroup_filter SEC(".maps");
7662306a36Sopenharmony_ci
/*
 * New kernel task_struct flavor: the state field is called '__state'.
 * Used by get_task_state() through a pointer cast.
 *
 * NOTE(review): the field is declared as long here; confirm this matches
 * the width of the field in the kernels that have '__state'.
 */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));
8162306a36Sopenharmony_ci
/*
 * Old kernel task_struct flavor: the state field is called 'state'.
 * Used by get_task_state() when '__state' does not exist.
 */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));
8662306a36Sopenharmony_ci
/* Mutable switches read by the programs below. */
int enabled = 0;	/* on_switch() returns immediately while this is 0 */
int has_cpu = 0;	/* can_record(): consult cpu_filter */
int has_task = 0;	/* can_record(): consult task_filter */
int has_cgroup = 0;	/* can_record(): consult cgroup_filter */
int uses_tgid = 0;	/* task_filter is keyed by tgid instead of pid */

/* Read-only configuration. */
const volatile bool has_prev_state = false;	/* on_switch(): take the state from ctx[3] */
const volatile bool needs_cgroup = false;	/* off_cpu_stat(): fill offcpu_key.cgroup_id */
const volatile bool uses_cgroup_v1 = false;	/* get_cgroup_id(): go through cgroups->subsys[] */

/* subsys[] index cached by get_cgroup_id(); -1 means not resolved yet */
int perf_subsys_id = -1;
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci/*
10062306a36Sopenharmony_ci * Old kernel used to call it task_struct->state and now it's '__state'.
10162306a36Sopenharmony_ci * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
10262306a36Sopenharmony_ci *
10362306a36Sopenharmony_ci * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
10462306a36Sopenharmony_ci */
10562306a36Sopenharmony_cistatic inline int get_task_state(struct task_struct *t)
10662306a36Sopenharmony_ci{
10762306a36Sopenharmony_ci	/* recast pointer to capture new type for compiler */
10862306a36Sopenharmony_ci	struct task_struct___new *t_new = (void *)t;
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	if (bpf_core_field_exists(t_new->__state)) {
11162306a36Sopenharmony_ci		return BPF_CORE_READ(t_new, __state);
11262306a36Sopenharmony_ci	} else {
11362306a36Sopenharmony_ci		/* recast pointer to capture old type for compiler */
11462306a36Sopenharmony_ci		struct task_struct___old *t_old = (void *)t;
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci		return BPF_CORE_READ(t_old, state);
11762306a36Sopenharmony_ci	}
11862306a36Sopenharmony_ci}
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_cistatic inline __u64 get_cgroup_id(struct task_struct *t)
12162306a36Sopenharmony_ci{
12262306a36Sopenharmony_ci	struct cgroup *cgrp;
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_ci	if (!uses_cgroup_v1)
12562306a36Sopenharmony_ci		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	if (perf_subsys_id == -1) {
12862306a36Sopenharmony_ci#if __has_builtin(__builtin_preserve_enum_value)
12962306a36Sopenharmony_ci		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
13062306a36Sopenharmony_ci						     perf_event_cgrp_id);
13162306a36Sopenharmony_ci#else
13262306a36Sopenharmony_ci		perf_subsys_id = perf_event_cgrp_id;
13362306a36Sopenharmony_ci#endif
13462306a36Sopenharmony_ci	}
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
13762306a36Sopenharmony_ci	return BPF_CORE_READ(cgrp, kn, id);
13862306a36Sopenharmony_ci}
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_cistatic inline int can_record(struct task_struct *t, int state)
14162306a36Sopenharmony_ci{
14262306a36Sopenharmony_ci	/* kernel threads don't have user stack */
14362306a36Sopenharmony_ci	if (t->flags & PF_KTHREAD)
14462306a36Sopenharmony_ci		return 0;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	if (state != TASK_INTERRUPTIBLE &&
14762306a36Sopenharmony_ci	    state != TASK_UNINTERRUPTIBLE)
14862306a36Sopenharmony_ci		return 0;
14962306a36Sopenharmony_ci
15062306a36Sopenharmony_ci	if (has_cpu) {
15162306a36Sopenharmony_ci		__u32 cpu = bpf_get_smp_processor_id();
15262306a36Sopenharmony_ci		__u8 *ok;
15362306a36Sopenharmony_ci
15462306a36Sopenharmony_ci		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
15562306a36Sopenharmony_ci		if (!ok)
15662306a36Sopenharmony_ci			return 0;
15762306a36Sopenharmony_ci	}
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	if (has_task) {
16062306a36Sopenharmony_ci		__u8 *ok;
16162306a36Sopenharmony_ci		__u32 pid;
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci		if (uses_tgid)
16462306a36Sopenharmony_ci			pid = t->tgid;
16562306a36Sopenharmony_ci		else
16662306a36Sopenharmony_ci			pid = t->pid;
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci		ok = bpf_map_lookup_elem(&task_filter, &pid);
16962306a36Sopenharmony_ci		if (!ok)
17062306a36Sopenharmony_ci			return 0;
17162306a36Sopenharmony_ci	}
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	if (has_cgroup) {
17462306a36Sopenharmony_ci		__u8 *ok;
17562306a36Sopenharmony_ci		__u64 cgrp_id = get_cgroup_id(t);
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ci		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
17862306a36Sopenharmony_ci		if (!ok)
17962306a36Sopenharmony_ci			return 0;
18062306a36Sopenharmony_ci	}
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	return 1;
18362306a36Sopenharmony_ci}
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_cistatic int off_cpu_stat(u64 *ctx, struct task_struct *prev,
18662306a36Sopenharmony_ci			struct task_struct *next, int state)
18762306a36Sopenharmony_ci{
18862306a36Sopenharmony_ci	__u64 ts;
18962306a36Sopenharmony_ci	__u32 stack_id;
19062306a36Sopenharmony_ci	struct tstamp_data *pelem;
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci	ts = bpf_ktime_get_ns();
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci	if (!can_record(prev, state))
19562306a36Sopenharmony_ci		goto next;
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ci	stack_id = bpf_get_stackid(ctx, &stacks,
19862306a36Sopenharmony_ci				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
20162306a36Sopenharmony_ci				     BPF_LOCAL_STORAGE_GET_F_CREATE);
20262306a36Sopenharmony_ci	if (!pelem)
20362306a36Sopenharmony_ci		goto next;
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	pelem->timestamp = ts;
20662306a36Sopenharmony_ci	pelem->state = state;
20762306a36Sopenharmony_ci	pelem->stack_id = stack_id;
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_cinext:
21062306a36Sopenharmony_ci	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	if (pelem && pelem->timestamp) {
21362306a36Sopenharmony_ci		struct offcpu_key key = {
21462306a36Sopenharmony_ci			.pid = next->pid,
21562306a36Sopenharmony_ci			.tgid = next->tgid,
21662306a36Sopenharmony_ci			.stack_id = pelem->stack_id,
21762306a36Sopenharmony_ci			.state = pelem->state,
21862306a36Sopenharmony_ci			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
21962306a36Sopenharmony_ci		};
22062306a36Sopenharmony_ci		__u64 delta = ts - pelem->timestamp;
22162306a36Sopenharmony_ci		__u64 *total;
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci		total = bpf_map_lookup_elem(&off_cpu, &key);
22462306a36Sopenharmony_ci		if (total)
22562306a36Sopenharmony_ci			*total += delta;
22662306a36Sopenharmony_ci		else
22762306a36Sopenharmony_ci			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci		/* prevent to reuse the timestamp later */
23062306a36Sopenharmony_ci		pelem->timestamp = 0;
23162306a36Sopenharmony_ci	}
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	return 0;
23462306a36Sopenharmony_ci}
23562306a36Sopenharmony_ci
23662306a36Sopenharmony_ciSEC("tp_btf/task_newtask")
23762306a36Sopenharmony_ciint on_newtask(u64 *ctx)
23862306a36Sopenharmony_ci{
23962306a36Sopenharmony_ci	struct task_struct *task;
24062306a36Sopenharmony_ci	u64 clone_flags;
24162306a36Sopenharmony_ci	u32 pid;
24262306a36Sopenharmony_ci	u8 val = 1;
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	if (!uses_tgid)
24562306a36Sopenharmony_ci		return 0;
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	task = (struct task_struct *)bpf_get_current_task();
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	pid = BPF_CORE_READ(task, tgid);
25062306a36Sopenharmony_ci	if (!bpf_map_lookup_elem(&task_filter, &pid))
25162306a36Sopenharmony_ci		return 0;
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci	task = (struct task_struct *)ctx[0];
25462306a36Sopenharmony_ci	clone_flags = ctx[1];
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	pid = task->tgid;
25762306a36Sopenharmony_ci	if (!(clone_flags & CLONE_THREAD))
25862306a36Sopenharmony_ci		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	return 0;
26162306a36Sopenharmony_ci}
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ciSEC("tp_btf/sched_switch")
26462306a36Sopenharmony_ciint on_switch(u64 *ctx)
26562306a36Sopenharmony_ci{
26662306a36Sopenharmony_ci	struct task_struct *prev, *next;
26762306a36Sopenharmony_ci	int prev_state;
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	if (!enabled)
27062306a36Sopenharmony_ci		return 0;
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	prev = (struct task_struct *)ctx[1];
27362306a36Sopenharmony_ci	next = (struct task_struct *)ctx[2];
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_ci	if (has_prev_state)
27662306a36Sopenharmony_ci		prev_state = (int)ctx[3];
27762306a36Sopenharmony_ci	else
27862306a36Sopenharmony_ci		prev_state = get_task_state(prev);
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
28162306a36Sopenharmony_ci}
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_cichar LICENSE[] SEC("license") = "Dual BSD/GPL";
284