// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

struct tstamp_data {
	__u64 timestamp;
	__u64 lock;
	__u32 flags;
	__s32 stack_id;
};

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

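/* per-task data (e.g. comm) keyed by pid, filled by update_task_data() */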
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

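/* known lock addresses and their lock class, filled by collect_lock_syms() */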
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

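/* cpus to track when has_cpu is set, see can_record() */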
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

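/* tasks (pids) to track when has_task is set, see can_record() */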
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

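/* lock types (flags) to track when has_type is set, see can_record() */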
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

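/* lock addresses to track when has_addr is set, see can_record() */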
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

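/*
 * CO-RE compat definitions: the rwsem owner field changed from a
 * task_struct pointer to atomic_long_t, and mmap_sem in mm_struct was
 * renamed to mmap_lock.  The matching layout is picked at load time.
 */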
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

/* control flags */
int enabled;
int has_cpu;
int has_task;
int has_type;
int has_addr;
int needs_callstack;
int stack_skip;
int lock_owner;

/* determines which key is used in the lock_stat map */
int aggr_mode;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

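/* check if the current event passes the cpu/task/type/addr filters */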
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok)
			return 0;
	}

	return 1;
}

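/* save the comm of the given task in task_data, keyed by pid */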
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

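/* read the owner task of a mutex or rwsem, or NULL if it cannot be determined */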
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
	/*
	 * Support for the BPF_TYPE_MATCHES argument to the
	 * __builtin_preserve_type_info builtin was added at some point during
	 * development of clang 15 and it's what is needed for
	 * bpf_core_type_matches.
	 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

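/* classify well-known locks (mmap_lock, siglock) by their address */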
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		if (&curr->sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

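/* save the timestamp, lock info and optionally callstack/owner at contention start */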
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	__u32 pid;
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	if (pelem && pelem->lock)
		return 0;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return 0;
		}
	}

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

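/* compute the contention duration and update the matching lock_stat entry */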
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 duration;

	if (!enabled)
		return 0;

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	if (!pelem || pelem->lock != ctx[0])
		return 0;

	duration = bpf_ktime_get_ns() - pelem->timestamp;
	if ((__s64)duration < 0) {
		bpf_map_delete_elem(&tstamp, &pid);
		__sync_fetch_and_add(&time_fail, 1);
		return 0;
	}

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else
			key.pid = pid;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			bpf_map_delete_elem(&tstamp, &pid);
			__sync_fetch_and_add(&data_fail, 1);
			return 0;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR)
			first.flags |= check_lock_type(pelem->lock, pelem->flags);

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		bpf_map_delete_elem(&tstamp, &pid);
		return 0;
	}

	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, 1);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;

	bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

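/*
 * Record each cpu's runqueue lock address in the lock_syms map as
 * LOCK_CLASS_RQLOCK.  The rq lock field was renamed from 'lock' to
 * '__lock', hence the two CO-RE variants below.  This program is
 * presumably triggered once from user space before profiling starts.
 */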
extern struct rq runqueues __ksym;

struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";