1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6 #include "sched.h"
7
8 #include "pelt.h"
9 #include "walt.h"
10
11 int sched_rr_timeslice = RR_TIMESLICE;
12 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
13 /* More than 4 hours if BW_SHIFT equals 20. */
14 static const u64 max_rt_runtime = MAX_BW;
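/*
 * Illustration (values assumed, not taken from this file): sched_rr_timeslice
 * holds the SCHED_RR quantum in jiffies while the sysctl is exposed in
 * milliseconds. Assuming the usual RR_TIMESLICE of 100ms worth of jiffies,
 * with HZ == 250:
 *
 *   RR_TIMESLICE              = 100 * HZ / 1000 = 25 jiffies
 *   sysctl_sched_rr_timeslice = (1000 / 250) * 25 = 100 ms
 *
 * Likewise, assuming MAX_BW is (1ULL << (64 - BW_SHIFT)) - 1, the limit above
 * is roughly 2^44 ns ~= 17592 s ~= 4.9 hours, matching the comment.
 */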
15
16 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
17
18 struct rt_bandwidth def_rt_bandwidth;
19
20 #ifdef CONFIG_SCHED_RT_CAS
21 unsigned int sysctl_sched_enable_rt_cas = 1;
22 #endif
23
24 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
25 unsigned int sysctl_sched_enable_rt_active_lb = 1;
26 #endif
27
28 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
29 {
30 struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer);
31 int idle = 0;
32 int overrun;
33
34 raw_spin_lock(&rt_b->rt_runtime_lock);
35 for (;;) {
36 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
37 if (!overrun) {
38 break;
39 }
40
41 raw_spin_unlock(&rt_b->rt_runtime_lock);
42 idle = do_sched_rt_period_timer(rt_b, overrun);
43 raw_spin_lock(&rt_b->rt_runtime_lock);
44 }
45 if (idle) {
46 rt_b->rt_period_active = 0;
47 }
48 raw_spin_unlock(&rt_b->rt_runtime_lock);
49
50 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
51 }
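/*
 * Rough sketch of the replenishment loop above: hrtimer_forward_now()
 * advances the period timer until its expiry lies in the future and returns
 * how many periods were skipped. That count is handed to
 * do_sched_rt_period_timer(), which refills runtime for the missed periods;
 * once no further period has elapsed (overrun == 0) the loop ends and the
 * timer is restarted only if some rt_rq is still active.
 */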
52
53 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
54 {
55 rt_b->rt_period = ns_to_ktime(period);
56 rt_b->rt_runtime = runtime;
57
58 raw_spin_lock_init(&rt_b->rt_runtime_lock);
59
60 hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
61 rt_b->rt_period_timer.function = sched_rt_period_timer;
62 }
63
64 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
65 {
66 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) {
67 return;
68 }
69
70 raw_spin_lock(&rt_b->rt_runtime_lock);
71 if (!rt_b->rt_period_active) {
72 rt_b->rt_period_active = 1;
73 /*
74 * SCHED_DEADLINE updates the bandwidth, as a run away
75 * RT task with a DL task could hog a CPU. But DL does
76 * not reset the period. If a deadline task was running
77 * without an RT task running, it can cause RT tasks to
78 * throttle when they start up. Kick the timer right away
79 * to update the period.
80 */
81 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
82 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED_HARD);
83 }
84 raw_spin_unlock(&rt_b->rt_runtime_lock);
85 }
86
87 void init_rt_rq(struct rt_rq *rt_rq)
88 {
89 struct rt_prio_array *array;
90 int i;
91
92 array = &rt_rq->active;
93 for (i = 0; i < MAX_RT_PRIO; i++) {
94 INIT_LIST_HEAD(array->queue + i);
95 __clear_bit(i, array->bitmap);
96 }
97 /* delimiter for bitsearch: */
98 __set_bit(MAX_RT_PRIO, array->bitmap);
99
100 #if defined CONFIG_SMP
101 rt_rq->highest_prio.curr = MAX_RT_PRIO;
102 rt_rq->highest_prio.next = MAX_RT_PRIO;
103 rt_rq->rt_nr_migratory = 0;
104 rt_rq->overloaded = 0;
105 plist_head_init(&rt_rq->pushable_tasks);
106 #endif /* CONFIG_SMP */
107 /* We start in dequeued state, because no RT tasks are queued */
108 rt_rq->rt_queued = 0;
109
110 rt_rq->rt_time = 0;
111 rt_rq->rt_throttled = 0;
112 rt_rq->rt_runtime = 0;
113 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
114 }
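/*
 * Illustration (an assumption about the bitmap helpers, not code from this
 * file): the delimiter bit set above means a bitmap scan always terminates:
 *
 *   int idx = sched_find_first_bit(array->bitmap);
 *   if (idx >= MAX_RT_PRIO)
 *           ;  // nothing queued, only the delimiter bit is set
 *
 * which is why pick_next_rt_entity() below can BUG_ON(idx >= MAX_RT_PRIO)
 * once rt_nr_running says there is something to pick.
 */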
115
116 #ifdef CONFIG_RT_GROUP_SCHED
117 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
118 {
119 hrtimer_cancel(&rt_b->rt_period_timer);
120 }
121
122 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
123
124 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
125 {
126 #ifdef CONFIG_SCHED_DEBUG
127 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
128 #endif
129 return container_of(rt_se, struct task_struct, rt);
130 }
131
132 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
133 {
134 return rt_rq->rq;
135 }
136
137 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
138 {
139 return rt_se->rt_rq;
140 }
141
142 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
143 {
144 struct rt_rq *rt_rq = rt_se->rt_rq;
145
146 return rt_rq->rq;
147 }
148
149 void free_rt_sched_group(struct task_group *tg)
150 {
151 int i;
152
153 if (tg->rt_se) {
154 destroy_rt_bandwidth(&tg->rt_bandwidth);
155 }
156
157 for_each_possible_cpu(i)
158 {
159 if (tg->rt_rq) {
160 kfree(tg->rt_rq[i]);
161 }
162 if (tg->rt_se) {
163 kfree(tg->rt_se[i]);
164 }
165 }
166
167 kfree(tg->rt_rq);
168 kfree(tg->rt_se);
169 }
170
171 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu,
172 struct sched_rt_entity *parent)
173 {
174 struct rq *rq = cpu_rq(cpu);
175
176 rt_rq->highest_prio.curr = MAX_RT_PRIO;
177 rt_rq->rt_nr_boosted = 0;
178 rt_rq->rq = rq;
179 rt_rq->tg = tg;
180
181 tg->rt_rq[cpu] = rt_rq;
182 tg->rt_se[cpu] = rt_se;
183
184 if (!rt_se) {
185 return;
186 }
187
188 if (!parent) {
189 rt_se->rt_rq = &rq->rt;
190 } else {
191 rt_se->rt_rq = parent->my_q;
192 }
193
194 rt_se->my_q = rt_rq;
195 rt_se->parent = parent;
196 INIT_LIST_HEAD(&rt_se->run_list);
197 }
198
199 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
200 {
201 struct rt_rq *rt_rq;
202 struct sched_rt_entity *rt_se;
203 int i;
204
205 tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
206 if (!tg->rt_rq) {
207 goto err;
208 }
209 tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
210 if (!tg->rt_se) {
211 goto err;
212 }
213
214 init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(def_rt_bandwidth.rt_period), 0);
215
216 for_each_possible_cpu(i)
217 {
218 rt_rq = kzalloc_node(sizeof(struct rt_rq), GFP_KERNEL, cpu_to_node(i));
219 if (!rt_rq) {
220 goto err;
221 }
222
223 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i));
224 if (!rt_se) {
225 goto err_free_rq;
226 }
227
228 init_rt_rq(rt_rq);
229 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
230 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
231 }
232
233 return 1;
234
235 err_free_rq:
236 kfree(rt_rq);
237 err:
238 return 0;
239 }
240
241 #else /* CONFIG_RT_GROUP_SCHED */
242
243 #define rt_entity_is_task(rt_se) (1)
244
245 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
246 {
247 return container_of(rt_se, struct task_struct, rt);
248 }
249
250 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
251 {
252 return container_of(rt_rq, struct rq, rt);
253 }
254
255 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
256 {
257 struct task_struct *p = rt_task_of(rt_se);
258
259 return task_rq(p);
260 }
261
262 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
263 {
264 struct rq *rq = rq_of_rt_se(rt_se);
265
266 return &rq->rt;
267 }
268
269 void free_rt_sched_group(struct task_group *tg)
270 {
271 }
272
273 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
274 {
275 return 1;
276 }
277 #endif /* CONFIG_RT_GROUP_SCHED */
278
279 #ifdef CONFIG_SMP
280
281 static void pull_rt_task(struct rq *this_rq);
282
283 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
284 {
285 /*
286 * Try to pull RT tasks here if we lower this rq's prio and cpu is not
287 * isolated
288 */
289 return rq->rt.highest_prio.curr > prev->prio && !cpu_isolated(cpu_of(rq));
290 }
291
292 static inline int rt_overloaded(struct rq *rq)
293 {
294 return atomic_read(&rq->rd->rto_count);
295 }
296
297 static inline void rt_set_overload(struct rq *rq)
298 {
299 if (!rq->online) {
300 return;
301 }
302
303 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
304 /*
305 * Make sure the mask is visible before we set
306 * the overload count. That is checked to determine
307 * if we should look at the mask. It would be a shame
308 * if we looked at the mask, but the mask was not
309 * updated yet.
310 *
311 * Matched by the barrier in pull_rt_task().
312 */
313 smp_wmb();
314 atomic_inc(&rq->rd->rto_count);
315 }
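/*
 * Sketch of the reader side mentioned above (pull_rt_task() is not part of
 * this excerpt, so this is an assumption about its usual shape): the counter
 * is checked first and the mask scanned only afterwards, with a read barrier
 * in between:
 *
 *   if (likely(!rt_overloaded(this_rq)))
 *           return;
 *   smp_rmb();              // pairs with the smp_wmb() above
 *   for_each_cpu(cpu, this_rq->rd->rto_mask) { ... }
 *
 * so a non-zero rto_count guarantees the corresponding rto_mask bit is
 * already visible.
 */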
316
317 static inline void rt_clear_overload(struct rq *rq)
318 {
319 if (!rq->online) {
320 return;
321 }
322
323 /* the order here really doesn't matter */
324 atomic_dec(&rq->rd->rto_count);
325 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
326 }
327
328 static void update_rt_migration(struct rt_rq *rt_rq)
329 {
330 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
331 if (!rt_rq->overloaded) {
332 rt_set_overload(rq_of_rt_rq(rt_rq));
333 rt_rq->overloaded = 1;
334 }
335 } else if (rt_rq->overloaded) {
336 rt_clear_overload(rq_of_rt_rq(rt_rq));
337 rt_rq->overloaded = 0;
338 }
339 }
340
341 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
342 {
343 struct task_struct *p;
344
345 if (!rt_entity_is_task(rt_se)) {
346 return;
347 }
348
349 p = rt_task_of(rt_se);
350 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
351
352 rt_rq->rt_nr_total++;
353 if (p->nr_cpus_allowed > 1) {
354 rt_rq->rt_nr_migratory++;
355 }
356
357 update_rt_migration(rt_rq);
358 }
359
360 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
361 {
362 struct task_struct *p;
363
364 if (!rt_entity_is_task(rt_se)) {
365 return;
366 }
367
368 p = rt_task_of(rt_se);
369 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
370
371 rt_rq->rt_nr_total--;
372 if (p->nr_cpus_allowed > 1) {
373 rt_rq->rt_nr_migratory--;
374 }
375
376 update_rt_migration(rt_rq);
377 }
378
379 static inline int has_pushable_tasks(struct rq *rq)
380 {
381 return !plist_head_empty(&rq->rt.pushable_tasks);
382 }
383
384 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
385 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
386
387 static void push_rt_tasks(struct rq *);
388 static void pull_rt_task(struct rq *);
389
390 static inline void rt_queue_push_tasks(struct rq *rq)
391 {
392 if (!has_pushable_tasks(rq)) {
393 return;
394 }
395
396 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
397 }
398
399 static inline void rt_queue_pull_task(struct rq *rq)
400 {
401 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
402 }
403
404 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
405 {
406 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
407 plist_node_init(&p->pushable_tasks, p->prio);
408 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
409
410 /* Update the highest prio pushable task */
411 if (p->prio < rq->rt.highest_prio.next) {
412 rq->rt.highest_prio.next = p->prio;
413 }
414 }
415
416 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
417 {
418 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
419
420 /* Update the new highest prio pushable task */
421 if (has_pushable_tasks(rq)) {
422 p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks);
423 rq->rt.highest_prio.next = p->prio;
424 } else {
425 rq->rt.highest_prio.next = MAX_RT_PRIO;
426 }
427 }
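/*
 * Note (relies on the usual plist semantics, an assumption for this comment):
 * pushable_tasks is a priority-sorted list keyed on p->prio, where a lower
 * value means higher RT priority. plist_first_entry() therefore yields the
 * highest-priority pushable task, which is why highest_prio.next can be
 * refreshed from the list head alone after a removal.
 */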
428
429 #else
430
431 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
432 {
433 }
434
435 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
436 {
437 }
438
439 static inline void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
440 {
441 }
442
443 static inline void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
444 {
445 }
446
447 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
448 {
449 return false;
450 }
451
452 static inline void pull_rt_task(struct rq *this_rq)
453 {
454 }
455
456 static inline void rt_queue_push_tasks(struct rq *rq)
457 {
458 }
459 #endif /* CONFIG_SMP */
460
461 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
462 static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
463
464 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
465 {
466 return rt_se->on_rq;
467 }
468
469 #ifdef CONFIG_UCLAMP_TASK
470 /*
471 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
472 * settings.
473 *
474 * This check is only important for heterogeneous systems where uclamp_min value
475 * is higher than the capacity of a @cpu. For non-heterogeneous systems this
476 * function will always return true.
477 *
478 * The function will return true if the capacity of the @cpu is >= the
479 * uclamp_min and false otherwise.
480 *
481 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
482 * > uclamp_max.
483 */
484 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
485 {
486 unsigned int min_cap;
487 unsigned int max_cap;
488 unsigned int cpu_cap;
489
490 /* Only heterogeneous systems can benefit from this check */
491 if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
492 return true;
493 }
494
495 min_cap = uclamp_eff_value(p, UCLAMP_MIN);
496 max_cap = uclamp_eff_value(p, UCLAMP_MAX);
497
498 cpu_cap = capacity_orig_of(cpu);
499
500 return cpu_cap >= min(min_cap, max_cap);
501 }
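/*
 * Worked example (numbers are illustrative only): on a big.LITTLE system a
 * little CPU might have capacity_orig_of(cpu) == 400. A task with
 * uclamp_min == 512 does not fit there (400 < min(512, 1024)), but if its
 * uclamp_max is lowered to 300 the effective request becomes
 * min(512, 300) == 300 and the little CPU fits again.
 */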
502 #else
503 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
504 {
505 return true;
506 }
507 #endif
508
509 #ifdef CONFIG_RT_GROUP_SCHED
510
511 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
512 {
513 if (!rt_rq->tg) {
514 return RUNTIME_INF;
515 }
516
517 return rt_rq->rt_runtime;
518 }
519
520 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
521 {
522 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
523 }
524
525 typedef struct task_group *rt_rq_iter_t;
526
527 static inline struct task_group *next_task_group(struct task_group *tg)
528 {
529 do {
530 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
531 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
532
533 if (&tg->list == &task_groups) {
534 tg = NULL;
535 }
536
537 return tg;
538 }
539
540 #define cycle_each_rt_rq(rt_rq, iter, rq) \
541 for (iter = container_of(&task_groups, typeof(*iter), list); \
542 (iter = next_task_group(iter)) && \
543 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
544
545
546 #define cycle_each_sched_rt_entity(rt_se) for (; rt_se; rt_se = rt_se->parent)
547
548 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
549 {
550 return rt_se->my_q;
551 }
552
553 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
554 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
555
556 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
557 {
558 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
559 struct rq *rq = rq_of_rt_rq(rt_rq);
560 struct sched_rt_entity *rt_se;
561
562 int cpu = cpu_of(rq);
563
564 rt_se = rt_rq->tg->rt_se[cpu];
565
566 if (rt_rq->rt_nr_running) {
567 if (!rt_se) {
568 enqueue_top_rt_rq(rt_rq);
569 } else if (!on_rt_rq(rt_se)) {
570 enqueue_rt_entity(rt_se, 0);
571 }
572
573 if (rt_rq->highest_prio.curr < curr->prio) {
574 resched_curr(rq);
575 }
576 }
577 }
578
579 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
580 {
581 struct sched_rt_entity *rt_se;
582 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
583
584 rt_se = rt_rq->tg->rt_se[cpu];
585
586 if (!rt_se) {
587 dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
588 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
589 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
590 } else if (on_rt_rq(rt_se)) {
591 dequeue_rt_entity(rt_se, 0);
592 }
593 }
594
595 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
596 {
597 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
598 }
599
600 static int rt_se_boosted(struct sched_rt_entity *rt_se)
601 {
602 struct rt_rq *rt_rq = group_rt_rq(rt_se);
603 struct task_struct *p;
604
605 if (rt_rq) {
606 return !!rt_rq->rt_nr_boosted;
607 }
608
609 p = rt_task_of(rt_se);
610 return p->prio != p->normal_prio;
611 }
612
613 #ifdef CONFIG_SMP
614 static inline const struct cpumask *sched_rt_period_mask(void)
615 {
616 return this_rq()->rd->span;
617 }
618 #else
619 static inline const struct cpumask *sched_rt_period_mask(void)
620 {
621 return cpu_online_mask;
622 }
623 #endif
624
625 static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
626 {
627 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
628 }
629
630 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
631 {
632 return &rt_rq->tg->rt_bandwidth;
633 }
634
635 #else /* !CONFIG_RT_GROUP_SCHED */
636
637 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
638 {
639 return rt_rq->rt_runtime;
640 }
641
642 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
643 {
644 return ktime_to_ns(def_rt_bandwidth.rt_period);
645 }
646
647 typedef struct rt_rq *rt_rq_iter_t;
648
649 #define cycle_each_rt_rq(rt_rq, iter, rq) for ((void)(iter), (rt_rq) = &(rq)->rt; (rt_rq); (rt_rq) = NULL)
650
651 #define cycle_each_sched_rt_entity(rt_se) for (; rt_se; rt_se = NULL)
652
653 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
654 {
655 return NULL;
656 }
657
658 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
659 {
660 struct rq *rq = rq_of_rt_rq(rt_rq);
661
662 if (!rt_rq->rt_nr_running) {
663 return;
664 }
665
666 enqueue_top_rt_rq(rt_rq);
667 resched_curr(rq);
668 }
669
670 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
671 {
672 dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
673 }
674
675 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
676 {
677 return rt_rq->rt_throttled;
678 }
679
680 static inline const struct cpumask *sched_rt_period_mask(void)
681 {
682 return cpu_online_mask;
683 }
684
685 static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
686 {
687 return &cpu_rq(cpu)->rt;
688 }
689
690 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
691 {
692 return &def_rt_bandwidth;
693 }
694
695 #endif /* CONFIG_RT_GROUP_SCHED */
696
697 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
698 {
699 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
700
701 return (hrtimer_active(&rt_b->rt_period_timer) || rt_rq->rt_time < rt_b->rt_runtime);
702 }
703
704 #ifdef CONFIG_SMP
705 /*
706 * We ran out of runtime, see if we can borrow some from our neighbours.
707 */
708 static void do_balance_runtime(struct rt_rq *rt_rq)
709 {
710 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
711 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
712 int i, weight;
713 u64 rt_period;
714
715 weight = cpumask_weight(rd->span);
716
717 raw_spin_lock(&rt_b->rt_runtime_lock);
718 rt_period = ktime_to_ns(rt_b->rt_period);
719 for_each_cpu(i, rd->span)
720 {
721 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
722 s64 diff;
723
724 if (iter == rt_rq) {
725 continue;
726 }
727
728 raw_spin_lock(&iter->rt_runtime_lock);
729 /*
730 * Either all rqs have inf runtime and there's nothing to steal
731 * or __disable_runtime() below sets a specific rq to inf to
732 * indicate it's been disabled and disallow stealing.
733 */
734 if (iter->rt_runtime == RUNTIME_INF) {
735 goto next;
736 }
737
738 /*
739 * From runqueues with spare time, take 1/n part of their
740 * spare time, but no more than our period.
741 */
742 diff = iter->rt_runtime - iter->rt_time;
743 if (diff > 0) {
744 diff = div_u64((u64)diff, weight);
745 if (rt_rq->rt_runtime + diff > rt_period) {
746 diff = rt_period - rt_rq->rt_runtime;
747 }
748 iter->rt_runtime -= diff;
749 rt_rq->rt_runtime += diff;
750 if (rt_rq->rt_runtime == rt_period) {
751 raw_spin_unlock(&iter->rt_runtime_lock);
752 break;
753 }
754 }
755 next:
756 raw_spin_unlock(&iter->rt_runtime_lock);
757 }
758 raw_spin_unlock(&rt_b->rt_runtime_lock);
759 }
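/*
 * Worked example of the borrowing above (illustrative values): with a 4-CPU
 * root domain, rt_period == 1s and rt_runtime == 950ms per rt_rq, a
 * neighbour that has consumed 600ms has 350ms spare and donates
 * 350ms / 4 ~= 87ms to the starved rt_rq, capped so the borrower never holds
 * more than one full period of runtime.
 */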
760
761 /*
762 * Ensure this RQ takes back all the runtime it lent to its neighbours.
763 */
764 static void __disable_runtime(struct rq *rq)
765 {
766 struct root_domain *rd = rq->rd;
767 rt_rq_iter_t iter;
768 struct rt_rq *rt_rq;
769
770 if (unlikely(!scheduler_running)) {
771 return;
772 }
773
774 cycle_each_rt_rq(rt_rq, iter, rq) {
775 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
776 s64 want;
777 int i;
778
779 raw_spin_lock(&rt_b->rt_runtime_lock);
780 raw_spin_lock(&rt_rq->rt_runtime_lock);
781 /*
782 * Either we're all inf and nobody needs to borrow, or we're
783 * already disabled and thus have nothing to do, or we have
784 * exactly the right amount of runtime to take out.
785 */
786 if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) {
787 goto balanced;
788 }
789 raw_spin_unlock(&rt_rq->rt_runtime_lock);
790
791 /*
792 * Calculate the difference between what we started out with
793 * and what we currently have, that's the amount of runtime
794 * we lent out and now have to reclaim.
795 */
796 want = rt_b->rt_runtime - rt_rq->rt_runtime;
797
798 /*
799 * Greedy reclaim, take back as much as we can.
800 */
801 for_each_cpu(i, rd->span)
802 {
803 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
804 s64 diff;
805
806 /*
807 * Can't reclaim from ourselves or disabled runqueues.
808 */
809 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) {
810 continue;
811 }
812
813 raw_spin_lock(&iter->rt_runtime_lock);
814 if (want > 0) {
815 diff = min_t(s64, iter->rt_runtime, want);
816 iter->rt_runtime -= diff;
817 want -= diff;
818 } else {
819 iter->rt_runtime -= want;
820 want -= want;
821 }
822 raw_spin_unlock(&iter->rt_runtime_lock);
823
824 if (!want) {
825 break;
826 }
827 }
828
829 raw_spin_lock(&rt_rq->rt_runtime_lock);
830 /*
831 * We cannot be left wanting - that would mean some runtime
832 * leaked out of the system.
833 */
834 BUG_ON(want);
835 balanced:
836 /*
837 * Disable all the borrow logic by pretending we have inf
838 * runtime - in which case borrowing doesn't make sense.
839 */
840 rt_rq->rt_runtime = RUNTIME_INF;
841 rt_rq->rt_throttled = 0;
842 raw_spin_unlock(&rt_rq->rt_runtime_lock);
843 raw_spin_unlock(&rt_b->rt_runtime_lock);
844
845 /* Make rt_rq available for pick_next_task() */
846 sched_rt_rq_enqueue(rt_rq);
847 }
848 }
849
850 static void __enable_runtime(struct rq *rq)
851 {
852 rt_rq_iter_t iter;
853 struct rt_rq *rt_rq;
854
855 if (unlikely(!scheduler_running)) {
856 return;
857 }
858
859 /*
860 * Reset each runqueue's bandwidth settings
861 */
862 cycle_each_rt_rq(rt_rq, iter, rq) {
863 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
864
865 raw_spin_lock(&rt_b->rt_runtime_lock);
866 raw_spin_lock(&rt_rq->rt_runtime_lock);
867 rt_rq->rt_runtime = rt_b->rt_runtime;
868 rt_rq->rt_time = 0;
869 rt_rq->rt_throttled = 0;
870 raw_spin_unlock(&rt_rq->rt_runtime_lock);
871 raw_spin_unlock(&rt_b->rt_runtime_lock);
872 }
873 }
874
875 static void balance_runtime(struct rt_rq *rt_rq)
876 {
877 if (!sched_feat(RT_RUNTIME_SHARE)) {
878 return;
879 }
880
881 if (rt_rq->rt_time > rt_rq->rt_runtime) {
882 raw_spin_unlock(&rt_rq->rt_runtime_lock);
883 do_balance_runtime(rt_rq);
884 raw_spin_lock(&rt_rq->rt_runtime_lock);
885 }
886 }
887 #else /* !CONFIG_SMP */
888 static inline void balance_runtime(struct rt_rq *rt_rq)
889 {
890 }
891 #endif /* CONFIG_SMP */
892
893 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
894 {
895 int i, idle = 1, throttled = 0;
896 const struct cpumask *span;
897
898 span = sched_rt_period_mask();
899 #ifdef CONFIG_RT_GROUP_SCHED
900 /*
901 * When the tasks in the task_group run on either isolated
902 * CPUs or non-isolated CPUs, whether they are isolcpus or
903 * were isolated via cpusets, check all the online rt_rqs
904 * lest the timer run on a CPU which does not service
905 * all runqueues, potentially leaving other CPUs indefinitely
906 * throttled.
907 */
908 span = cpu_online_mask;
909 #endif
910 for_each_cpu(i, span)
911 {
912 int enqueue = 0;
913 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
914 struct rq *rq = rq_of_rt_rq(rt_rq);
915 int skip;
916
917 /*
918 * When span == cpu_online_mask, taking each rq->lock
919 * can be time-consuming. Try to avoid it when possible.
920 */
921 raw_spin_lock(&rt_rq->rt_runtime_lock);
922 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) {
923 rt_rq->rt_runtime = rt_b->rt_runtime;
924 }
925 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
926 raw_spin_unlock(&rt_rq->rt_runtime_lock);
927 if (skip) {
928 continue;
929 }
930
931 raw_spin_lock(&rq->lock);
932 update_rq_clock(rq);
933
934 if (rt_rq->rt_time) {
935 u64 runtime;
936
937 raw_spin_lock(&rt_rq->rt_runtime_lock);
938 if (rt_rq->rt_throttled) {
939 balance_runtime(rt_rq);
940 }
941 runtime = rt_rq->rt_runtime;
942 rt_rq->rt_time -= min(rt_rq->rt_time, overrun * runtime);
943 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
944 rt_rq->rt_throttled = 0;
945 enqueue = 1;
946
947 /*
948 * When we're idle and a woken (rt) task is
949 * throttled check_preempt_curr() will set
950 * skip_update and the time between the wakeup
951 * and this unthrottle will get accounted as
952 * 'runtime'.
953 */
954 if (rt_rq->rt_nr_running && rq->curr == rq->idle) {
955 rq_clock_cancel_skipupdate(rq);
956 }
957 }
958 if (rt_rq->rt_time || rt_rq->rt_nr_running) {
959 idle = 0;
960 }
961 raw_spin_unlock(&rt_rq->rt_runtime_lock);
962 } else if (rt_rq->rt_nr_running) {
963 idle = 0;
964 if (!rt_rq_throttled(rt_rq)) {
965 enqueue = 1;
966 }
967 }
968 if (rt_rq->rt_throttled) {
969 throttled = 1;
970 }
971
972 if (enqueue) {
973 sched_rt_rq_enqueue(rt_rq);
974 }
975 raw_spin_unlock(&rq->lock);
976 }
977
978 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) {
979 return 1;
980 }
981
982 return idle;
983 }
984
985 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
986 {
987 #ifdef CONFIG_RT_GROUP_SCHED
988 struct rt_rq *rt_rq = group_rt_rq(rt_se);
989
990 if (rt_rq) {
991 return rt_rq->highest_prio.curr;
992 }
993 #endif
994
995 return rt_task_of(rt_se)->prio;
996 }
997
998 static inline void try_start_rt_bandwidth(struct rt_bandwidth *rt_b)
999 {
1000 raw_spin_lock(&rt_b->rt_runtime_lock);
1001 if (!rt_b->rt_period_active) {
1002 rt_b->rt_period_active = 1;
1003 hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
1004 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED_HARD);
1005 }
1006 raw_spin_unlock(&rt_b->rt_runtime_lock);
1007 }
1008
1009 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
1010 {
1011 u64 runtime = sched_rt_runtime(rt_rq);
1012
1013 if (rt_rq->rt_throttled) {
1014 return rt_rq_throttled(rt_rq);
1015 }
1016
1017 if (runtime >= sched_rt_period(rt_rq)) {
1018 return 0;
1019 }
1020
1021 balance_runtime(rt_rq);
1022 runtime = sched_rt_runtime(rt_rq);
1023 if (runtime == RUNTIME_INF) {
1024 return 0;
1025 }
1026
1027 if (rt_rq->rt_time > runtime) {
1028 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1029
1030 /*
1031 * Don't actually throttle groups that have no runtime assigned
1032 * but accrue some time due to boosting.
1033 */
1034 if (likely(rt_b->rt_runtime)) {
1035 rt_rq->rt_throttled = 1;
1036 printk_deferred_once("sched: RT throttling activated\n");
1037 } else {
1038 /*
1039 * In case we did anyway, make it go away,
1040 * replenishment is a joke, since it will replenish us
1041 * with exactly 0 ns.
1042 */
1043 rt_rq->rt_time = 0;
1044 }
1045
1046 if (rt_rq_throttled(rt_rq)) {
1047 sched_rt_rq_dequeue(rt_rq);
1048 return 1;
1049 }
1050 }
1051
1052 return 0;
1053 }
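/*
 * Example with the usual defaults (assumption: sched_rt_runtime_us == 950000
 * and sched_rt_period_us == 1000000): once an rt_rq accumulates more than
 * 950ms of rt_time inside a 1s period and cannot borrow the difference, it
 * is marked throttled and dequeued, leaving the remaining ~50ms to other
 * classes until the period timer replenishes the runtime.
 */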
1054
1055 /*
1056 * Update the current task's runtime statistics. Skip current tasks that
1057 * are not in our scheduling class.
1058 */
1059 static void update_curr_rt(struct rq *rq)
1060 {
1061 struct task_struct *curr = rq->curr;
1062 struct sched_rt_entity *rt_se = &curr->rt;
1063 u64 delta_exec;
1064 u64 now;
1065
1066 if (curr->sched_class != &rt_sched_class) {
1067 return;
1068 }
1069
1070 now = rq_clock_task(rq);
1071 delta_exec = now - curr->se.exec_start;
1072 if (unlikely((s64)delta_exec <= 0)) {
1073 return;
1074 }
1075
1076 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
1077
1078 curr->se.sum_exec_runtime += delta_exec;
1079 account_group_exec_runtime(curr, delta_exec);
1080
1081 curr->se.exec_start = now;
1082 cgroup_account_cputime(curr, delta_exec);
1083
1084 if (!rt_bandwidth_enabled()) {
1085 return;
1086 }
1087
1088 cycle_each_sched_rt_entity(rt_se) {
1089 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1090 int exceeded;
1091
1092 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1093 raw_spin_lock(&rt_rq->rt_runtime_lock);
1094 rt_rq->rt_time += delta_exec;
1095 exceeded = sched_rt_runtime_exceeded(rt_rq);
1096 if (exceeded) {
1097 resched_curr(rq);
1098 }
1099 raw_spin_unlock(&rt_rq->rt_runtime_lock);
1100 if (exceeded) {
1101 try_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1102 }
1103 }
1104 }
1105 }
1106
1107 static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1108 {
1109 struct rq *rq = rq_of_rt_rq(rt_rq);
1110
1111 BUG_ON(&rq->rt != rt_rq);
1112
1113 if (!rt_rq->rt_queued) {
1114 return;
1115 }
1116
1117 BUG_ON(!rq->nr_running);
1118
1119 sub_nr_running(rq, count);
1120 rt_rq->rt_queued = 0;
1121 }
1122
1123 static void enqueue_top_rt_rq(struct rt_rq *rt_rq)
1124 {
1125 struct rq *rq = rq_of_rt_rq(rt_rq);
1126
1127 BUG_ON(&rq->rt != rt_rq);
1128
1129 if (rt_rq->rt_queued) {
1130 return;
1131 }
1132
1133 if (rt_rq_throttled(rt_rq)) {
1134 return;
1135 }
1136
1137 if (rt_rq->rt_nr_running) {
1138 add_nr_running(rq, rt_rq->rt_nr_running);
1139 rt_rq->rt_queued = 1;
1140 }
1141
1142 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1143 cpufreq_update_util(rq, 0);
1144 }
1145
1146 #if defined CONFIG_SMP
1147
1148 static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1149 {
1150 struct rq *rq = rq_of_rt_rq(rt_rq);
1151
1152 #ifdef CONFIG_RT_GROUP_SCHED
1153 /*
1154 * Change rq's cpupri only if rt_rq is the top queue.
1155 */
1156 if (&rq->rt != rt_rq) {
1157 return;
1158 }
1159 #endif
1160 if (rq->online && prio < prev_prio) {
1161 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1162 }
1163 }
1164
1165 static void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1166 {
1167 struct rq *rq = rq_of_rt_rq(rt_rq);
1168
1169 #ifdef CONFIG_RT_GROUP_SCHED
1170 /*
1171 * Change rq's cpupri only if rt_rq is the top queue.
1172 */
1173 if (&rq->rt != rt_rq) {
1174 return;
1175 }
1176 #endif
1177 if (rq->online && rt_rq->highest_prio.curr != prev_prio) {
1178 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1179 }
1180 }
1181
1182 #else /* CONFIG_SMP */
1183
1184 static inline void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1185 {
1186 }
1187 static inline void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1188 {
1189 }
1190
1191 #endif /* CONFIG_SMP */
1192
1193 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1194 static void inc_rt_prio(struct rt_rq *rt_rq, int prio)
1195 {
1196 int prev_prio = rt_rq->highest_prio.curr;
1197
1198 if (prio < prev_prio) {
1199 rt_rq->highest_prio.curr = prio;
1200 }
1201
1202 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1203 }
1204
1205 static void dec_rt_prio(struct rt_rq *rt_rq, int prio)
1206 {
1207 int prev_prio = rt_rq->highest_prio.curr;
1208
1209 if (rt_rq->rt_nr_running) {
1210 WARN_ON(prio < prev_prio);
1211
1212 /*
1213 * This may have been our highest task, and therefore
1214 * we may have some recomputation to do
1215 */
1216 if (prio == prev_prio) {
1217 struct rt_prio_array *array = &rt_rq->active;
1218
1219 rt_rq->highest_prio.curr = sched_find_first_bit(array->bitmap);
1220 }
1221 } else {
1222 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1223 }
1224
1225 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1226 }
1227
1228 #else
1229
1230 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio)
1231 {
1232 }
1233 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio)
1234 {
1235 }
1236
1237 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1238
1239 #ifdef CONFIG_RT_GROUP_SCHED
1240
1241 static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1242 {
1243 if (rt_se_boosted(rt_se)) {
1244 rt_rq->rt_nr_boosted++;
1245 }
1246
1247 if (rt_rq->tg) {
1248 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1249 }
1250 }
1251
1252 static void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1253 {
1254 if (rt_se_boosted(rt_se)) {
1255 rt_rq->rt_nr_boosted--;
1256 }
1257
1258 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1259 }
1260
1261 #else /* CONFIG_RT_GROUP_SCHED */
1262
1263 static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1264 {
1265 start_rt_bandwidth(&def_rt_bandwidth);
1266 }
1267
1268 static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1269 {
1270 }
1271
1272 #endif /* CONFIG_RT_GROUP_SCHED */
1273
1274 static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1275 {
1276 struct rt_rq *group_rq = group_rt_rq(rt_se);
1277
1278 if (group_rq) {
1279 return group_rq->rt_nr_running;
1280 } else {
1281 return 1;
1282 }
1283 }
1284
1285 static inline unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1286 {
1287 struct rt_rq *group_rq = group_rt_rq(rt_se);
1288 struct task_struct *tsk;
1289
1290 if (group_rq) {
1291 return group_rq->rr_nr_running;
1292 }
1293
1294 tsk = rt_task_of(rt_se);
1295
1296 return (tsk->policy == SCHED_RR) ? 1 : 0;
1297 }
1298
1299 static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1300 {
1301 int prio = rt_se_prio(rt_se);
1302
1303 WARN_ON(!rt_prio(prio));
1304 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1305 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1306
1307 inc_rt_prio(rt_rq, prio);
1308 inc_rt_migration(rt_se, rt_rq);
1309 inc_rt_group(rt_se, rt_rq);
1310 }
1311
1312 static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1313 {
1314 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1315 WARN_ON(!rt_rq->rt_nr_running);
1316 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1317 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1318
1319 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1320 dec_rt_migration(rt_se, rt_rq);
1321 dec_rt_group(rt_se, rt_rq);
1322 }
1323
1324 /*
1325 * Change rt_se->run_list location unless SAVE && !MOVE
1326 *
1327 * assumes ENQUEUE/DEQUEUE flags match
1328 */
1329 static inline bool move_entity(unsigned int flags)
1330 {
1331 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) {
1332 return false;
1333 }
1334
1335 return true;
1336 }
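/*
 * Interpretation sketch: the only combination that returns false is
 * DEQUEUE_SAVE without DEQUEUE_MOVE, i.e. a temporary dequeue (for example
 * around an attribute change) that intends to put the entity back exactly
 * where it was, so its position in run_list is preserved.
 */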
1337
1338 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1339 {
1340 list_del_init(&rt_se->run_list);
1341
1342 if (list_empty(array->queue + rt_se_prio(rt_se))) {
1343 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1344 }
1345
1346 rt_se->on_list = 0;
1347 }
1348
1349 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1350 {
1351 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1352 struct rt_prio_array *array = &rt_rq->active;
1353 struct rt_rq *group_rq = group_rt_rq(rt_se);
1354 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1355
1356 /*
1357 * Don't enqueue the group if it's throttled, or when empty.
1358 * The latter is a consequence of the former when a child group
1359 * gets throttled and the current group doesn't have any other
1360 * active members.
1361 */
1362 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1363 if (rt_se->on_list) {
1364 __delist_rt_entity(rt_se, array);
1365 }
1366 return;
1367 }
1368
1369 if (move_entity(flags)) {
1370 WARN_ON_ONCE(rt_se->on_list);
1371 if (flags & ENQUEUE_HEAD) {
1372 list_add(&rt_se->run_list, queue);
1373 } else {
1374 list_add_tail(&rt_se->run_list, queue);
1375 }
1376
1377 __set_bit(rt_se_prio(rt_se), array->bitmap);
1378 rt_se->on_list = 1;
1379 }
1380 rt_se->on_rq = 1;
1381
1382 inc_rt_tasks(rt_se, rt_rq);
1383 }
1384
1385 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1386 {
1387 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1388 struct rt_prio_array *array = &rt_rq->active;
1389
1390 if (move_entity(flags)) {
1391 WARN_ON_ONCE(!rt_se->on_list);
1392 __delist_rt_entity(rt_se, array);
1393 }
1394 rt_se->on_rq = 0;
1395
1396 dec_rt_tasks(rt_se, rt_rq);
1397 }
1398
1399 /*
1400 * Because the prio of an upper entry depends on the lower
1401 * entries, we must remove entries top-down.
1402 */
1403 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1404 {
1405 struct sched_rt_entity *back = NULL;
1406 unsigned int rt_nr_running;
1407
1408 cycle_each_sched_rt_entity(rt_se) {
1409 rt_se->back = back;
1410 back = rt_se;
1411 }
1412
1413 rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1414
1415 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1416 if (on_rt_rq(rt_se)) {
1417 __dequeue_rt_entity(rt_se, flags);
1418 }
1419 }
1420 dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1421 }
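/*
 * Illustration of the top-down removal (group scheduling case, an assumption
 * for this example): for a task inside a child group the back-pointer chain
 * built above runs
 *
 *   group rt_se (on the root rt_rq)  ->  task rt_se (on the group rt_rq)
 *
 * so the group entity is dequeued from the root queue first; only then is
 * the task removed from the group queue, keeping every parent's recorded
 * priority consistent with its remaining children.
 */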
1422
1423 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1424 {
1425 struct rq *rq = rq_of_rt_se(rt_se);
1426
1427 dequeue_rt_stack(rt_se, flags);
1428 cycle_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, flags);
1429 enqueue_top_rt_rq(&rq->rt);
1430 }
1431
1432 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1433 {
1434 struct rq *rq = rq_of_rt_se(rt_se);
1435
1436 dequeue_rt_stack(rt_se, flags);
1437
1438 cycle_each_sched_rt_entity(rt_se) {
1439 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1440
1441 if (rt_rq && rt_rq->rt_nr_running) {
1442 __enqueue_rt_entity(rt_se, flags);
1443 }
1444 }
1445 enqueue_top_rt_rq(&rq->rt);
1446 }
1447
1448 #ifdef CONFIG_SMP
1449 static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, bool sync)
1450 {
1451 /*
1452 * If the waker is CFS, then an RT sync wakeup would preempt the waker
1453 * and force it to run for a likely small time after the RT wakee is
1454 * done. So, only honor RT sync wakeups from RT wakers.
1455 */
1456 return sync && task_has_rt_policy(rq->curr) && p->prio <= rq->rt.highest_prio.next && rq->rt.rt_nr_running <= 0x2;
1457 }
1458 #else
1459 static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, bool sync)
1460 {
1461 return 0;
1462 }
1463 #endif
1464
1465 /*
1466 * Adding/removing a task to/from a priority array:
1467 */
1468 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1469 {
1470 struct sched_rt_entity *rt_se = &p->rt;
1471 bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
1472
1473 if (flags & ENQUEUE_WAKEUP) {
1474 rt_se->timeout = 0;
1475 }
1476
1477 enqueue_rt_entity(rt_se, flags);
1478 walt_inc_cumulative_runnable_avg(rq, p);
1479
1480 if (!task_current(rq, p) && p->nr_cpus_allowed > 1 && !should_honor_rt_sync(rq, p, sync)) {
1481 enqueue_pushable_task(rq, p);
1482 }
1483 }
1484
1485 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1486 {
1487 struct sched_rt_entity *rt_se = &p->rt;
1488
1489 update_curr_rt(rq);
1490 dequeue_rt_entity(rt_se, flags);
1491 walt_dec_cumulative_runnable_avg(rq, p);
1492
1493 dequeue_pushable_task(rq, p);
1494 }
1495
1496 /*
1497 * Put task to the head or the end of the run list without the overhead of
1498 * dequeue followed by enqueue.
1499 */
1500 static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1501 {
1502 if (on_rt_rq(rt_se)) {
1503 struct rt_prio_array *array = &rt_rq->active;
1504 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1505
1506 if (head) {
1507 list_move(&rt_se->run_list, queue);
1508 } else {
1509 list_move_tail(&rt_se->run_list, queue);
1510 }
1511 }
1512 }
1513
1514 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1515 {
1516 struct sched_rt_entity *rt_se = &p->rt;
1517 struct rt_rq *rt_rq;
1518
1519 cycle_each_sched_rt_entity(rt_se) {
1520 rt_rq = rt_rq_of_se(rt_se);
1521 requeue_rt_entity(rt_rq, rt_se, head);
1522 }
1523 }
1524
1525 static void yield_task_rt(struct rq *rq)
1526 {
1527 requeue_task_rt(rq, rq->curr, 0);
1528 }
1529
1530 #ifdef CONFIG_SMP
1531 static int find_lowest_rq(struct task_struct *task);
1532
1533 static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1534 {
1535 struct task_struct *curr;
1536 struct rq *rq;
1537 struct rq *this_cpu_rq;
1538 bool test;
1539 int target_cpu = -1;
1540 bool sync = !!(flags & WF_SYNC);
1541 int this_cpu;
1542
1543 /* For anything but wake ups, just return the task_cpu */
1544 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) {
1545 goto out;
1546 }
1547
1548 rq = cpu_rq(cpu);
1549
1550 rcu_read_lock();
1551 curr = READ_ONCE(rq->curr); /* unlocked access */
1552 this_cpu = smp_processor_id();
1553 this_cpu_rq = cpu_rq(this_cpu);
1554
1555 /*
1556 * If the current task on @p's runqueue is an RT task, then
1557 * try to see if we can wake this RT task up on another
1558 * runqueue. Otherwise simply start this RT task
1559 * on its current runqueue.
1560 *
1561 * We want to avoid overloading runqueues. If the woken
1562 * task is a higher priority, then it will stay on this CPU
1563 * and the lower prio task should be moved to another CPU.
1564 * Even though this will probably make the lower prio task
1565 * lose its cache, we do not want to bounce a higher priority task
1566 * around just because it gave up its CPU, perhaps for a
1567 * lock?
1568 *
1569 * For equal prio tasks, we just let the scheduler sort it out.
1570 *
1571 * Otherwise, just let it ride on the affined RQ and the
1572 * post-schedule router will push the preempted task away
1573 *
1574 * This test is optimistic, if we get it wrong the load-balancer
1575 * will have to sort it out.
1576 *
1577 * We take into account the capacity of the CPU to ensure it fits the
1578 * requirement of the task - which is only important on heterogeneous
1579 * systems like big.LITTLE.
1580 */
1581 test = curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 0x2 || curr->prio <= p->prio);
1582
1583 /*
1584 * Respect the sync flag as long as the task can run on this CPU.
1585 */
1586 if (should_honor_rt_sync(this_cpu_rq, p, sync) && cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
1587 cpu = this_cpu;
1588 goto out_unlock;
1589 }
1590
1591 if (test || !rt_task_fits_capacity(p, cpu)) {
1592 int target = find_lowest_rq(p);
1593 /*
1594 * Bail out if we were forcing a migration to find a better
1595 * fitting CPU but our search failed.
1596 */
1597 if (!test && target != -1 && !rt_task_fits_capacity(p, target)) {
1598 goto out_unlock;
1599 }
1600
1601 /*
1602 * Don't bother moving it if the destination CPU is
1603 * not running a lower priority task.
1604 */
1605 if (target != -1 && p->prio < cpu_rq(target)->rt.highest_prio.curr) {
1606 cpu = target;
1607 }
1608 }
1609
1610 out_unlock:
1611 rcu_read_unlock();
1612
1613 out:
1614 return cpu;
1615 }
1616
1617 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1618 {
1619 /*
1620 * Current can't be migrated, useless to reschedule,
1621 * let's hope p can move out.
1622 */
1623 if (rq->curr->nr_cpus_allowed == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) {
1624 return;
1625 }
1626
1627 /*
1628 * p is migratable, so let's not schedule it and
1629 * see if it is pushed or pulled somewhere else.
1630 */
1631 if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) {
1632 return;
1633 }
1634
1635 /*
1636 * There appear to be other CPUs that can accept
1637 * the current task but none can run 'p', so let's reschedule
1638 * to try and push the current task away:
1639 */
1640 requeue_task_rt(rq, p, 1);
1641 resched_curr(rq);
1642 }
1643
1644 static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1645 {
1646 if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1647 /*
1648 * This is OK, because current is on_cpu, which avoids it being
1649 * picked for load-balance and preemption/IRQs are still
1650 * disabled avoiding further scheduler activity on it and we've
1651 * not yet started the picking loop.
1652 */
1653 rq_unpin_lock(rq, rf);
1654 pull_rt_task(rq);
1655 rq_repin_lock(rq, rf);
1656 }
1657
1658 return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1659 }
1660 #endif /* CONFIG_SMP */
1661
1662 /*
1663 * Preempt the current task with a newly woken task if needed:
1664 */
1665 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1666 {
1667 if (p->prio < rq->curr->prio) {
1668 resched_curr(rq);
1669 return;
1670 }
1671
1672 #ifdef CONFIG_SMP
1673 /*
1674 * If:
1675 *
1676 * - the newly woken task is of equal priority to the current task
1677 * - the newly woken task is non-migratable while current is migratable
1678 * - current will be preempted on the next reschedule
1679 *
1680 * we should check to see if current can readily move to a different
1681 * cpu. If so, we will reschedule to allow the push logic to try
1682 * to move current somewhere else, making room for our non-migratable
1683 * task.
1684 */
1685 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) {
1686 check_preempt_equal_prio(rq, p);
1687 }
1688 #endif
1689 }
1690
1691 static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1692 {
1693 p->se.exec_start = rq_clock_task(rq);
1694
1695 /* The running task is never eligible for pushing */
1696 dequeue_pushable_task(rq, p);
1697
1698 if (!first) {
1699 return;
1700 }
1701
1702 /*
1703 * If prev task was rt, put_prev_task() has already updated the
1704 * utilization. We only care about the case where we start to schedule a
1705 * rt task
1706 */
1707 if (rq->curr->sched_class != &rt_sched_class) {
1708 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1709 }
1710
1711 rt_queue_push_tasks(rq);
1712 }
1713
1714 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq)
1715 {
1716 struct rt_prio_array *array = &rt_rq->active;
1717 struct sched_rt_entity *next = NULL;
1718 struct list_head *queue;
1719 int idx;
1720
1721 idx = sched_find_first_bit(array->bitmap);
1722 BUG_ON(idx >= MAX_RT_PRIO);
1723
1724 queue = array->queue + idx;
1725 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1726
1727 return next;
1728 }
1729
1730 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1731 {
1732 struct sched_rt_entity *rt_se;
1733 struct rt_rq *rt_rq = &rq->rt;
1734
1735 do {
1736 rt_se = pick_next_rt_entity(rq, rt_rq);
1737 BUG_ON(!rt_se);
1738 rt_rq = group_rt_rq(rt_se);
1739 } while (rt_rq);
1740
1741 return rt_task_of(rt_se);
1742 }
1743
1744 static struct task_struct *pick_next_task_rt(struct rq *rq)
1745 {
1746 struct task_struct *p;
1747
1748 if (!sched_rt_runnable(rq)) {
1749 return NULL;
1750 }
1751
1752 p = _pick_next_task_rt(rq);
1753 set_next_task_rt(rq, p, true);
1754 return p;
1755 }
1756
1757 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1758 {
1759 update_curr_rt(rq);
1760
1761 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1762
1763 /*
1764 * The previous task needs to be made eligible for pushing
1765 * if it is still active
1766 */
1767 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) {
1768 enqueue_pushable_task(rq, p);
1769 }
1770 }
1771
1772 #ifdef CONFIG_SMP
1773
1774 /* Only try algorithms three times */
1775 #define RT_MAX_TRIES 3
1776
1777 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1778 {
1779 if (!task_running(rq, p) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
1780 return 1;
1781 }
1782
1783 return 0;
1784 }
1785
1786 /*
1787 * Return the highest-priority pushable task of this rq that is suitable
1788 * to be executed on the CPU, or NULL otherwise
1789 */
1790 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1791 {
1792 struct plist_head *head = &rq->rt.pushable_tasks;
1793 struct task_struct *p;
1794
1795 if (!has_pushable_tasks(rq)) {
1796 return NULL;
1797 }
1798
1799 plist_for_each_entry(p, head, pushable_tasks)
1800 {
1801 if (pick_rt_task(rq, p, cpu)) {
1802 return p;
1803 }
1804 }
1805
1806 return NULL;
1807 }
1808
1809 #ifdef CONFIG_SCHED_RT_CAS
1810 static int find_cas_cpu(struct sched_domain *sd, struct task_struct *task, struct cpumask *lowest_mask)
1811 {
1812 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
1813 struct sched_group *sg = NULL;
1814 struct sched_group *sg_target = NULL;
1815 struct sched_group *sg_backup = NULL;
1816 struct cpumask search_cpu, backup_search_cpu;
1817 int cpu = -1;
1818 int target_cpu = -1;
1819 unsigned long cpu_capacity;
1820 unsigned long boosted_tutil = uclamp_task_util(task);
1821 unsigned long target_capacity = ULONG_MAX;
1822 unsigned long util;
1823 unsigned long target_cpu_util = ULONG_MAX;
1824 int prev_cpu = task_cpu(task);
1825 #ifdef CONFIG_SCHED_RTG
1826 struct cpumask *rtg_target = NULL;
1827 #endif
1828 bool boosted = uclamp_boosted(task);
1829
1830 if (!sysctl_sched_enable_rt_cas) {
1831 return -1;
1832 }
1833
1834 rcu_read_lock();
1835
1836 #ifdef CONFIG_SCHED_RTG
1837 rtg_target = find_rtg_target(task);
1838 #endif
1839
1840 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0));
1841 if (!sd) {
1842 rcu_read_unlock();
1843 return -1;
1844 }
1845
1846 sg = sd->groups;
1847 do {
1848 if (!cpumask_intersects(lowest_mask, sched_group_span(sg))) {
1849 continue;
1850 }
1851
1852 if (boosted) {
1853 if (cpumask_test_cpu(rd->max_cap_orig_cpu, sched_group_span(sg))) {
1854 sg_target = sg;
1855 break;
1856 }
1857 }
1858
1859 cpu = group_first_cpu(sg);
1860 #ifdef CONFIG_SCHED_RTG
1861 /* honor the rtg tasks */
1862 if (rtg_target) {
1863 if (cpumask_test_cpu(cpu, rtg_target)) {
1864 sg_target = sg;
1865 break;
1866 }
1867
1868 /* active LB or big_task favor cpus with more capacity */
1869 if (task->state == TASK_RUNNING || boosted) {
1870 if (capacity_orig_of(cpu) > capacity_orig_of(cpumask_any(rtg_target))) {
1871 sg_target = sg;
1872 break;
1873 }
1874
1875 sg_backup = sg;
1876 continue;
1877 }
1878 }
1879 #endif
1880 /*
1881 * 1. add margin to support task migration
1882 * 2. if task_util is higher than the capacity of all CPUs, make sure the
1883 * sg_backup with the most powerful cpus is selected
1884 */
1885 if (!rt_task_fits_capacity(task, cpu)) {
1886 sg_backup = sg;
1887 continue;
1888 }
1889
1890 /* support task boost */
1891 cpu_capacity = capacity_orig_of(cpu);
1892 if (boosted_tutil > cpu_capacity) {
1893 sg_backup = sg;
1894 continue;
1895 }
1896
1897 /* sg_target: select the sg with smaller capacity */
1898 if (cpu_capacity < target_capacity) {
1899 target_capacity = cpu_capacity;
1900 sg_target = sg;
1901 }
1902 } while (sg = sg->next, sg != sd->groups);
1903
1904 if (!sg_target) {
1905 sg_target = sg_backup;
1906 }
1907
1908 if (sg_target) {
1909 cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target));
1910 cpumask_copy(&backup_search_cpu, lowest_mask);
1911 cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu);
1912 } else {
1913 cpumask_copy(&search_cpu, lowest_mask);
1914 cpumask_clear(&backup_search_cpu);
1915 }
1916
1917 retry:
1918 cpu = cpumask_first(&search_cpu);
1919 do {
1920 trace_sched_find_cas_cpu_each(task, cpu, target_cpu, cpu_isolated(cpu), idle_cpu(cpu), boosted_tutil,
1921 cpu_util(cpu), capacity_orig_of(cpu));
1922
1923 if (cpu_isolated(cpu)) {
1924 continue;
1925 }
1926
1927 if (!cpumask_test_cpu(cpu, task->cpus_ptr)) {
1928 continue;
1929 }
1930
1931 /* find best cpu with smallest max_capacity */
1932 if (target_cpu != -1 && capacity_orig_of(cpu) > capacity_orig_of(target_cpu)) {
1933 continue;
1934 }
1935
1936 util = cpu_util(cpu);
1937 /* Find the least loaded CPU */
1938 if (util > target_cpu_util) {
1939 continue;
1940 }
1941
1942 /*
1943 * If the previous CPU has the same load, keep it as
1944 * target_cpu
1945 */
1946 if (target_cpu_util == util && target_cpu == prev_cpu) {
1947 continue;
1948 }
1949
1950 /*
1951 * If the candidate CPU is the previous CPU, select it.
1952 * If all of the above conditions are the same, select the
1953 * CPU with the least cumulative window demand.
1954 */
1955 target_cpu_util = util;
1956 target_cpu = cpu;
1957 } while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids);
1958
1959 if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) {
1960 goto done;
1961 } else if (!cpumask_empty(&backup_search_cpu)) {
1962 cpumask_copy(&search_cpu, &backup_search_cpu);
1963 cpumask_clear(&backup_search_cpu);
1964 goto retry;
1965 }
1966
1967 done:
1968 trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu);
1969 rcu_read_unlock();
1970 return target_cpu;
1971 }
1972 #endif
1973
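/*
 * Illustrative sketch (not built; assumes a toy capacity/utilization model
 * rather than the kernel's sched_group/cpumask machinery) of the selection
 * idea in find_cas_cpu() above: prefer a CPU whose original capacity fits the
 * task's boosted utilization and, among those, the smallest-capacity and then
 * the least-utilized one.
 */
#if 0
struct cas_cand {
	int cpu;
	unsigned long cap;	/* capacity_orig_of()-like value */
	unsigned long util;	/* cpu_util()-like value */
};

static int pick_cas_like_cpu(const struct cas_cand *c, int n,
			     unsigned long task_util)
{
	int best = -1;

	for (int i = 0; i < n; i++) {
		if (c[i].cap < task_util)
			continue;	/* does not fit; skipped in this sketch */
		if (best < 0 || c[i].cap < c[best].cap ||
		    (c[i].cap == c[best].cap && c[i].util < c[best].util))
			best = i;
	}
	return best < 0 ? -1 : c[best].cpu;
}
#endif
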
1974 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1975
1976 static int find_lowest_rq(struct task_struct *task)
1977 {
1978 struct sched_domain *sd = NULL; /* read by the CAS path below before being assigned */
1979 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1980 int this_cpu = smp_processor_id();
1981 int cpu = task_cpu(task);
1982 int ret;
1983 #ifdef CONFIG_SCHED_RT_CAS
1984 int cas_cpu;
1985 #endif
1986
1987 /* Make sure the mask is initialized first */
1988 if (unlikely(!lowest_mask)) {
1989 return -1;
1990 }
1991
1992 if (task->nr_cpus_allowed == 1) {
1993 return -1; /* No other targets possible */
1994 }
1995
1996 /*
1997 * If we're on asym system ensure we consider the different capacities
1998 * of the CPUs when searching for the lowest_mask.
1999 */
2000 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
2001 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity);
2002 } else {
2003 ret = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
2004 }
2005
2006 if (!ret) {
2007 return -1; /* No targets found */
2008 }
2009
2010 #ifdef CONFIG_SCHED_RT_CAS
2011 cas_cpu = find_cas_cpu(sd, task, lowest_mask);
2012 if (cas_cpu != -1) {
2013 return cas_cpu;
2014 }
2015 #endif
2016
2017 /*
2018 * At this point we have built a mask of CPUs representing the
2019 * lowest priority tasks in the system. Now we want to elect
2020 * the best one based on our affinity and topology.
2021 *
2022 * We prioritize the last CPU that the task executed on since
2023 * it is most likely cache-hot in that location.
2024 */
2025 if (cpumask_test_cpu(cpu, lowest_mask)) {
2026 return cpu;
2027 }
2028
2029 /*
2030 * Otherwise, we consult the sched_domains span maps to figure
2031 * out which CPU is logically closest to our hot cache data.
2032 */
2033 if (!cpumask_test_cpu(this_cpu, lowest_mask)) {
2034 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
2035 }
2036
2037 rcu_read_lock();
2038 for_each_domain(cpu, sd)
2039 {
2040 if (sd->flags & SD_WAKE_AFFINE) {
2041 int best_cpu;
2042
2043 /*
2044 * "this_cpu" is cheaper to preempt than a
2045 * remote processor.
2046 */
2047 if (this_cpu != -1 && cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
2048 rcu_read_unlock();
2049 return this_cpu;
2050 }
2051
2052 best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd));
2053 if (best_cpu < nr_cpu_ids) {
2054 rcu_read_unlock();
2055 return best_cpu;
2056 }
2057 }
2058 }
2059 rcu_read_unlock();
2060
2061 /*
2062 * And finally, if there were no matches within the domains
2063 * just give the caller *something* to work with from the compatible
2064 * locations.
2065 */
2066 if (this_cpu != -1) {
2067 return this_cpu;
2068 }
2069
2070 cpu = cpumask_any(lowest_mask);
2071 if (cpu < nr_cpu_ids) {
2072 return cpu;
2073 }
2074
2075 return -1;
2076 }
2077
2078 static struct task_struct *pick_next_pushable_task(struct rq *rq)
2079 {
2080 struct task_struct *p;
2081
2082 if (!has_pushable_tasks(rq)) {
2083 return NULL;
2084 }
2085
2086 p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks);
2087
2088 BUG_ON(rq->cpu != task_cpu(p));
2089 BUG_ON(task_current(rq, p));
2090 BUG_ON(p->nr_cpus_allowed <= 1);
2091
2092 BUG_ON(!task_on_rq_queued(p));
2093 BUG_ON(!rt_task(p));
2094
2095 return p;
2096 }
2097
2098 /* Will lock the rq it finds */
2099 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
2100 {
2101 struct rq *lowest_rq = NULL;
2102 int tries;
2103 int cpu;
2104
2105 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
2106 cpu = find_lowest_rq(task);
2107 if ((cpu == -1) || (cpu == rq->cpu)) {
2108 break;
2109 }
2110
2111 lowest_rq = cpu_rq(cpu);
2112 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
2113 /*
2114 * Target rq has tasks of equal or higher priority,
2115 * retrying does not release any lock and is unlikely
2116 * to yield a different result.
2117 */
2118 lowest_rq = NULL;
2119 break;
2120 }
2121
2122 /* if the prio of this runqueue changed, try again */
2123 if (double_lock_balance(rq, lowest_rq)) {
2124 /*
2125 * We had to unlock the run queue. In the meantime,
2126 * the task could have migrated already or had its
2127 * affinity changed.
2128 */
2129 struct task_struct *next_task = pick_next_pushable_task(rq);
2130 if (unlikely(next_task != task || !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr))) {
2131 double_unlock_balance(rq, lowest_rq);
2132 lowest_rq = NULL;
2133 break;
2134 }
2135 }
2136
2137 /* If this rq is still suitable use it. */
2138 if (lowest_rq->rt.highest_prio.curr > task->prio) {
2139 break;
2140 }
2141
2142 /* try again */
2143 double_unlock_balance(rq, lowest_rq);
2144 lowest_rq = NULL;
2145 }
2146
2147 return lowest_rq;
2148 }
2149
2150 /*
2151 * If the current CPU has more than one RT task, see if the
2152 * non-running task can migrate over to a CPU that is running a task
2153 * of lesser priority.
2154 */
2155 static int push_rt_task(struct rq *rq)
2156 {
2157 struct task_struct *next_task;
2158 struct rq *lowest_rq;
2159 int ret = 0;
2160
2161 if (!rq->rt.overloaded) {
2162 return 0;
2163 }
2164
2165 next_task = pick_next_pushable_task(rq);
2166 if (!next_task) {
2167 return 0;
2168 }
2169
2170 retry:
2171 if (WARN_ON(next_task == rq->curr)) {
2172 return 0;
2173 }
2174
2175 /*
2176 * It's possible that the next_task slipped in with a
2177 * higher priority than current. If that's the case
2178 * just reschedule current.
2179 */
2180 if (unlikely(next_task->prio < rq->curr->prio)) {
2181 resched_curr(rq);
2182 return 0;
2183 }
2184
2185 /* We might release rq lock */
2186 get_task_struct(next_task);
2187
2188 /* find_lock_lowest_rq locks the rq if found */
2189 lowest_rq = find_lock_lowest_rq(next_task, rq);
2190 if (!lowest_rq) {
2191 struct task_struct *task;
2192 /*
2193 * find_lock_lowest_rq releases rq->lock
2194 * so it is possible that next_task has migrated.
2195 *
2196 * We need to make sure that the task is still on the same
2197 * run-queue and is also still the next task eligible for
2198 * pushing.
2199 */
2200 task = pick_next_pushable_task(rq);
2201 if (task == next_task) {
2202 /*
2203 * The task hasn't migrated, and is still the next
2204 * eligible task, but we failed to find a run-queue
2205 * to push it to. Do not retry in this case, since
2206 * other CPUs will pull from us when ready.
2207 */
2208 goto out;
2209 }
2210
2211 if (!task) {
2212 /* No more tasks, just exit */
2213 goto out;
2214 }
2215
2216 /*
2217 * Something has shifted, try again.
2218 */
2219 put_task_struct(next_task);
2220 next_task = task;
2221 goto retry;
2222 }
2223
2224 deactivate_task(rq, next_task, 0);
2225 set_task_cpu(next_task, lowest_rq->cpu);
2226 activate_task(lowest_rq, next_task, 0);
2227 ret = 1;
2228
2229 resched_curr(lowest_rq);
2230
2231 double_unlock_balance(rq, lowest_rq);
2232
2233 out:
2234 put_task_struct(next_task);
2235
2236 return ret;
2237 }
2238
2239 static void push_rt_tasks(struct rq *rq)
2240 {
2241 /* push_rt_task will return true if it moved an RT task */
2242 while (push_rt_task(rq)) {
2243 ;
2244 }
2245 }
2246
2247 #ifdef HAVE_RT_PUSH_IPI
2248
2249 /*
2250 * When a high priority task schedules out from a CPU and a lower priority
2251 * task is scheduled in, a check is made to see if there's any RT tasks
2252 * on other CPUs that are waiting to run because a higher priority RT task
2253 * is currently running on its CPU. In this case, the CPU with multiple RT
2254 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2255 * up that may be able to run one of its non-running queued RT tasks.
2256 *
2257 * All CPUs with overloaded RT tasks need to be notified as there is currently
2258 * no way to know which of these CPUs have the highest priority task waiting
2259 * to run. Instead of trying to take a spinlock on each of these CPUs,
2260 * which has been shown to cause large latency when done on machines with many
2261 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
2262 * RT tasks waiting to run.
2263 *
2264 * Just sending an IPI to each of the CPUs is also an issue, as on large
2265 * count CPU machines, this can cause an IPI storm on a CPU, especially
2266 * if it's the only CPU with multiple RT tasks queued, and a large number
2267 * of CPUs scheduling a lower priority task at the same time.
2268 *
2269 * Each root domain has its own irq work function that can iterate over
2270 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2271 * tasks must be checked if there's one or many CPUs that are lowering
2272 * their priority, there's a single irq work iterator that will try to
2273 * push off RT tasks that are waiting to run.
2274 *
2275 * When a CPU schedules a lower priority task, it will kick off the
2276 * irq work iterator that will jump to each CPU with overloaded RT tasks.
2277 * As it only takes the first CPU that schedules a lower priority task
2278 * to start the process, the rto_start variable is incremented and if
2279 * the atomic result is one, then that CPU will try to take the rto_lock.
2280 * This prevents high contention on the lock as the process handles all
2281 * CPUs scheduling lower priority tasks.
2282 *
2283 * All CPUs that are scheduling a lower priority task will increment the
2284 * rt_loop_next variable. This will make sure that the irq work iterator
2285 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2286 * priority task, even if the iterator is in the middle of a scan. Incrementing
2287 * the rt_loop_next will cause the iterator to perform another scan.
2288 *
2289 */
2290 static int rto_next_cpu(struct root_domain *rd)
2291 {
2292 int next;
2293 int cpu;
2294
2295 /*
2296 * When starting the IPI RT pushing, the rto_cpu is set to -1,
2297 * rto_next_cpu() will simply return the first CPU found in
2298 * the rto_mask.
2299 *
2300 * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
2301 * will return the next CPU found in the rto_mask.
2302 *
2303 * If there are no more CPUs left in the rto_mask, then a check is made
2304 * against rto_loop and rto_loop_next. rto_loop is only updated with
2305 * the rto_lock held, but any CPU may increment the rto_loop_next
2306 * without any locking.
2307 */
2308 for (;;) {
2309 /* When rto_cpu is -1 this acts like cpumask_first() */
2310 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2311
2312 rd->rto_cpu = cpu;
2313
2314 if (cpu < nr_cpu_ids) {
2315 return cpu;
2316 }
2317
2318 rd->rto_cpu = -1;
2319
2320 /*
2321 * ACQUIRE ensures we see the @rto_mask changes
2322 * made prior to the @next value observed.
2323 *
2324 * Matches WMB in rt_set_overload().
2325 */
2326 next = atomic_read_acquire(&rd->rto_loop_next);
2327 if (rd->rto_loop == next) {
2328 break;
2329 }
2330
2331 rd->rto_loop = next;
2332 }
2333
2334 return -1;
2335 }
2336
2337 static inline bool rto_start_trylock(atomic_t *v)
2338 {
2339 return !atomic_cmpxchg_acquire(v, 0, 1);
2340 }
2341
2342 static inline void rto_start_unlock(atomic_t *v)
2343 {
2344 atomic_set_release(v, 0);
2345 }
2346
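/*
 * Illustrative userspace sketch (not kernel code) of the start/stop idiom
 * used by rto_start_trylock()/rto_start_unlock() above: only the first caller
 * that flips the flag from 0 to 1 owns the IPI loop, later callers bail out,
 * and the owner releases it with a store-release.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int loop_start;

static bool loop_start_trylock(atomic_int *v)
{
	int expected = 0;

	/* Succeeds only for the first caller that still observes 0. */
	return atomic_compare_exchange_strong_explicit(v, &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void loop_start_unlock(atomic_int *v)
{
	atomic_store_explicit(v, 0, memory_order_release);
}

static void kick_loop_once(void)
{
	if (!loop_start_trylock(&loop_start))
		return;		/* another caller already owns the loop */
	/* ... walk the overloaded CPUs here ... */
	loop_start_unlock(&loop_start);
}
#endif
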
2347 static void tell_cpu_to_push(struct rq *rq)
2348 {
2349 int cpu = -1;
2350
2351 /* Keep the loop going if the IPI is currently active */
2352 atomic_inc(&rq->rd->rto_loop_next);
2353
2354 /* Only one CPU can initiate a loop at a time */
2355 if (!rto_start_trylock(&rq->rd->rto_loop_start)) {
2356 return;
2357 }
2358
2359 raw_spin_lock(&rq->rd->rto_lock);
2360
2361 /*
2362 * The rto_cpu is updated under the lock; if it has a valid CPU
2363 * then the IPI is still running and will continue due to the
2364 * update to loop_next, and nothing needs to be done here.
2365 * Otherwise it is finishing up and an IPI needs to be sent.
2366 */
2367 if (rq->rd->rto_cpu < 0) {
2368 cpu = rto_next_cpu(rq->rd);
2369 }
2370
2371 raw_spin_unlock(&rq->rd->rto_lock);
2372
2373 rto_start_unlock(&rq->rd->rto_loop_start);
2374
2375 if (cpu >= 0) {
2376 /* Make sure the rd does not get freed while pushing */
2377 sched_get_rd(rq->rd);
2378 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2379 }
2380 }
2381
2382 /* Called from hardirq context */
2383 void rto_push_irq_work_func(struct irq_work *work)
2384 {
2385 struct root_domain *rd = container_of(work, struct root_domain, rto_push_work);
2386 struct rq *rq;
2387 int cpu;
2388
2389 rq = this_rq();
2390 /*
2391 * We do not need to grab the lock to check for has_pushable_tasks.
2392 * When it gets updated, a check is made if a push is possible.
2393 */
2394 if (has_pushable_tasks(rq)) {
2395 raw_spin_lock(&rq->lock);
2396 push_rt_tasks(rq);
2397 raw_spin_unlock(&rq->lock);
2398 }
2399
2400 raw_spin_lock(&rd->rto_lock);
2401
2402 /* Pass the IPI to the next rt overloaded queue */
2403 cpu = rto_next_cpu(rd);
2404
2405 raw_spin_unlock(&rd->rto_lock);
2406
2407 if (cpu < 0) {
2408 sched_put_rd(rd);
2409 return;
2410 }
2411
2412 /* Try the next RT overloaded CPU */
2413 irq_work_queue_on(&rd->rto_push_work, cpu);
2414 }
2415 #endif /* HAVE_RT_PUSH_IPI */
2416
2417 static void pull_rt_task(struct rq *this_rq)
2418 {
2419 int this_cpu = this_rq->cpu, cpu;
2420 bool resched = false;
2421 struct task_struct *p;
2422 struct rq *src_rq;
2423 int rt_overload_count = rt_overloaded(this_rq);
2424 if (likely(!rt_overload_count)) {
2425 return;
2426 }
2427
2428 /*
2429 * Match the barrier from rt_set_overload(); this guarantees that if we
2430 * see overloaded we must also see the rto_mask bit.
2431 */
2432 smp_rmb();
2433
2434 /* If we are the only overloaded CPU do nothing */
2435 if (rt_overload_count == 1 && cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) {
2436 return;
2437 }
2438
2439 #ifdef HAVE_RT_PUSH_IPI
2440 if (sched_feat(RT_PUSH_IPI)) {
2441 tell_cpu_to_push(this_rq);
2442 return;
2443 }
2444 #endif
2445
2446 for_each_cpu(cpu, this_rq->rd->rto_mask)
2447 {
2448 if (this_cpu == cpu) {
2449 continue;
2450 }
2451
2452 src_rq = cpu_rq(cpu);
2453 /*
2454 * Don't bother taking the src_rq->lock if the next highest
2455 * task is known to be lower-priority than our current task.
2456 * This may look racy, but if this value is about to go
2457 * logically higher, the src_rq will push this task away.
2458 * And if it's going logically lower, we do not care.
2459 */
2460 if (src_rq->rt.highest_prio.next >= this_rq->rt.highest_prio.curr) {
2461 continue;
2462 }
2463
2464 /*
2465 * We can potentially drop this_rq's lock in
2466 * double_lock_balance, and another CPU could
2467 * alter this_rq
2468 */
2469 double_lock_balance(this_rq, src_rq);
2470
2471 /*
2472 * We can pull only a task, which is pushable
2473 * on its rq, and no others.
2474 */
2475 p = pick_highest_pushable_task(src_rq, this_cpu);
2476 /*
2477 * Do we have an RT task that preempts
2478 * the to-be-scheduled task?
2479 */
2480 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2481 WARN_ON(p == src_rq->curr);
2482 WARN_ON(!task_on_rq_queued(p));
2483
2484 /*
2485 * There's a chance that p is higher in priority
2486 * than what's currently running on its CPU.
2487 * This is just that p is waking up and hasn't
2488 * had a chance to schedule. We only pull
2489 * p if it is lower in priority than the
2490 * current task on the run queue.
2491 */
2492 if (p->prio < src_rq->curr->prio) {
2493 goto skip;
2494 }
2495
2496 resched = true;
2497
2498 deactivate_task(src_rq, p, 0);
2499 set_task_cpu(p, this_cpu);
2500 activate_task(this_rq, p, 0);
2501 /*
2502 * We continue with the search, just in
2503 * case there's an even higher prio task
2504 * in another runqueue. (low likelihood
2505 * but possible)
2506 */
2507 }
2508 skip:
2509 double_unlock_balance(this_rq, src_rq);
2510 }
2511
2512 if (resched) {
2513 resched_curr(this_rq);
2514 }
2515 }
2516
2517 /*
2518 * If we are not running and we are not going to reschedule soon, we should
2519 * try to push tasks away now
2520 */
2521 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2522 {
2523 bool need_to_push = !task_running(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 &&
2524 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2525 (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio);
2526 if (need_to_push) {
2527 push_rt_tasks(rq);
2528 }
2529 }
2530
2531 /* Assumes rq->lock is held */
2532 static void rq_online_rt(struct rq *rq)
2533 {
2534 if (rq->rt.overloaded) {
2535 rt_set_overload(rq);
2536 }
2537
2538 __enable_runtime(rq);
2539
2540 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2541 }
2542
2543 /* Assumes rq->lock is held */
2544 static void rq_offline_rt(struct rq *rq)
2545 {
2546 if (rq->rt.overloaded) {
2547 rt_clear_overload(rq);
2548 }
2549
2550 __disable_runtime(rq);
2551
2552 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2553 }
2554
2555 /*
2556 * When switching from the rt queue, we bring ourselves to a position
2557 * that we might want to pull RT tasks from other runqueues.
2558 */
2559 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2560 {
2561 /*
2562 * If there are other RT tasks then we will reschedule
2563 * and the scheduling of the other RT tasks will handle
2564 * the balancing. But if we are the last RT task
2565 * we may need to handle the pulling of RT tasks
2566 * now.
2567 */
2568 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || cpu_isolated(cpu_of(rq))) {
2569 return;
2570 }
2571
2572 rt_queue_pull_task(rq);
2573 }
2574
2575 void __init init_sched_rt_class(void)
2576 {
2577 unsigned int i;
2578
2579 for_each_possible_cpu(i)
2580 {
2581 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i));
2582 }
2583 }
2584 #endif /* CONFIG_SMP */
2585
2586 /*
2587 * When switching a task to RT, we may overload the runqueue
2588 * with RT tasks. In this case we try to push them off to
2589 * other runqueues.
2590 */
2591 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2592 {
2593 /*
2594 * If we are running, update the avg_rt tracking, as the running time
2595 * will from now on be accounted to it.
2596 */
2597 if (task_current(rq, p)) {
2598 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2599 return;
2600 }
2601
2602 /*
2603 * If we are not running we may need to preempt the current
2604 * running task. If that current running task is also an RT task
2605 * then see if we can move to another run queue.
2606 */
2607 if (task_on_rq_queued(p)) {
2608 #ifdef CONFIG_SMP
2609 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) {
2610 rt_queue_push_tasks(rq);
2611 }
2612 #endif /* CONFIG_SMP */
2613 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) {
2614 resched_curr(rq);
2615 }
2616 }
2617 }
2618
2619 /*
2620 * Priority of the task has changed. This may cause
2621 * us to initiate a push or pull.
2622 */
2623 static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2624 {
2625 if (!task_on_rq_queued(p)) {
2626 return;
2627 }
2628
2629 if (rq->curr == p) {
2630 #ifdef CONFIG_SMP
2631 /*
2632 * If our priority decreases while running, we
2633 * may need to pull tasks to this runqueue.
2634 */
2635 if (oldprio < p->prio) {
2636 rt_queue_pull_task(rq);
2637 }
2638
2639 /*
2640 * If there's a higher priority task waiting to run
2641 * then reschedule.
2642 */
2643 if (p->prio > rq->rt.highest_prio.curr) {
2644 resched_curr(rq);
2645 }
2646 #else
2647 /* For UP simply resched on drop of prio */
2648 if (oldprio < p->prio) {
2649 resched_curr(rq);
2650 }
2651 #endif /* CONFIG_SMP */
2652 } else {
2653 /*
2654 * This task is not running, but if its priority is
2655 * higher than that of the current running task,
2656 * then reschedule.
2657 */
2658 if (p->prio < rq->curr->prio) {
2659 resched_curr(rq);
2660 }
2661 }
2662 }
2663
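/*
 * Illustrative userspace sketch (not kernel code): prio_changed_rt() above is
 * reached, via the core scheduler, when an RT thread's priority is changed at
 * run time, for example with pthread_setschedprio(). Priority 20 is an
 * arbitrary example value.
 */
#if 0
#include <pthread.h>

static int raise_rt_priority(pthread_t thread)
{
	/* The thread must already run under SCHED_FIFO or SCHED_RR. */
	return pthread_setschedprio(thread, 20);
}
#endif
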
2664 #ifdef CONFIG_POSIX_TIMERS
2665 static void watchdog(struct rq *rq, struct task_struct *p)
2666 {
2667 unsigned long soft, hard;
2668
2669 /* max may change after cur was read, this will be fixed next tick */
2670 soft = task_rlimit(p, RLIMIT_RTTIME);
2671 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2672
2673 if (soft != RLIM_INFINITY) {
2674 unsigned long next;
2675
2676 if (p->rt.watchdog_stamp != jiffies) {
2677 p->rt.timeout++;
2678 p->rt.watchdog_stamp = jiffies;
2679 }
2680
2681 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC / HZ);
2682 if (p->rt.timeout > next) {
2683 posix_cputimers_rt_watchdog(&p->posix_cputimers, p->se.sum_exec_runtime);
2684 }
2685 }
2686 }
2687 #else
2688 static inline void watchdog(struct rq *rq, struct task_struct *p)
2689 {
2690 }
2691 #endif
2692
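/*
 * Illustrative userspace sketch (not kernel code): the watchdog above counts
 * ticks against RLIMIT_RTTIME (microseconds of RT CPU time consumed without
 * blocking) and arms the POSIX CPU-timer machinery once the limit is
 * exceeded. The limit values below are arbitrary examples.
 */
#if 0
#include <sys/resource.h>

static int set_rt_watchdog_limits(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft limit: 500ms, delivers SIGXCPU */
		.rlim_max = 1000000,	/* hard limit: 1s */
	};

	return setrlimit(RLIMIT_RTTIME, &rl);
}
#endif
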
2693 /*
2694 * scheduler tick hitting a task of our scheduling class.
2695 *
2696 * NOTE: This function can be called remotely by the tick offload that
2697 * goes along full dynticks. Therefore no local assumption can be made
2698 * and everything must be accessed through the @rq and @curr passed in
2699 * parameters.
2700 */
2701 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2702 {
2703 struct sched_rt_entity *rt_se = &p->rt;
2704
2705 update_curr_rt(rq);
2706 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2707
2708 watchdog(rq, p);
2709
2710 /*
2711 * RR tasks need a special form of timeslice management.
2712 * FIFO tasks have no timeslices.
2713 */
2714 if (p->policy != SCHED_RR) {
2715 return;
2716 }
2717
2718 if (--p->rt.time_slice) {
2719 return;
2720 }
2721
2722 p->rt.time_slice = sched_rr_timeslice;
2723
2724 /*
2725 * Requeue to the end of queue if we (and all of our ancestors) are not
2726 * the only element on the queue
2727 */
2728 cycle_each_sched_rt_entity(rt_se) {
2729 if (rt_se->run_list.prev != rt_se->run_list.next) {
2730 requeue_task_rt(rq, p, 0);
2731 resched_curr(rq);
2732 return;
2733 }
2734 }
2735 }
2736
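/*
 * Illustrative userspace sketch (not kernel code): a thread only gets the RR
 * timeslice handling in task_tick_rt() above after switching to SCHED_RR;
 * SCHED_FIFO threads skip it. Priority 10 is an arbitrary example value.
 */
#if 0
#include <sched.h>

static int become_sched_rr(void)
{
	struct sched_param param = { .sched_priority = 10 };

	/* pid 0 means the calling thread; needs CAP_SYS_NICE or RLIMIT_RTPRIO. */
	return sched_setscheduler(0, SCHED_RR, &param);
}
#endif
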
2737 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
2738 static int rt_active_load_balance_cpu_stop(void *data)
2739 {
2740 struct rq *busiest_rq = data;
2741 struct task_struct *next_task = busiest_rq->rt_push_task;
2742 struct rq *lowest_rq = NULL;
2743 unsigned long flags;
2744
2745 raw_spin_lock_irqsave(&busiest_rq->lock, flags);
2746 busiest_rq->rt_active_balance = 0;
2747
2748 /* find_lock_lowest_rq locks the rq if found */
2749 lowest_rq = find_lock_lowest_rq(next_task, busiest_rq);
2750 if (!lowest_rq) {
2751 goto out;
2752 }
2753
2754 if (capacity_orig_of(cpu_of(lowest_rq)) <= capacity_orig_of(task_cpu(next_task))) {
2755 goto unlock;
2756 }
2757
2758 deactivate_task(busiest_rq, next_task, 0);
2759 set_task_cpu(next_task, lowest_rq->cpu);
2760 activate_task(lowest_rq, next_task, 0);
2761
2762 resched_curr(lowest_rq);
2763 unlock:
2764 double_unlock_balance(busiest_rq, lowest_rq);
2765 out:
2766 put_task_struct(next_task);
2767 raw_spin_unlock_irqrestore(&busiest_rq->lock, flags);
2768
2769 return 0;
2770 }
2771
2772 static void check_for_migration_rt(struct rq *rq, struct task_struct *p)
2773 {
2774 bool need_active_lb = false;
2775 bool misfit_task = false;
2776 int cpu = task_cpu(p);
2777 unsigned long cpu_orig_cap;
2778 #ifdef CONFIG_SCHED_RTG
2779 struct cpumask *rtg_target = NULL;
2780 #endif
2781
2782 if (!sysctl_sched_enable_rt_active_lb) {
2783 return;
2784 }
2785
2786 if (p->nr_cpus_allowed == 1) {
2787 return;
2788 }
2789
2790 cpu_orig_cap = capacity_orig_of(cpu);
2791 /* cpu has max capacity, no need to do balance */
2792 if (cpu_orig_cap == rq->rd->max_cpu_capacity) {
2793 return;
2794 }
2795
2796 #ifdef CONFIG_SCHED_RTG
2797 rtg_target = find_rtg_target(p);
2798 if (rtg_target) {
2799 misfit_task = capacity_orig_of(cpumask_first(rtg_target)) > cpu_orig_cap;
2800 } else {
2801 misfit_task = !rt_task_fits_capacity(p, cpu);
2802 }
2803 #else
2804 misfit_task = !rt_task_fits_capacity(p, cpu);
2805 #endif
2806 if (misfit_task) {
2807 raw_spin_lock(&rq->lock);
2808 if (!rq->active_balance && !rq->rt_active_balance) {
2809 rq->rt_active_balance = 1;
2810 rq->rt_push_task = p;
2811 get_task_struct(p);
2812 need_active_lb = true;
2813 }
2814 raw_spin_unlock(&rq->lock);
2815
2816 if (need_active_lb) {
2817 stop_one_cpu_nowait(task_cpu(p), rt_active_load_balance_cpu_stop, rq, &rq->rt_active_balance_work);
2818 }
2819 }
2820 }
2821 #endif
2822
2823 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2824 {
2825 /*
2826 * Time slice is 0 for SCHED_FIFO tasks
2827 */
2828 if (task->policy == SCHED_RR) {
2829 return sched_rr_timeslice;
2830 } else {
2831 return 0;
2832 }
2833 }
2834
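/*
 * Illustrative userspace sketch (not kernel code): the value computed by
 * get_rr_interval_rt() above is what userspace observes through
 * sched_rr_get_interval(); a SCHED_FIFO caller reads back a zero interval.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

static void show_rr_interval(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)	/* pid 0: calling thread */
		printf("RR timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
}
#endif
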
2835 const struct sched_class rt_sched_class __section("__rt_sched_class") = {
2836 .enqueue_task = enqueue_task_rt,
2837 .dequeue_task = dequeue_task_rt,
2838 .yield_task = yield_task_rt,
2839
2840 .check_preempt_curr = check_preempt_curr_rt,
2841
2842 .pick_next_task = pick_next_task_rt,
2843 .put_prev_task = put_prev_task_rt,
2844 .set_next_task = set_next_task_rt,
2845
2846 #ifdef CONFIG_SMP
2847 .balance = balance_rt,
2848 .select_task_rq = select_task_rq_rt,
2849 .set_cpus_allowed = set_cpus_allowed_common,
2850 .rq_online = rq_online_rt,
2851 .rq_offline = rq_offline_rt,
2852 .task_woken = task_woken_rt,
2853 .switched_from = switched_from_rt,
2854 #endif
2855
2856 .task_tick = task_tick_rt,
2857
2858 .get_rr_interval = get_rr_interval_rt,
2859
2860 .prio_changed = prio_changed_rt,
2861 .switched_to = switched_to_rt,
2862
2863 .update_curr = update_curr_rt,
2864
2865 #ifdef CONFIG_UCLAMP_TASK
2866 .uclamp_enabled = 1,
2867 #endif
2868 #ifdef CONFIG_SCHED_WALT
2869 .fixup_walt_sched_stats = fixup_walt_sched_stats_common,
2870 #endif
2871 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
2872 .check_for_migration = check_for_migration_rt,
2873 #endif
2874 };
2875
2876 #ifdef CONFIG_RT_GROUP_SCHED
2877 /*
2878 * Ensure that the real time constraints are schedulable.
2879 */
2880 static DEFINE_MUTEX(rt_constraints_mutex);
2881
2882 static inline int tg_has_rt_tasks(struct task_group *tg)
2883 {
2884 struct task_struct *task;
2885 struct css_task_iter it;
2886 int ret = 0;
2887
2888 /*
2889 * Autogroups do not have RT tasks; see autogroup_create().
2890 */
2891 if (task_group_is_autogroup(tg)) {
2892 return 0;
2893 }
2894
2895 css_task_iter_start(&tg->css, 0, &it);
2896 while (!ret && (task = css_task_iter_next(&it))) {
2897 ret |= rt_task(task);
2898 }
2899 css_task_iter_end(&it);
2900
2901 return ret;
2902 }
2903
2904 struct rt_schedulable_data {
2905 struct task_group *tg;
2906 u64 rt_period;
2907 u64 rt_runtime;
2908 };
2909
2910 static int tg_rt_schedulable(struct task_group *tg, void *data)
2911 {
2912 struct rt_schedulable_data *d = data;
2913 struct task_group *child;
2914 unsigned long total, sum = 0;
2915 u64 period, runtime;
2916
2917 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2918 runtime = tg->rt_bandwidth.rt_runtime;
2919
2920 if (tg == d->tg) {
2921 period = d->rt_period;
2922 runtime = d->rt_runtime;
2923 }
2924
2925 /*
2926 * Cannot have more runtime than the period.
2927 */
2928 if (runtime > period && runtime != RUNTIME_INF) {
2929 return -EINVAL;
2930 }
2931
2932 /*
2933 * Ensure we don't starve existing RT tasks if runtime turns zero.
2934 */
2935 if (rt_bandwidth_enabled() && !runtime && tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) {
2936 return -EBUSY;
2937 }
2938
2939 total = to_ratio(period, runtime);
2940 /*
2941 * Nobody can have more than the global setting allows.
2942 */
2943 if (total > to_ratio(global_rt_period(), global_rt_runtime())) {
2944 return -EINVAL;
2945 }
2946
2947 /*
2948 * The sum of our children's runtime should not exceed our own.
2949 */
2950 list_for_each_entry_rcu(child, &tg->children, siblings)
2951 {
2952 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2953 runtime = child->rt_bandwidth.rt_runtime;
2954
2955 if (child == d->tg) {
2956 period = d->rt_period;
2957 runtime = d->rt_runtime;
2958 }
2959
2960 sum += to_ratio(period, runtime);
2961 }
2962
2963 if (sum > total) {
2964 return -EINVAL;
2965 }
2966
2967 return 0;
2968 }
2969
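/*
 * Worked example (illustrative, not kernel code) of the parent/children rule
 * enforced by tg_rt_schedulable() above, expressed with plain runtime/period
 * ratios: a parent allowed 400000/1000000 (40%) can host children totalling
 * at most 40%, so two children at 20% each pass while 30% + 20% is rejected.
 */
#if 0
/* Returns nonzero when the children's combined ratio fits within the parent's. */
static int toy_rt_schedulable(unsigned long parent_ratio,
			      const unsigned long *child_ratio, int n)
{
	unsigned long sum = 0;

	for (int i = 0; i < n; i++)
		sum += child_ratio[i];
	return sum <= parent_ratio;
}
#endif
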
2970 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2971 {
2972 int ret;
2973
2974 struct rt_schedulable_data data = {
2975 .tg = tg,
2976 .rt_period = period,
2977 .rt_runtime = runtime,
2978 };
2979
2980 rcu_read_lock();
2981 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2982 rcu_read_unlock();
2983
2984 return ret;
2985 }
2986
2987 static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime)
2988 {
2989 int i, err = 0;
2990
2991 /*
2992 * Disallowing the root group RT runtime is BAD, it would disallow the
2993 * kernel creating (and/or operating) RT threads.
2994 */
2995 if (tg == &root_task_group && rt_runtime == 0) {
2996 return -EINVAL;
2997 }
2998
2999 /* No period doesn't make any sense. */
3000 if (rt_period == 0) {
3001 return -EINVAL;
3002 }
3003
3004 /*
3005 * Bound quota to defend quota against overflow during bandwidth shift.
3006 */
3007 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) {
3008 return -EINVAL;
3009 }
3010
3011 mutex_lock(&rt_constraints_mutex);
3012 err = __rt_schedulable(tg, rt_period, rt_runtime);
3013 if (err) {
3014 goto unlock;
3015 }
3016
3017 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3018 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
3019 tg->rt_bandwidth.rt_runtime = rt_runtime;
3020
3021 for_each_possible_cpu(i)
3022 {
3023 struct rt_rq *rt_rq = tg->rt_rq[i];
3024
3025 raw_spin_lock(&rt_rq->rt_runtime_lock);
3026 rt_rq->rt_runtime = rt_runtime;
3027 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3028 }
3029 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3030 unlock:
3031 mutex_unlock(&rt_constraints_mutex);
3032
3033 return err;
3034 }
3035
3036 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
3037 {
3038 u64 rt_runtime, rt_period;
3039
3040 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
3041 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
3042 if (rt_runtime_us < 0) {
3043 rt_runtime = RUNTIME_INF;
3044 } else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) {
3045 return -EINVAL;
3046 }
3047
3048 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3049 }
3050
3051 long sched_group_rt_runtime(struct task_group *tg)
3052 {
3053 u64 rt_runtime_us;
3054
3055 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) {
3056 return -1;
3057 }
3058
3059 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
3060 do_div(rt_runtime_us, NSEC_PER_USEC);
3061 return rt_runtime_us;
3062 }
3063
3064 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
3065 {
3066 u64 rt_runtime, rt_period;
3067
3068 if (rt_period_us > U64_MAX / NSEC_PER_USEC) {
3069 return -EINVAL;
3070 }
3071
3072 rt_period = rt_period_us * NSEC_PER_USEC;
3073 rt_runtime = tg->rt_bandwidth.rt_runtime;
3074
3075 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3076 }
3077
3078 long sched_group_rt_period(struct task_group *tg)
3079 {
3080 u64 rt_period_us;
3081
3082 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
3083 do_div(rt_period_us, NSEC_PER_USEC);
3084 return rt_period_us;
3085 }
3086
3087 static int sched_rt_global_constraints(void)
3088 {
3089 int ret = 0;
3090
3091 mutex_lock(&rt_constraints_mutex);
3092 ret = __rt_schedulable(NULL, 0, 0);
3093 mutex_unlock(&rt_constraints_mutex);
3094
3095 return ret;
3096 }
3097
3098 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
3099 {
3100 /* Don't accept realtime tasks when there is no way for them to run */
3101 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) {
3102 return 0;
3103 }
3104
3105 return 1;
3106 }
3107
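/*
 * Illustrative userspace sketch (not kernel code): with CONFIG_RT_GROUP_SCHED
 * the per-group runtime handled by sched_group_set_rt_runtime() above is
 * exposed as the cgroup-v1 cpu.rt_runtime_us file. The mount point and group
 * name below are assumptions made for the example.
 */
#if 0
#include <stdio.h>

static int set_group_rt_runtime_us(long us)
{
	FILE *f = fopen("/sys/fs/cgroup/cpu/example_group/cpu.rt_runtime_us", "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", us);	/* -1 means unlimited (RUNTIME_INF) */
	return fclose(f);
}
#endif
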
3108 #else /* !CONFIG_RT_GROUP_SCHED */
3109 static int sched_rt_global_constraints(void)
3110 {
3111 unsigned long flags;
3112 int i;
3113
3114 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3115 for_each_possible_cpu(i)
3116 {
3117 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
3118
3119 raw_spin_lock(&rt_rq->rt_runtime_lock);
3120 rt_rq->rt_runtime = global_rt_runtime();
3121 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3122 }
3123 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3124
3125 return 0;
3126 }
3127 #endif /* CONFIG_RT_GROUP_SCHED */
3128
3129 static int sched_rt_global_validate(void)
3130 {
3131 if (sysctl_sched_rt_period <= 0) {
3132 return -EINVAL;
3133 }
3134
3135 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
3136 ((u64)sysctl_sched_rt_runtime * NSEC_PER_USEC > max_rt_runtime))) {
3137 return -EINVAL;
3138 }
3139
3140 return 0;
3141 }
3142
3143 static void sched_rt_do_global(void)
3144 {
3145 raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
3146 def_rt_bandwidth.rt_runtime = global_rt_runtime();
3147 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
3148 raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
3149 }
3150
3151 int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos)
3152 {
3153 int old_period, old_runtime;
3154 static DEFINE_MUTEX(mutex);
3155 int ret;
3156
3157 mutex_lock(&mutex);
3158 old_period = sysctl_sched_rt_period;
3159 old_runtime = sysctl_sched_rt_runtime;
3160
3161 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3162 if (!ret && write) {
3163 ret = sched_rt_global_validate();
3164 if (ret) {
3165 goto undo;
3166 }
3167
3168 ret = sched_dl_global_validate();
3169 if (ret) {
3170 goto undo;
3171 }
3172
3173 ret = sched_rt_global_constraints();
3174 if (ret) {
3175 goto undo;
3176 }
3177
3178 sched_rt_do_global();
3179 sched_dl_do_global();
3180 }
3181 if (0) {
3182 undo:
3183 sysctl_sched_rt_period = old_period;
3184 sysctl_sched_rt_runtime = old_runtime;
3185 }
3186 mutex_unlock(&mutex);
3187
3188 return ret;
3189 }
3190
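/*
 * Illustrative userspace sketch (not kernel code): sched_rt_handler() above is
 * invoked when the global RT bandwidth sysctls are written, for example to
 * lower the RT share from the default 950000us per 1000000us period (95%) to
 * 80%. The value below is only an example.
 */
#if 0
#include <stdio.h>

static int set_global_rt_runtime_us(long us)
{
	FILE *f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", us);	/* e.g. 800000 with a 1000000us period */
	return fclose(f);
}
#endif
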
3191 int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos)
3192 {
3193 int ret;
3194 static DEFINE_MUTEX(mutex);
3195
3196 mutex_lock(&mutex);
3197 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3198 /*
3199 * Make sure that internally we keep jiffies.
3200 * Also, writing zero or a negative value resets the timeslice to the default:
3201 */
3202 if (!ret && write) {
3203 sched_rr_timeslice =
3204 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : msecs_to_jiffies(sysctl_sched_rr_timeslice);
3205 }
3206 mutex_unlock(&mutex);
3207
3208 return ret;
3209 }
3210
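/*
 * Illustrative userspace sketch (not kernel code): sched_rr_handler() above
 * backs /proc/sys/kernel/sched_rr_timeslice_ms; writing zero or a negative
 * value restores the built-in RR_TIMESLICE default.
 */
#if 0
#include <stdio.h>

static int read_rr_timeslice_ms(void)
{
	int ms = -1;
	FILE *f = fopen("/proc/sys/kernel/sched_rr_timeslice_ms", "r");

	if (f) {
		if (fscanf(f, "%d", &ms) != 1)
			ms = -1;
		fclose(f);
	}
	return ms;
}
#endif
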
3211 #ifdef CONFIG_SCHED_DEBUG
3212 void print_rt_stats(struct seq_file *m, int cpu)
3213 {
3214 rt_rq_iter_t iter;
3215 struct rt_rq *rt_rq;
3216
3217 rcu_read_lock();
3218 cycle_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) print_rt_rq(m, cpu, rt_rq);
3219 rcu_read_unlock();
3220 }
3221 #endif /* CONFIG_SCHED_DEBUG */
3222