// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#include "sched.h"

#include "pelt.h"
#include "walt.h"

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

#ifdef CONFIG_SCHED_RT_CAS
unsigned int sysctl_sched_enable_rt_cas = 1;
#endif

#ifdef CONFIG_SCHED_RT_ACTIVE_LB
unsigned int sysctl_sched_enable_rt_active_lb = 1;
#endif

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun) {
			break;
		}

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle) {
		rt_b->rt_period_active = 0;
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

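/*
 * Example of how the period timer above is used, assuming the stock
 * defaults of sysctl_sched_rt_period_us = 1000000 and
 * sysctl_sched_rt_runtime_us = 950000: def_rt_bandwidth then lets RT tasks
 * consume at most 950 ms of CPU time in every 1 s period. Once an rt_rq
 * has accumulated that much rt_time it is throttled by
 * sched_rt_runtime_exceeded(), and do_sched_rt_period_timer() clears the
 * throttle again at the next period boundary.
 */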
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) {
		return;
	}

	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a runaway
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se) {
		destroy_rt_bandwidth(&tg->rt_bandwidth);
	}

	for_each_possible_cpu(i)
	{
		if (tg->rt_rq) {
			kfree(tg->rt_rq[i]);
		}
		if (tg->rt_se) {
			kfree(tg->rt_se[i]);
		}
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu,
		      struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se) {
		return;
	}

	if (!parent) {
		rt_se->rt_rq = &rq->rt;
	} else {
		rt_se->rt_rq = parent->my_q;
	}

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

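/*
 * Sketch of the per-CPU layout wired up by init_tg_rt_entry() above for a
 * two-level hierarchy (the group names are made up for illustration):
 *
 *	root:	rq->rt           <- enqueues tg_A->rt_se[cpu] (my_q == tg_A->rt_rq[cpu])
 *	tg_A:	tg_A->rt_rq[cpu] <- enqueues tg_B->rt_se[cpu] (parent == tg_A->rt_se[cpu])
 *
 * A child group's entity is queued on its parent's rt_rq, so throttling the
 * parent keeps the whole subtree off the CPU. With CONFIG_RT_GROUP_SCHED the
 * per-group bandwidth is normally configured from userspace through the cpu
 * cgroup files cpu.rt_period_us and cpu.rt_runtime_us.
 */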
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq) {
		goto err;
	}
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se) {
		goto err;
	}

	init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i)
	{
		rt_rq = kzalloc_node(sizeof(struct rt_rq), GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq) {
			goto err;
		}

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i));
		if (!rt_se) {
			goto err_free_rq;
		}

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg)
{
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static void pull_rt_task(struct rq *this_rq);

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/*
	 * Try to pull RT tasks here if we lower this rq's prio and cpu is not
	 * isolated
	 */
	return rq->rt.highest_prio.curr > prev->prio && !cpu_isolated(cpu_of(rq));
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online) {
		return;
	}

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online) {
		return;
	}

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se)) {
		return;
	}

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1) {
		rt_rq->rt_nr_migratory++;
	}

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se)) {
		return;
	}

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1) {
		rt_rq->rt_nr_migratory--;
	}

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static DEFINE_PER_CPU(struct callback_head, rt_push_head);
static DEFINE_PER_CPU(struct callback_head, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

static inline void rt_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_tasks(rq)) {
		return;
	}

	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}

static inline void rt_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next) {
		rq->rt.highest_prio.next = p->prio;
	}
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else {
		rq->rt.highest_prio.next = MAX_RT_PRIO;
	}
}

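/*
 * Example of the bookkeeping above: with a prio-10 task running and
 * migratable prio-20 and prio-30 tasks queued, pushable_tasks holds the
 * latter two in priority order and highest_prio.next is 20. Other CPUs
 * only need this cached value to judge whether this runqueue has anything
 * worth pulling, without walking the plist themselves.
 */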
#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	return false;
}

static inline void pull_rt_task(struct rq *this_rq)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where uclamp_min value
 * is higher than the capacity of a @cpu. For non-heterogeneous systems this
 * function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	unsigned int min_cap;
	unsigned int max_cap;
	unsigned int cpu_cap;

	/* Only heterogeneous systems can benefit from this check */
	if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
		return true;
	}

	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	cpu_cap = capacity_orig_of(cpu);

	return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg) {
		return RUNTIME_INF;
	}

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups) {
		tg = NULL;
	}

	return tg;
}

#define cycle_each_rt_rq(rt_rq, iter, rq)						\
	for (iter = container_of(&task_groups, typeof(*iter), list);			\
	     (iter = next_task_group(iter)) && (rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define cycle_each_sched_rt_entity(rt_se) for (; rt_se; rt_se = rt_se->parent)

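/*
 * Example usage of the two iterators above; this is the pattern used by
 * __enable_runtime()/__disable_runtime() further down:
 *
 *	rt_rq_iter_t iter;
 *	struct rt_rq *rt_rq;
 *
 *	cycle_each_rt_rq(rt_rq, iter, rq) {
 *		...visits this CPU's rt_rq of every task_group...
 *	}
 */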
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se) {
			enqueue_top_rt_rq(rt_rq);
		} else if (!on_rt_rq(rt_se)) {
			enqueue_rt_entity(rt_se, 0);
		}

		if (rt_rq->highest_prio.curr < curr->prio) {
			resched_curr(rq);
		}
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se) {
		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
	} else if (on_rt_rq(rt_se)) {
		dequeue_rt_entity(rt_se, 0);
	}
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq) {
		return !!rt_rq->rt_nr_boosted;
	}

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define cycle_each_rt_rq(rt_rq, iter, rq) for ((void)(iter), (rt_rq) = &(rq)->rt; (rt_rq); (rt_rq) = NULL)

#define cycle_each_sched_rt_entity(rt_se) for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running) {
		return;
	}

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) || rt_rq->rt_time < rt_b->rt_runtime);
}

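/*
 * Worked example for the runtime borrowing implemented below (only active
 * when the RT_RUNTIME_SHARE scheduling feature is enabled), assuming a 1 s
 * period, a local rt_runtime of 950 ms that is fully consumed, and three
 * idle peer CPUs in the root domain (weight == 4): do_balance_runtime()
 * may take up to (950 ms - 0) / 4 from each peer, stopping early once the
 * local rt_runtime reaches the full period, so a busy CPU can temporarily
 * exceed its own share while its neighbours have spare budget.
 */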
#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span)
	{
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq) {
			continue;
		}

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF) {
			goto next;
		}

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period) {
				diff = rt_period - rt_rq->rt_runtime;
			}
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running)) {
		return;
	}

	cycle_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) {
			goto balanced;
		}
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent out and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span)
		{
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) {
				continue;
			}

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want) {
				break;
			}
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running)) {
		return;
	}

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	cycle_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void balance_runtime(struct rt_rq *rt_rq)
{
	if (!sched_feat(RT_RUNTIME_SHARE)) {
		return;
	}

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}
}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq)
{
}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * When the tasks in the task_group run on either isolated
	 * CPUs or non-isolated CPUs, whether they are isolcpus or
	 * were isolated via cpusets, check all the online rt_rqs
	 * lest the timer run on a CPU which does not service
	 * all runqueues, potentially leaving other CPUs indefinitely
	 * throttled.
	 */
	span = cpu_online_mask;
#endif
	for_each_cpu(i, span)
	{
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		int skip;

		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) {
			rt_rq->rt_runtime = rt_b->rt_runtime;
		}
		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		if (skip) {
			continue;
		}

		raw_spin_lock(&rq->lock);
		update_rq_clock(rq);

		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled) {
				balance_runtime(rt_rq);
			}
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun * runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled check_preempt_curr() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle) {
					rq_clock_cancel_skipupdate(rq);
				}
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running) {
				idle = 0;
			}
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq)) {
				enqueue = 1;
			}
		}
		if (rt_rq->rt_throttled) {
			throttled = 1;
		}

		if (enqueue) {
			sched_rt_rq_enqueue(rt_rq);
		}
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) {
		return 1;
	}

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq) {
		return rt_rq->highest_prio.curr;
	}
#endif

	return rt_task_of(rt_se)->prio;
}

static inline void try_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled) {
		return rt_rq_throttled(rt_rq);
	}

	if (runtime >= sched_rt_period(rt_rq)) {
		return 0;
	}

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF) {
		return 0;
	}

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

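/*
 * Throttling timeline for a single CPU-bound RT task, assuming the default
 * 950 ms of runtime per 1 s period and no runtime borrowing:
 *
 *	t = 0 ms:	task runs, update_curr_rt() accumulates rt_time
 *	t = 950 ms:	rt_time exceeds rt_runtime, the rt_rq is throttled
 *			and dequeued; lower classes get the remaining 50 ms
 *	t = 1000 ms:	do_sched_rt_period_timer() decays rt_time, clears
 *			rt_throttled and re-enqueues the rt_rq
 */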
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	u64 delta_exec;
	u64 now;

	if (curr->sched_class != &rt_sched_class) {
		return;
	}

	now = rq_clock_task(rq);
	delta_exec = now - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0)) {
		return;
	}

	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = now;
	cgroup_account_cputime(curr, delta_exec);

	if (!rt_bandwidth_enabled()) {
		return;
	}

	cycle_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
		int exceeded;

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			exceeded = sched_rt_runtime_exceeded(rt_rq);
			if (exceeded) {
				resched_curr(rq);
			}
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
			if (exceeded) {
				try_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
			}
		}
	}
}

static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued) {
		return;
	}

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, count);
	rt_rq->rt_queued = 0;
}

static void enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued) {
		return;
	}

	if (rt_rq_throttled(rt_rq)) {
		return;
	}

	if (rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}

	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
	cpufreq_update_util(rq, 0);
}

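/*
 * The helpers below keep the root domain's cpupri database in sync with
 * each runqueue's highest RT priority. For example, when a prio-5 task is
 * enqueued on a CPU whose previous best was prio 20, inc_rt_prio_smp()
 * calls cpupri_set() with the new priority, so find_lowest_rq() on other
 * CPUs immediately stops treating this CPU as a low-priority target.
 */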
#if defined CONFIG_SMP

static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq) {
		return;
	}
#endif
	if (rq->online && prio < prev_prio) {
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
	}
}

static void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq) {
		return;
	}
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio) {
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
	}
}

#else /* CONFIG_SMP */

static inline void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
}
static inline void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio) {
		rt_rq->highest_prio.curr = prio;
	}

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {
		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr = sched_find_first_bit(array->bitmap);
		}
	} else {
		rt_rq->highest_prio.curr = MAX_RT_PRIO;
	}

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se)) {
		rt_rq->rt_nr_boosted++;
	}

	if (rt_rq->tg) {
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
	}
}

static void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se)) {
		rt_rq->rt_nr_boosted--;
	}

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq) {
		return group_rq->rt_nr_running;
	} else {
		return 1;
	}
}

static inline unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct task_struct *tsk;

	if (group_rq) {
		return group_rq->rr_nr_running;
	}

	tsk = rt_task_of(rt_se);

	return (tsk->policy == SCHED_RR) ? 1 : 0;
}

static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) {
		return false;
	}

	return true;
}

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	list_del_init(&rt_se->run_list);

	if (list_empty(array->queue + rt_se_prio(rt_se))) {
		__clear_bit(rt_se_prio(rt_se), array->bitmap);
	}

	rt_se->on_list = 0;
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list) {
			__delist_rt_entity(rt_se, array);
		}
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD) {
			list_add(&rt_se->run_list, queue);
		} else {
			list_add_tail(&rt_se->run_list, queue);
		}

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct sched_rt_entity *back = NULL;
	unsigned int rt_nr_running;

	cycle_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se)) {
			__dequeue_rt_entity(rt_se, flags);
		}
	}
	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se, flags);
	cycle_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, flags);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se, flags);

	cycle_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running) {
			__enqueue_rt_entity(rt_se, flags);
		}
	}
	enqueue_top_rt_rq(&rq->rt);
}

#ifdef CONFIG_SMP
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, bool sync)
{
	/*
	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
	 * and force it to run for a likely small time after the RT wakee is
	 * done. So, only honor RT sync wakeups from RT wakers.
	 */
	return sync && task_has_rt_policy(rq->curr) && p->prio <= rq->rt.highest_prio.next && rq->rt.rt_nr_running <= 0x2;
}
#else
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, bool sync)
{
	return 0;
}
#endif

/*
 * Adding/removing a task to/from a priority array:
 */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;
	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);

	if (flags & ENQUEUE_WAKEUP) {
		rt_se->timeout = 0;
	}

	enqueue_rt_entity(rt_se, flags);
	walt_inc_cumulative_runnable_avg(rq, p);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 && !should_honor_rt_sync(rq, p, sync)) {
		enqueue_pushable_task(rq, p);
	}
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se, flags);
	walt_dec_cumulative_runnable_avg(rq, p);

	dequeue_pushable_task(rq, p);
}

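/*
 * Illustrative call flow for enqueue_task_rt() above (dequeue_task_rt()
 * mirrors it), with CONFIG_RT_GROUP_SCHED and a task in a child group:
 *
 *	enqueue_task_rt(p)
 *	  enqueue_rt_entity(&p->rt)
 *	    dequeue_rt_stack()		- unhook every level, top-down
 *	    __enqueue_rt_entity()	- re-add the task entity, then each parent
 *	    enqueue_top_rt_rq()		- account the whole stack in rq->nr_running
 *
 * Without group scheduling the stack is just the task's own entity and the
 * walk collapses to a single __enqueue_rt_entity() call.
 */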
/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head) {
			list_move(&rt_se->run_list, queue);
		} else {
			list_move_tail(&rt_se->run_list, queue);
		}
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	cycle_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	struct rq *this_cpu_rq;
	bool test;
	int target_cpu = -1;
	bool sync = !!(flags & WF_SYNC);
	int this_cpu;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) {
		goto out;
	}

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */
	this_cpu = smp_processor_id();
	this_cpu_rq = cpu_rq(this_cpu);

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 *
	 * We take into account the capacity of the CPU to ensure it fits the
	 * requirement of the task - which is only important on heterogeneous
	 * systems like big.LITTLE.
	 */
	test = curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 0x2 || curr->prio <= p->prio);

	/*
	 * Respect the sync flag as long as the task can run on this CPU.
	 */
	if (should_honor_rt_sync(this_cpu_rq, p, sync) && cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
		cpu = this_cpu;
		goto out_unlock;
	}

	if (test || !rt_task_fits_capacity(p, cpu)) {
		int target = find_lowest_rq(p);
		/*
		 * Bail out if we were forcing a migration to find a better
		 * fitting CPU but our search failed.
		 */
		if (!test && target != -1 && !rt_task_fits_capacity(p, target)) {
			goto out_unlock;
		}

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 && p->prio < cpu_rq(target)->rt.highest_prio.curr) {
			cpu = target;
		}
	}

out_unlock:
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) {
		return;
	}

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) {
		return;
	}

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) {
		check_preempt_equal_prio(rq, p);
	}
#endif
}

static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first) {
		return;
	}

	/*
	 * If prev task was rt, put_prev_task() has already updated the
	 * utilization. We only care about the case where we start to schedule a
	 * rt task
	 */
	if (rq->curr->sched_class != &rt_sched_class) {
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
	}

	rt_queue_push_tasks(rq);
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	return rt_task_of(rt_se);
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p;

	if (!sched_rt_runnable(rq)) {
		return NULL;
	}

	p = _pick_next_task_rt(rq);
	set_next_task_rt(rq, p, true);
	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) {
		enqueue_pushable_task(rq, p);
	}
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
		return 1;
	}

	return 0;
}

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the CPU, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq)) {
		return NULL;
	}

	plist_for_each_entry(p, head, pushable_tasks)
	{
		if (pick_rt_task(rq, p, cpu)) {
			return p;
		}
	}

	return NULL;
}

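/*
 * The CONFIG_SCHED_RT_CAS block below adds capacity-aware selection on top
 * of the cpupri result: among the CPUs reported as running lower-priority
 * work, find_cas_cpu() prefers the sched_group of smallest capacity that
 * still fits the task's (uclamp-boosted) utilization, keeping bigger CPUs
 * as a backup for boosted tasks, RTG tasks or tasks that fit nowhere else.
 */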
#ifdef CONFIG_SCHED_RT_CAS
static int find_cas_cpu(struct sched_domain *sd, struct task_struct *task, struct cpumask *lowest_mask)
{
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	struct sched_group *sg = NULL;
	struct sched_group *sg_target = NULL;
	struct sched_group *sg_backup = NULL;
	struct cpumask search_cpu, backup_search_cpu;
	int cpu = -1;
	int target_cpu = -1;
	unsigned long cpu_capacity;
	unsigned long boosted_tutil = uclamp_task_util(task);
	unsigned long target_capacity = ULONG_MAX;
	unsigned long util;
	unsigned long target_cpu_util = ULONG_MAX;
	int prev_cpu = task_cpu(task);
#ifdef CONFIG_SCHED_RTG
	struct cpumask *rtg_target = NULL;
#endif
	bool boosted = uclamp_boosted(task);

	if (!sysctl_sched_enable_rt_cas) {
		return -1;
	}

	rcu_read_lock();

#ifdef CONFIG_SCHED_RTG
	rtg_target = find_rtg_target(task);
#endif

	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0));
	if (!sd) {
		rcu_read_unlock();
		return -1;
	}

	sg = sd->groups;
	do {
		if (!cpumask_intersects(lowest_mask, sched_group_span(sg))) {
			continue;
		}

		if (boosted) {
			if (cpumask_test_cpu(rd->max_cap_orig_cpu, sched_group_span(sg))) {
				sg_target = sg;
				break;
			}
		}

		cpu = group_first_cpu(sg);
#ifdef CONFIG_SCHED_RTG
		/* honor the rtg tasks */
		if (rtg_target) {
			if (cpumask_test_cpu(cpu, rtg_target)) {
				sg_target = sg;
				break;
			}

			/* active LB or big_task favor cpus with more capacity */
			if (task->state == TASK_RUNNING || boosted) {
				if (capacity_orig_of(cpu) > capacity_orig_of(cpumask_any(rtg_target))) {
					sg_target = sg;
					break;
				}

				sg_backup = sg;
				continue;
			}
		}
#endif
		/*
		 * 1. add margin to support task migration
		 * 2. if task_util is higher than all cpus, make sure the
		 *    sg_backup with the most powerful cpus is selected
		 */
		if (!rt_task_fits_capacity(task, cpu)) {
			sg_backup = sg;
			continue;
		}

		/* support task boost */
		cpu_capacity = capacity_orig_of(cpu);
		if (boosted_tutil > cpu_capacity) {
			sg_backup = sg;
			continue;
		}

		/* sg_target: select the sg with smaller capacity */
		if (cpu_capacity < target_capacity) {
			target_capacity = cpu_capacity;
			sg_target = sg;
		}
	} while (sg = sg->next, sg != sd->groups);

	if (!sg_target) {
		sg_target = sg_backup;
	}

	if (sg_target) {
		cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target));
		cpumask_copy(&backup_search_cpu, lowest_mask);
		cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu);
	} else {
		cpumask_copy(&search_cpu, lowest_mask);
		cpumask_clear(&backup_search_cpu);
	}

retry:
	cpu = cpumask_first(&search_cpu);
	do {
		trace_sched_find_cas_cpu_each(task, cpu, target_cpu, cpu_isolated(cpu), idle_cpu(cpu), boosted_tutil,
					      cpu_util(cpu), capacity_orig_of(cpu));

		if (cpu_isolated(cpu)) {
			continue;
		}

		if (!cpumask_test_cpu(cpu, task->cpus_ptr)) {
			continue;
		}

		/* find best cpu with smallest max_capacity */
		if (target_cpu != -1 && capacity_orig_of(cpu) > capacity_orig_of(target_cpu)) {
			continue;
		}

		util = cpu_util(cpu);
		/* Find the least loaded CPU */
		if (util > target_cpu_util) {
			continue;
		}

		/*
		 * If the previous CPU has the same load, keep it as
		 * target_cpu
		 */
		if (target_cpu_util == util && target_cpu == prev_cpu) {
			continue;
		}

		/*
		 * If candidate CPU is the previous CPU, select it.
		 * If all above conditions are same, select the least
		 * cumulative window demand CPU.
		 */
		target_cpu_util = util;
		target_cpu = cpu;
	} while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids);

	if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) {
		goto done;
	} else if (!cpumask_empty(&backup_search_cpu)) {
		cpumask_copy(&search_cpu, &backup_search_cpu);
		cpumask_clear(&backup_search_cpu);
		goto retry;
	}

done:
	trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu);
	rcu_read_unlock();
	return target_cpu;
}
#endif

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);
	int ret;
#ifdef CONFIG_SCHED_RT_CAS
	int cas_cpu;
#endif

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask)) {
		return -1;
	}

	if (task->nr_cpus_allowed == 1) {
		return -1; /* No other targets possible */
	}

	/*
	 * If we're on asym system ensure we consider the different capacities
	 * of the CPUs when searching for the lowest_mask.
	 */
	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, rt_task_fits_capacity);
	} else {
		ret = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
	}

	if (!ret) {
		return -1; /* No targets found */
	}

#ifdef CONFIG_SCHED_RT_CAS
	cas_cpu = find_cas_cpu(sd, task, lowest_mask);
	if (cas_cpu != -1) {
		return cas_cpu;
	}
#endif

	/*
	 * At this point we have built a mask of CPUs representing the
	 * lowest priority tasks in the system. Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last CPU that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask)) {
		return cpu;
	}

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which CPU is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask)) {
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
	}

	rcu_read_lock();
	for_each_domain(cpu, sd)
	{
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 && cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1) {
		return this_cpu;
	}

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids) {
		return cpu;
	}

	return -1;
}

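/*
 * Example of the fallback order in find_lowest_rq() above (after the
 * optional find_cas_cpu() pick): for a task that last ran on CPU 2 while
 * cpupri reports CPUs {1, 2, 5} as lowest priority, CPU 2 wins immediately.
 * Were CPU 2 not in the mask, a CPU sharing a SD_WAKE_AFFINE domain with it
 * (preferring this_cpu) would be chosen next, and only then an arbitrary
 * CPU from the mask.
 */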
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq)) {
		return NULL;
	}

	p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);
		if ((cpu == -1) || (cpu == rq->cpu)) {
			break;
		}

		lowest_rq = cpu_rq(cpu);
		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 */
			struct task_struct *next_task = pick_next_pushable_task(rq);
			if (unlikely(next_task != task || !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr))) {
				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio) {
			break;
		}

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded) {
		return 0;
	}

	next_task = pick_next_pushable_task(rq);
	if (!next_task) {
		return 0;
	}

retry:
	if (WARN_ON(next_task == rq->curr)) {
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_curr(rq);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
         */
        task = pick_next_pushable_task(rq);
        if (task == next_task) {
            /*
             * The task hasn't migrated, and is still the next
             * eligible task, but we failed to find a run-queue
             * to push it to. Do not retry in this case, since
             * other CPUs will pull from us when ready.
             */
            goto out;
        }

        if (!task) {
            /* No more tasks, just exit */
            goto out;
        }

        /*
         * Something has shifted, try again.
         */
        put_task_struct(next_task);
        next_task = task;
        goto retry;
    }

    deactivate_task(rq, next_task, 0);
    set_task_cpu(next_task, lowest_rq->cpu);
    activate_task(lowest_rq, next_task, 0);
    ret = 1;

    resched_curr(lowest_rq);

    double_unlock_balance(rq, lowest_rq);

out:
    put_task_struct(next_task);

    return ret;
}

static void push_rt_tasks(struct rq *rq)
{
    /* push_rt_task() will return true if it moved an RT task */
    while (push_rt_task(rq)) {
        ;
    }
}

#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified as there is currently
 * no way to know which of these CPUs have the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs,
 * which has been shown to cause large latency when done on machines with many
 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
 * RT tasks waiting to run.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on large
 * count CPU machines, this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued and a large number
 * of CPUs are scheduling a lower priority task at the same time.
 *
 * Each root domain has its own irq work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * tasks must be checked whether one or many CPUs are lowering
 * their priority, there's a single irq work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * irq work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rt_loop_next variable. This will make sure that the irq work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rt_loop_next will cause the iterator to perform another scan.
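 *
 * The result is a chain of irq_work callbacks: tell_cpu_to_push() queues
 * rto_push_work on the first CPU returned by rto_next_cpu(), and each run
 * of rto_push_irq_work_func() pushes what it can locally and then forwards
 * the work to the next CPU in the rto_mask, until rto_next_cpu() finds no
 * further candidates.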
 */
static int rto_next_cpu(struct root_domain *rd)
{
    int next;
    int cpu;

    /*
     * When starting the IPI RT pushing, the rto_cpu is set to -1,
     * rto_next_cpu() will simply return the first CPU found in
     * the rto_mask.
     *
     * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
     * will return the next CPU found in the rto_mask.
     *
     * If there are no more CPUs left in the rto_mask, then a check is made
     * against rto_loop and rto_loop_next. rto_loop is only updated with
     * the rto_lock held, but any CPU may increment the rto_loop_next
     * without any locking.
     */
    for (;;) {
        /* When rto_cpu is -1 this acts like cpumask_first() */
        cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);

        rd->rto_cpu = cpu;

        if (cpu < nr_cpu_ids) {
            return cpu;
        }

        rd->rto_cpu = -1;

        /*
         * ACQUIRE ensures we see the @rto_mask changes
         * made prior to the @next value observed.
         *
         * Matches WMB in rt_set_overload().
         */
        next = atomic_read_acquire(&rd->rto_loop_next);
        if (rd->rto_loop == next) {
            break;
        }

        rd->rto_loop = next;
    }

    return -1;
}

static inline bool rto_start_trylock(atomic_t *v)
{
    return !atomic_cmpxchg_acquire(v, 0, 1);
}

static inline void rto_start_unlock(atomic_t *v)
{
    atomic_set_release(v, 0);
}

static void tell_cpu_to_push(struct rq *rq)
{
    int cpu = -1;

    /* Keep the loop going if the IPI is currently active */
    atomic_inc(&rq->rd->rto_loop_next);

    /* Only one CPU can initiate a loop at a time */
    if (!rto_start_trylock(&rq->rd->rto_loop_start)) {
        return;
    }

    raw_spin_lock(&rq->rd->rto_lock);

    /*
     * The rto_cpu is updated under the lock; if it has a valid CPU
     * then the IPI is still running and will continue due to the
     * update to loop_next, and nothing needs to be done here.
     * Otherwise it is finishing up and an IPI needs to be sent.
     */
    if (rq->rd->rto_cpu < 0) {
        cpu = rto_next_cpu(rq->rd);
    }

    raw_spin_unlock(&rq->rd->rto_lock);

    rto_start_unlock(&rq->rd->rto_loop_start);

    if (cpu >= 0) {
        /* Make sure the rd does not get freed while pushing */
        sched_get_rd(rq->rd);
        irq_work_queue_on(&rq->rd->rto_push_work, cpu);
    }
}

/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
    struct root_domain *rd = container_of(work, struct root_domain, rto_push_work);
    struct rq *rq;
    int cpu;

    rq = this_rq();
    /*
     * We do not need to grab the lock to check for has_pushable_tasks.
     * When it gets updated, a check is made if a push is possible.
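     *
     * The unlocked check is only an optimisation: if it races with an
     * update we either take rq->lock and find nothing to push, or we
     * skip a push that a later pass of the push/pull machinery can
     * still perform.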
     */
    if (has_pushable_tasks(rq)) {
        raw_spin_lock(&rq->lock);
        push_rt_tasks(rq);
        raw_spin_unlock(&rq->lock);
    }

    raw_spin_lock(&rd->rto_lock);

    /* Pass the IPI to the next rt overloaded queue */
    cpu = rto_next_cpu(rd);

    raw_spin_unlock(&rd->rto_lock);

    if (cpu < 0) {
        sched_put_rd(rd);
        return;
    }

    /* Try the next RT overloaded CPU */
    irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */

static void pull_rt_task(struct rq *this_rq)
{
    int this_cpu = this_rq->cpu, cpu;
    bool resched = false;
    struct task_struct *p;
    struct rq *src_rq;
    int rt_overload_count = rt_overloaded(this_rq);

    if (likely(!rt_overload_count)) {
        return;
    }

    /*
     * Match the barrier from rt_set_overload(); this guarantees that if we
     * see overloaded we must also see the rto_mask bit.
     */
    smp_rmb();

    /* If we are the only overloaded CPU do nothing */
    if (rt_overload_count == 1 && cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) {
        return;
    }

#ifdef HAVE_RT_PUSH_IPI
    if (sched_feat(RT_PUSH_IPI)) {
        tell_cpu_to_push(this_rq);
        return;
    }
#endif

    for_each_cpu(cpu, this_rq->rd->rto_mask)
    {
        if (this_cpu == cpu) {
            continue;
        }

        src_rq = cpu_rq(cpu);
        /*
         * Don't bother taking the src_rq->lock if the next highest
         * task is known to be lower-priority than our current task.
         * This may look racy, but if this value is about to go
         * logically higher, the src_rq will push this task away.
         * And if it's going logically lower, we do not care.
         */
        if (src_rq->rt.highest_prio.next >= this_rq->rt.highest_prio.curr) {
            continue;
        }

        /*
         * We can potentially drop this_rq's lock in
         * double_lock_balance, and another CPU could
         * alter this_rq.
         */
        double_lock_balance(this_rq, src_rq);

        /*
         * We can only pull a task that is pushable
         * on its rq, and no others.
         */
        p = pick_highest_pushable_task(src_rq, this_cpu);
        /*
         * Do we have an RT task that preempts
         * the to-be-scheduled task?
         */
        if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
            WARN_ON(p == src_rq->curr);
            WARN_ON(!task_on_rq_queued(p));

            /*
             * There's a chance that p is higher in priority
             * than what's currently running on its CPU.
             * This is just that p is waking up and hasn't
             * had a chance to schedule. We only pull
             * p if it is lower in priority than the
             * current task on the run queue.
             */
            if (p->prio < src_rq->curr->prio) {
                goto skip;
            }

            resched = true;

            deactivate_task(src_rq, p, 0);
            set_task_cpu(p, this_cpu);
            activate_task(this_rq, p, 0);
            /*
             * We continue with the search, just in
             * case there's an even higher prio task
             * in another runqueue
             * (low likelihood but possible).
             */
        }
skip:
        double_unlock_balance(this_rq, src_rq);
    }

    if (resched) {
        resched_curr(this_rq);
    }
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now.
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
    bool need_to_push = !task_running(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 &&
                        (dl_task(rq->curr) || rt_task(rq->curr)) &&
                        (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio);
    if (need_to_push) {
        push_rt_tasks(rq);
    }
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
    if (rq->rt.overloaded) {
        rt_set_overload(rq);
    }

    __enable_runtime(rq);

    cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
    if (rq->rt.overloaded) {
        rt_clear_overload(rq);
    }

    __disable_runtime(rq);

    cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
    /*
     * If there are other RT tasks then we will reschedule
     * and the scheduling of the other RT tasks will handle
     * the balancing. But if we are the last RT task
     * we may need to handle the pulling of RT tasks
     * now.
     */
    if (!task_on_rq_queued(p) || rq->rt.rt_nr_running || cpu_isolated(cpu_of(rq))) {
        return;
    }

    rt_queue_pull_task(rq);
}

void __init init_sched_rt_class(void)
{
    unsigned int i;

    for_each_possible_cpu(i)
    {
        zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i));
    }
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
    /*
     * If we are running, update the avg_rt tracking, as the running time
     * will from now on be accounted into the latter.
     */
    if (task_current(rq, p)) {
        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
        return;
    }

    /*
     * If we are not running we may need to preempt the current
     * running task. If that current running task is also an RT task
     * then see if we can move to another run queue.
     */
    if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
        if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) {
            rt_queue_push_tasks(rq);
        }
#endif /* CONFIG_SMP */
        if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) {
            resched_curr(rq);
        }
    }
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
    if (!task_on_rq_queued(p)) {
        return;
    }

    if (rq->curr == p) {
#ifdef CONFIG_SMP
        /*
         * If our priority decreases while running, we
         * may need to pull tasks to this runqueue.
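         *
         * Note that rt_queue_pull_task() only queues a balance callback;
         * the actual pull runs once rq->lock is released, not here in the
         * middle of the priority update.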
2634 */ 2635 if (oldprio < p->prio) { 2636 rt_queue_pull_task(rq); 2637 } 2638 2639 /* 2640 * If there's a higher priority task waiting to run 2641 * then reschedule. 2642 */ 2643 if (p->prio > rq->rt.highest_prio.curr) { 2644 resched_curr(rq); 2645 } 2646#else 2647 /* For UP simply resched on drop of prio */ 2648 if (oldprio < p->prio) { 2649 resched_curr(rq); 2650 } 2651#endif /* CONFIG_SMP */ 2652 } else { 2653 /* 2654 * This task is not running, but if it is 2655 * greater than the current running task 2656 * then reschedule. 2657 */ 2658 if (p->prio < rq->curr->prio) { 2659 resched_curr(rq); 2660 } 2661 } 2662} 2663 2664#ifdef CONFIG_POSIX_TIMERS 2665static void watchdog(struct rq *rq, struct task_struct *p) 2666{ 2667 unsigned long soft, hard; 2668 2669 /* max may change after cur was read, this will be fixed next tick */ 2670 soft = task_rlimit(p, RLIMIT_RTTIME); 2671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 2672 2673 if (soft != RLIM_INFINITY) { 2674 unsigned long next; 2675 2676 if (p->rt.watchdog_stamp != jiffies) { 2677 p->rt.timeout++; 2678 p->rt.watchdog_stamp = jiffies; 2679 } 2680 2681 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC / HZ); 2682 if (p->rt.timeout > next) { 2683 posix_cputimers_rt_watchdog(&p->posix_cputimers, p->se.sum_exec_runtime); 2684 } 2685 } 2686} 2687#else 2688static inline void watchdog(struct rq *rq, struct task_struct *p) 2689{ 2690} 2691#endif 2692 2693/* 2694 * scheduler tick hitting a task of our scheduling class. 2695 * 2696 * NOTE: This function can be called remotely by the tick offload that 2697 * goes along full dynticks. Therefore no local assumption can be made 2698 * and everything must be accessed through the @rq and @curr passed in 2699 * parameters. 2700 */ 2701static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2702{ 2703 struct sched_rt_entity *rt_se = &p->rt; 2704 2705 update_curr_rt(rq); 2706 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 2707 2708 watchdog(rq, p); 2709 2710 /* 2711 * RR tasks need a special form of timeslice management. 2712 * FIFO tasks have no timeslices. 
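     *
     * For SCHED_RR the slice is decremented on every tick; once it hits
     * zero it is refilled from sched_rr_timeslice and the task is
     * requeued behind its peers (if it has any).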
2713 */ 2714 if (p->policy != SCHED_RR) { 2715 return; 2716 } 2717 2718 if (--p->rt.time_slice) { 2719 return; 2720 } 2721 2722 p->rt.time_slice = sched_rr_timeslice; 2723 2724 /* 2725 * Requeue to the end of queue if we (and all of our ancestors) are not 2726 * the only element on the queue 2727 */ 2728 cycle_each_sched_rt_entity(rt_se) { 2729 if (rt_se->run_list.prev != rt_se->run_list.next) { 2730 requeue_task_rt(rq, p, 0); 2731 resched_curr(rq); 2732 return; 2733 } 2734 } 2735} 2736 2737#ifdef CONFIG_SCHED_RT_ACTIVE_LB 2738static int rt_active_load_balance_cpu_stop(void *data) 2739{ 2740 struct rq *busiest_rq = data; 2741 struct task_struct *next_task = busiest_rq->rt_push_task; 2742 struct rq *lowest_rq = NULL; 2743 unsigned long flags; 2744 2745 raw_spin_lock_irqsave(&busiest_rq->lock, flags); 2746 busiest_rq->rt_active_balance = 0; 2747 2748 /* find_lock_lowest_rq locks the rq if found */ 2749 lowest_rq = find_lock_lowest_rq(next_task, busiest_rq); 2750 if (!lowest_rq) { 2751 goto out; 2752 } 2753 2754 if (capacity_orig_of(cpu_of(lowest_rq)) <= capacity_orig_of(task_cpu(next_task))) { 2755 goto unlock; 2756 } 2757 2758 deactivate_task(busiest_rq, next_task, 0); 2759 set_task_cpu(next_task, lowest_rq->cpu); 2760 activate_task(lowest_rq, next_task, 0); 2761 2762 resched_curr(lowest_rq); 2763unlock: 2764 double_unlock_balance(busiest_rq, lowest_rq); 2765out: 2766 put_task_struct(next_task); 2767 raw_spin_unlock_irqrestore(&busiest_rq->lock, flags); 2768 2769 return 0; 2770} 2771 2772static void check_for_migration_rt(struct rq *rq, struct task_struct *p) 2773{ 2774 bool need_actvie_lb = false; 2775 bool misfit_task = false; 2776 int cpu = task_cpu(p); 2777 unsigned long cpu_orig_cap; 2778#ifdef CONFIG_SCHED_RTG 2779 struct cpumask *rtg_target = NULL; 2780#endif 2781 2782 if (!sysctl_sched_enable_rt_active_lb) { 2783 return; 2784 } 2785 2786 if (p->nr_cpus_allowed == 1) { 2787 return; 2788 } 2789 2790 cpu_orig_cap = capacity_orig_of(cpu); 2791 /* cpu has max capacity, no need to do balance */ 2792 if (cpu_orig_cap == rq->rd->max_cpu_capacity) { 2793 return; 2794 } 2795 2796#ifdef CONFIG_SCHED_RTG 2797 rtg_target = find_rtg_target(p); 2798 if (rtg_target) { 2799 misfit_task = capacity_orig_of(cpumask_first(rtg_target)) > cpu_orig_cap; 2800 } else { 2801 misfit_task = !rt_task_fits_capacity(p, cpu); 2802 } 2803#else 2804 misfit_task = !rt_task_fits_capacity(p, cpu); 2805#endif 2806 if (misfit_task) { 2807 raw_spin_lock(&rq->lock); 2808 if (!rq->active_balance && !rq->rt_active_balance) { 2809 rq->rt_active_balance = 1; 2810 rq->rt_push_task = p; 2811 get_task_struct(p); 2812 need_actvie_lb = true; 2813 } 2814 raw_spin_unlock(&rq->lock); 2815 2816 if (need_actvie_lb) { 2817 stop_one_cpu_nowait(task_cpu(p), rt_active_load_balance_cpu_stop, rq, &rq->rt_active_balance_work); 2818 } 2819 } 2820} 2821#endif 2822 2823static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 2824{ 2825 /* 2826 * Time slice is 0 for SCHED_FIFO tasks 2827 */ 2828 if (task->policy == SCHED_RR) { 2829 return sched_rr_timeslice; 2830 } else { 2831 return 0; 2832 } 2833} 2834 2835const struct sched_class rt_sched_class __section("__rt_sched_class") = { 2836 .enqueue_task = enqueue_task_rt, 2837 .dequeue_task = dequeue_task_rt, 2838 .yield_task = yield_task_rt, 2839 2840 .check_preempt_curr = check_preempt_curr_rt, 2841 2842 .pick_next_task = pick_next_task_rt, 2843 .put_prev_task = put_prev_task_rt, 2844 .set_next_task = set_next_task_rt, 2845 2846#ifdef CONFIG_SMP 2847 .balance = balance_rt, 2848 
.select_task_rq = select_task_rq_rt, 2849 .set_cpus_allowed = set_cpus_allowed_common, 2850 .rq_online = rq_online_rt, 2851 .rq_offline = rq_offline_rt, 2852 .task_woken = task_woken_rt, 2853 .switched_from = switched_from_rt, 2854#endif 2855 2856 .task_tick = task_tick_rt, 2857 2858 .get_rr_interval = get_rr_interval_rt, 2859 2860 .prio_changed = prio_changed_rt, 2861 .switched_to = switched_to_rt, 2862 2863 .update_curr = update_curr_rt, 2864 2865#ifdef CONFIG_UCLAMP_TASK 2866 .uclamp_enabled = 1, 2867#endif 2868#ifdef CONFIG_SCHED_WALT 2869 .fixup_walt_sched_stats = fixup_walt_sched_stats_common, 2870#endif 2871#ifdef CONFIG_SCHED_RT_ACTIVE_LB 2872 .check_for_migration = check_for_migration_rt, 2873#endif 2874}; 2875 2876#ifdef CONFIG_RT_GROUP_SCHED 2877/* 2878 * Ensure that the real time constraints are schedulable. 2879 */ 2880static DEFINE_MUTEX(rt_constraints_mutex); 2881 2882static inline int tg_has_rt_tasks(struct task_group *tg) 2883{ 2884 struct task_struct *task; 2885 struct css_task_iter it; 2886 int ret = 0; 2887 2888 /* 2889 * Autogroups do not have RT tasks; see autogroup_create(). 2890 */ 2891 if (task_group_is_autogroup(tg)) { 2892 return 0; 2893 } 2894 2895 css_task_iter_start(&tg->css, 0, &it); 2896 while (!ret && (task = css_task_iter_next(&it))) { 2897 ret |= rt_task(task); 2898 } 2899 css_task_iter_end(&it); 2900 2901 return ret; 2902} 2903 2904struct rt_schedulable_data { 2905 struct task_group *tg; 2906 u64 rt_period; 2907 u64 rt_runtime; 2908}; 2909 2910static int tg_rt_schedulable(struct task_group *tg, void *data) 2911{ 2912 struct rt_schedulable_data *d = data; 2913 struct task_group *child; 2914 unsigned long total, sum = 0; 2915 u64 period, runtime; 2916 2917 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 2918 runtime = tg->rt_bandwidth.rt_runtime; 2919 2920 if (tg == d->tg) { 2921 period = d->rt_period; 2922 runtime = d->rt_runtime; 2923 } 2924 2925 /* 2926 * Cannot have more runtime than the period. 2927 */ 2928 if (runtime > period && runtime != RUNTIME_INF) { 2929 return -EINVAL; 2930 } 2931 2932 /* 2933 * Ensure we don't starve existing RT tasks if runtime turns zero. 2934 */ 2935 if (rt_bandwidth_enabled() && !runtime && tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) { 2936 return -EBUSY; 2937 } 2938 2939 total = to_ratio(period, runtime); 2940 /* 2941 * Nobody can have more than the global setting allows. 2942 */ 2943 if (total > to_ratio(global_rt_period(), global_rt_runtime())) { 2944 return -EINVAL; 2945 } 2946 2947 /* 2948 * The sum of our children's runtime should not exceed our own. 
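     *
     * E.g. a parent with runtime/period = 0.5 cannot host two children
     * that each claim 0.3: their summed ratio of 0.6 exceeds the
     * parent's 0.5, so the update is rejected with -EINVAL below.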
2949 */ 2950 list_for_each_entry_rcu(child, &tg->children, siblings) 2951 { 2952 period = ktime_to_ns(child->rt_bandwidth.rt_period); 2953 runtime = child->rt_bandwidth.rt_runtime; 2954 2955 if (child == d->tg) { 2956 period = d->rt_period; 2957 runtime = d->rt_runtime; 2958 } 2959 2960 sum += to_ratio(period, runtime); 2961 } 2962 2963 if (sum > total) { 2964 return -EINVAL; 2965 } 2966 2967 return 0; 2968} 2969 2970static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 2971{ 2972 int ret; 2973 2974 struct rt_schedulable_data data = { 2975 .tg = tg, 2976 .rt_period = period, 2977 .rt_runtime = runtime, 2978 }; 2979 2980 rcu_read_lock(); 2981 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 2982 rcu_read_unlock(); 2983 2984 return ret; 2985} 2986 2987static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) 2988{ 2989 int i, err = 0; 2990 2991 /* 2992 * Disallowing the root group RT runtime is BAD, it would disallow the 2993 * kernel creating (and or operating) RT threads. 2994 */ 2995 if (tg == &root_task_group && rt_runtime == 0) { 2996 return -EINVAL; 2997 } 2998 2999 /* No period doesn't make any sense. */ 3000 if (rt_period == 0) { 3001 return -EINVAL; 3002 } 3003 3004 /* 3005 * Bound quota to defend quota against overflow during bandwidth shift. 3006 */ 3007 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) { 3008 return -EINVAL; 3009 } 3010 3011 mutex_lock(&rt_constraints_mutex); 3012 err = __rt_schedulable(tg, rt_period, rt_runtime); 3013 if (err) { 3014 goto unlock; 3015 } 3016 3017 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 3018 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 3019 tg->rt_bandwidth.rt_runtime = rt_runtime; 3020 3021 for_each_possible_cpu(i) 3022 { 3023 struct rt_rq *rt_rq = tg->rt_rq[i]; 3024 3025 raw_spin_lock(&rt_rq->rt_runtime_lock); 3026 rt_rq->rt_runtime = rt_runtime; 3027 raw_spin_unlock(&rt_rq->rt_runtime_lock); 3028 } 3029 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 3030unlock: 3031 mutex_unlock(&rt_constraints_mutex); 3032 3033 return err; 3034} 3035 3036int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 3037{ 3038 u64 rt_runtime, rt_period; 3039 3040 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 3041 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 3042 if (rt_runtime_us < 0) { 3043 rt_runtime = RUNTIME_INF; 3044 } else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) { 3045 return -EINVAL; 3046 } 3047 3048 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 3049} 3050 3051long sched_group_rt_runtime(struct task_group *tg) 3052{ 3053 u64 rt_runtime_us; 3054 3055 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) { 3056 return -1; 3057 } 3058 3059 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 3060 do_div(rt_runtime_us, NSEC_PER_USEC); 3061 return rt_runtime_us; 3062} 3063 3064int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 3065{ 3066 u64 rt_runtime, rt_period; 3067 3068 if (rt_period_us > U64_MAX / NSEC_PER_USEC) { 3069 return -EINVAL; 3070 } 3071 3072 rt_period = rt_period_us * NSEC_PER_USEC; 3073 rt_runtime = tg->rt_bandwidth.rt_runtime; 3074 3075 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 3076} 3077 3078long sched_group_rt_period(struct task_group *tg) 3079{ 3080 u64 rt_period_us; 3081 3082 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 3083 do_div(rt_period_us, NSEC_PER_USEC); 3084 return rt_period_us; 3085} 3086 3087static int sched_rt_global_constraints(void) 3088{ 3089 int ret = 0; 
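
    /*
     * __rt_schedulable(NULL, 0, 0) does not change any group's own
     * period or runtime; it just re-walks the task_group tree and
     * validates the existing settings against the current global
     * rt_period/rt_runtime limits.
     */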
3090 3091 mutex_lock(&rt_constraints_mutex); 3092 ret = __rt_schedulable(NULL, 0, 0); 3093 mutex_unlock(&rt_constraints_mutex); 3094 3095 return ret; 3096} 3097 3098int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 3099{ 3100 /* Don't accept realtime tasks when there is no way for them to run */ 3101 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) { 3102 return 0; 3103 } 3104 3105 return 1; 3106} 3107 3108#else /* !CONFIG_RT_GROUP_SCHED */ 3109static int sched_rt_global_constraints(void) 3110{ 3111 unsigned long flags; 3112 int i; 3113 3114 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 3115 for_each_possible_cpu(i) 3116 { 3117 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 3118 3119 raw_spin_lock(&rt_rq->rt_runtime_lock); 3120 rt_rq->rt_runtime = global_rt_runtime(); 3121 raw_spin_unlock(&rt_rq->rt_runtime_lock); 3122 } 3123 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 3124 3125 return 0; 3126} 3127#endif /* CONFIG_RT_GROUP_SCHED */ 3128 3129static int sched_rt_global_validate(void) 3130{ 3131 if (sysctl_sched_rt_period <= 0) { 3132 return -EINVAL; 3133 } 3134 3135 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || 3136 ((u64)sysctl_sched_rt_runtime * NSEC_PER_USEC > max_rt_runtime))) { 3137 return -EINVAL; 3138 } 3139 3140 return 0; 3141} 3142 3143static void sched_rt_do_global(void) 3144{ 3145 raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock); 3146 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 3147 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 3148 raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock); 3149} 3150 3151int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) 3152{ 3153 int old_period, old_runtime; 3154 static DEFINE_MUTEX(mutex); 3155 int ret; 3156 3157 mutex_lock(&mutex); 3158 old_period = sysctl_sched_rt_period; 3159 old_runtime = sysctl_sched_rt_runtime; 3160 3161 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3162 if (!ret && write) { 3163 ret = sched_rt_global_validate(); 3164 if (ret) { 3165 goto undo; 3166 } 3167 3168 ret = sched_dl_global_validate(); 3169 if (ret) { 3170 goto undo; 3171 } 3172 3173 ret = sched_rt_global_constraints(); 3174 if (ret) { 3175 goto undo; 3176 } 3177 3178 sched_rt_do_global(); 3179 sched_dl_do_global(); 3180 } 3181 if (0) { 3182 undo: 3183 sysctl_sched_rt_period = old_period; 3184 sysctl_sched_rt_runtime = old_runtime; 3185 } 3186 mutex_unlock(&mutex); 3187 3188 return ret; 3189} 3190 3191int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) 3192{ 3193 int ret; 3194 static DEFINE_MUTEX(mutex); 3195 3196 mutex_lock(&mutex); 3197 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3198 /* 3199 * Make sure that internally we keep jiffies. 3200 * Also, writing zero resets the timeslice to default: 3201 */ 3202 if (!ret && write) { 3203 sched_rr_timeslice = 3204 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : msecs_to_jiffies(sysctl_sched_rr_timeslice); 3205 } 3206 mutex_unlock(&mutex); 3207 3208 return ret; 3209} 3210 3211#ifdef CONFIG_SCHED_DEBUG 3212void print_rt_stats(struct seq_file *m, int cpu) 3213{ 3214 rt_rq_iter_t iter; 3215 struct rt_rq *rt_rq; 3216 3217 rcu_read_lock(); 3218 cycle_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) print_rt_rq(m, cpu, rt_rq); 3219 rcu_read_unlock(); 3220} 3221#endif /* CONFIG_SCHED_DEBUG */ 3222