1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Scheduler topology setup/handling methods
4  */
5 #include "sched.h"
6 
7 DEFINE_MUTEX(sched_domains_mutex);
8 #ifdef CONFIG_LOCKDEP
9 EXPORT_SYMBOL_GPL(sched_domains_mutex);
10 #endif
11 
12 /* Protected by sched_domains_mutex: */
13 static cpumask_var_t sched_domains_tmpmask;
14 static cpumask_var_t sched_domains_tmpmask2;
15 
16 #define IMBALANCE_SD_SHARE_CPUCAPACITY 110
17 #define IMBALANCE_SD_SHARE_PKG 117
18 #define IMBALANCE_SD_NUMA 2
19 #define IMBALANCE_SD_NUMA_DIRECT 2
20 
21 #ifdef CONFIG_SCHED_DEBUG
22 
23 static int __init sched_debug_setup(char *str)
24 {
25     sched_debug_enabled = true;
26 
27     return 0;
28 }
29 early_param("sched_debug", sched_debug_setup);
30 
31 static inline bool sched_debug(void)
32 {
33     return sched_debug_enabled;
34 }
35 
36 #define SD_FLAG(_name, mflags) [__##_name] = {.meta_flags = mflags, .name = #_name},
37 const struct sd_flag_debug sd_flag_debug[] = {
38 #include <linux/sched/sd_flags.h>
39 };
40 #undef SD_FLAG
41 
42 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask)
43 {
44     struct sched_group *group = sd->groups;
45     unsigned long flags = sd->flags;
46     unsigned int idx;
47 
48     cpumask_clear(groupmask);
49 
50     printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
51     printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name);
52 
53     if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
54         printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
55     }
56     if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
57         printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58     }
59 
60     for_each_set_bit(idx, &flags, __SD_FLAG_CNT)
61     {
62         unsigned int flag = BIT(idx);
63         unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
64 
65         if ((meta_flags & SDF_SHARED_CHILD) && sd->child && !(sd->child->flags & flag)) {
66             printk(KERN_ERR "ERROR: flag %s set here but not in child\n", sd_flag_debug[idx].name);
67         }
68 
69         if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && !(sd->parent->flags & flag)) {
70             printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", sd_flag_debug[idx].name);
71         }
72     }
73 
74     printk(KERN_DEBUG "%*s groups:", level + 1, "");
75     do {
76         if (!group) {
77             printk("\n");
78             printk(KERN_ERR "ERROR: group is NULL\n");
79             break;
80         }
81 
82         if (!cpumask_weight(sched_group_span(group))) {
83             printk(KERN_CONT "\n");
84             printk(KERN_ERR "ERROR: empty group\n");
85             break;
86         }
87 
88         if (!(sd->flags & SD_OVERLAP) && cpumask_intersects(groupmask, sched_group_span(group))) {
89             printk(KERN_CONT "\n");
90             printk(KERN_ERR "ERROR: repeated CPUs\n");
91             break;
92         }
93 
94         cpumask_or(groupmask, groupmask, sched_group_span(group));
95 
96         printk(KERN_CONT " %d:{ span=%*pbl", group->sgc->id, cpumask_pr_args(sched_group_span(group)));
97 
98         if ((sd->flags & SD_OVERLAP) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
99             printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group)));
100         }
101 
102         if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
103             printk(KERN_CONT " cap=%lu", group->sgc->capacity);
104         }
105 
106         if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), sched_group_span(group))) {
107             printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
108         }
109 
110         printk(KERN_CONT " }");
111 
112         group = group->next;
113 
114         if (group != sd->groups) {
115             printk(KERN_CONT ",");
116         }
117     } while (group != sd->groups);
118     printk(KERN_CONT "\n");
119 
120     if (!cpumask_equal(sched_domain_span(sd), groupmask)) {
121         printk(KERN_ERR "ERROR: groups don't span domain->span\n");
122     }
123 
124     if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) {
125         printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
126     }
127     return 0;
128 }
129 
130 static void sched_domain_debug(struct sched_domain *sd, int cpu)
131 {
132     int level = 0;
133 
134     if (!sched_debug_enabled) {
135         return;
136     }
137 
138     if (!sd) {
139         printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
140         return;
141     }
142 
143     printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
144 
145     for (;;) {
146         if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) {
147             break;
148         }
149         level++;
150         sd = sd->parent;
151         if (!sd) {
152             break;
153         }
154     }
155 }
156 #else /* !CONFIG_SCHED_DEBUG */
157 
158 #define sched_debug_enabled 0
159 #define sched_domain_debug(sd, cpu)                                                                                    \
160     do {                                                                                                               \
161     } while (0)
162 static inline bool sched_debug(void)
163 {
164     return false;
165 }
166 #endif /* CONFIG_SCHED_DEBUG */
167 
168 /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
169 #define SD_FLAG(name, mflags) ((name) * !!((mflags) & SDF_NEEDS_GROUPS)) |
170 static const unsigned int SD_DEGENERATE_GROUPS_MASK =
171 #include <linux/sched/sd_flags.h>
172     0;
173 #undef SD_FLAG
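
/*
 * For instance, an sd_flags.h entry such as
 *     SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 * expands to "(SD_BALANCE_NEWIDLE * 1) |" in the SD_DEGENERATE_GROUPS_MASK
 * expansion above, while flags without SDF_NEEDS_GROUPS contribute
 * "(flag * 0) |" and so drop out of the mask.
 */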
174 
175 static int sd_degenerate(struct sched_domain *sd)
176 {
177     if (cpumask_weight(sched_domain_span(sd)) == 1) {
178         return 1;
179     }
180 
181     /* Following flags need at least 2 groups */
182     if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && (sd->groups != sd->groups->next)) {
183         return 0;
184     }
185 
186     /* Following flags don't use groups */
187     if (sd->flags & (SD_WAKE_AFFINE)) {
188         return 0;
189     }
190 
191     return 1;
192 }
193 
194 static int sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
195 {
196     unsigned long cflags = sd->flags, pflags = parent->flags;
197 
198     if (sd_degenerate(parent)) {
199         return 1;
200     }
201 
202     if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) {
203         return 0;
204     }
205 
206     /* Flags needing groups don't count if only 1 group in parent */
207     if (parent->groups == parent->groups->next) {
208         pflags &= ~SD_DEGENERATE_GROUPS_MASK;
209     }
210 
211     if (~cflags & pflags) {
212         return 0;
213     }
214 
215     return 1;
216 }
217 
218 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
219 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
220 unsigned int sysctl_sched_energy_aware = 1;
221 DEFINE_MUTEX(sched_energy_mutex);
222 bool sched_energy_update;
223 
224 #ifdef CONFIG_PROC_SYSCTL
225 int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos)
226 {
227     int ret, state;
228 
229     if (write && !capable(CAP_SYS_ADMIN)) {
230         return -EPERM;
231     }
232 
233     ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
234     if (!ret && write) {
235         state = static_branch_unlikely(&sched_energy_present);
236         if (state != sysctl_sched_energy_aware) {
237             mutex_lock(&sched_energy_mutex);
238             sched_energy_update = true;
239             rebuild_sched_domains();
240             sched_energy_update = false;
241             mutex_unlock(&sched_energy_mutex);
242         }
243     }
244 
245     return ret;
246 }
247 #endif
248 
249 static void free_pd(struct perf_domain *pd)
250 {
251     struct perf_domain *tmp;
252 
253     while (pd) {
254         tmp = pd->next;
255         kfree(pd);
256         pd = tmp;
257     }
258 }
259 
260 static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
261 {
262     while (pd) {
263         if (cpumask_test_cpu(cpu, perf_domain_span(pd))) {
264             return pd;
265         }
266         pd = pd->next;
267     }
268 
269     return NULL;
270 }
271 
272 static struct perf_domain *pd_init(int cpu)
273 {
274     struct em_perf_domain *obj = em_cpu_get(cpu);
275     struct perf_domain *pd;
276 
277     if (!obj) {
278         if (sched_debug()) {
279             pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
280         }
281         return NULL;
282     }
283 
284     pd = kzalloc(sizeof(*pd), GFP_KERNEL);
285     if (!pd) {
286         return NULL;
287     }
288     pd->em_pd = obj;
289 
290     return pd;
291 }
292 
293 static void perf_domain_debug(const struct cpumask *cpu_map, struct perf_domain *pd)
294 {
295     if (!sched_debug() || !pd) {
296         return;
297     }
298 
299     printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
300 
301     while (pd) {
302         printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)),
303                cpumask_pr_args(perf_domain_span(pd)), em_pd_nr_perf_states(pd->em_pd));
304         pd = pd->next;
305     }
306 
307     printk(KERN_CONT "\n");
308 }
309 
310 static void destroy_perf_domain_rcu(struct rcu_head *rp)
311 {
312     struct perf_domain *pd;
313 
314     pd = container_of(rp, struct perf_domain, rcu);
315     free_pd(pd);
316 }
317 
318 static void sched_energy_set(bool has_eas)
319 {
320     if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
321         if (sched_debug()) {
322             pr_info("%s: stopping EAS\n", __func__);
323         }
324         static_branch_disable_cpuslocked(&sched_energy_present);
325     } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
326         if (sched_debug()) {
327             pr_info("%s: starting EAS\n", __func__);
328         }
329         static_branch_enable_cpuslocked(&sched_energy_present);
330     }
331 }
332 
333 /*
334  * EAS can be used on a root domain if it meets all the following conditions:
335  *    1. an Energy Model (EM) is available;
336  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
337  *    3. no SMT is detected;
338  *    4. the EM complexity is low enough to keep scheduling overheads low;
339  *    5. schedutil is driving the frequency of all CPUs of the rd.
340  *
341  * The complexity of the Energy Model is defined as:
342  *
343  *              C = nr_pd * (nr_cpus + nr_ps)
344  *
345  * with parameters defined as:
346  *  - nr_pd:    the number of performance domains
347  *  - nr_cpus:  the number of CPUs
348  *  - nr_ps:    the sum of the number of performance states of all performance
349  *              domains (for example, on a system with 2 performance domains,
350  *              with 10 performance states each, nr_ps = 2 * 10 = 20).
351  *
352  * It is generally not a good idea to use such a model in the wake-up path on
353  * very complex platforms because of the associated scheduling overheads. The
354  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
355  * with per-CPU DVFS and less than 8 performance states each, for example.
356  */
357 #define EM_MAX_COMPLEXITY 2048
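
/*
 * Worked example of the limit above: 16 CPUs with per-CPU DVFS gives
 * nr_pd = nr_cpus = 16; with 7 performance states per domain,
 * nr_ps = 16 * 7 = 112, so C = 16 * (16 + 112) = 2048, which still
 * passes the "> EM_MAX_COMPLEXITY" check in build_perf_domains().
 */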
358 
359 static bool build_perf_domains(const struct cpumask *cpu_map)
360 {
361     int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
362     struct perf_domain *pd = NULL, *tmp;
363     int cpu = cpumask_first(cpu_map);
364     struct root_domain *rd = cpu_rq(cpu)->rd;
365     struct cpufreq_policy *policy;
366     struct cpufreq_governor *gov;
367 
368     if (!sysctl_sched_energy_aware) {
369         goto free;
370     }
371 
372     /* EAS is enabled for asymmetric CPU capacity topologies. */
373     if (!per_cpu(sd_asym_cpucapacity, cpu)) {
374         if (sched_debug()) {
375             pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", cpumask_pr_args(cpu_map));
376         }
377         goto free;
378     }
379 
380     /* EAS definitely does *not* handle SMT */
381     if (sched_smt_active()) {
382         pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", cpumask_pr_args(cpu_map));
383         goto free;
384     }
385 
386     for_each_cpu(i, cpu_map)
387     {
388         /* Skip already covered CPUs. */
389         if (find_pd(pd, i)) {
390             continue;
391         }
392 
393         /* Do not attempt EAS if schedutil is not being used. */
394         policy = cpufreq_cpu_get(i);
395         if (!policy) {
396             goto free;
397         }
398         gov = policy->governor;
399         cpufreq_cpu_put(policy);
400         if (gov != &schedutil_gov) {
401             if (rd->pd) {
402                 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", cpumask_pr_args(cpu_map));
403             }
404             goto free;
405         }
406 
407         /* Create the new pd and add it to the local list. */
408         tmp = pd_init(i);
409         if (!tmp) {
410             goto free;
411         }
412         tmp->next = pd;
413         pd = tmp;
414 
415         /*
416          * Count performance domains and performance states for the
417          * complexity check.
418          */
419         nr_pd++;
420         nr_ps += em_pd_nr_perf_states(pd->em_pd);
421     }
422 
423     /* Bail out if the Energy Model complexity is too high. */
424     if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
425         WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map));
426         goto free;
427     }
428 
429     perf_domain_debug(cpu_map, pd);
430 
431     /* Attach the new list of performance domains to the root domain. */
432     tmp = rd->pd;
433     rcu_assign_pointer(rd->pd, pd);
434     if (tmp) {
435         call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
436     }
437 
438     return !!pd;
439 
440 free:
441     free_pd(pd);
442     tmp = rd->pd;
443     rcu_assign_pointer(rd->pd, NULL);
444     if (tmp) {
445         call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
446     }
447 
448     return false;
449 }
450 #else
451 static void free_pd(struct perf_domain *pd)
452 {
453 }
454 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
455 
456 static void free_rootdomain(struct rcu_head *rcu)
457 {
458     struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
459 
460     cpupri_cleanup(&rd->cpupri);
461     cpudl_cleanup(&rd->cpudl);
462     free_cpumask_var(rd->dlo_mask);
463     free_cpumask_var(rd->rto_mask);
464     free_cpumask_var(rd->online);
465     free_cpumask_var(rd->span);
466     free_pd(rd->pd);
467     kfree(rd);
468 }
469 
470 void rq_attach_root(struct rq *rq, struct root_domain *rd)
471 {
472     struct root_domain *old_rd = NULL;
473     unsigned long flags;
474 
475     raw_spin_lock_irqsave(&rq->lock, flags);
476 
477     if (rq->rd) {
478         old_rd = rq->rd;
479 
480         if (cpumask_test_cpu(rq->cpu, old_rd->online)) {
481             set_rq_offline(rq);
482         }
483 
484         cpumask_clear_cpu(rq->cpu, old_rd->span);
485 
486         /*
487          * If we don't want to free the old_rd yet then
488          * set old_rd to NULL to skip the freeing later
489          * in this function:
490          */
491         if (!atomic_dec_and_test(&old_rd->refcount)) {
492             old_rd = NULL;
493         }
494     }
495 
496     atomic_inc(&rd->refcount);
497     rq->rd = rd;
498 
499     cpumask_set_cpu(rq->cpu, rd->span);
500     if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) {
501         set_rq_online(rq);
502     }
503 
504     raw_spin_unlock_irqrestore(&rq->lock, flags);
505 
506     if (old_rd) {
507         call_rcu(&old_rd->rcu, free_rootdomain);
508     }
509 }
510 
511 void sched_get_rd(struct root_domain *rd)
512 {
513     atomic_inc(&rd->refcount);
514 }
515 
516 void sched_put_rd(struct root_domain *rd)
517 {
518     if (!atomic_dec_and_test(&rd->refcount)) {
519         return;
520     }
521 
522     call_rcu(&rd->rcu, free_rootdomain);
523 }
524 
525 static int init_rootdomain(struct root_domain *rd)
526 {
527     if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) {
528         goto out;
529     }
530     if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) {
531         goto free_span;
532     }
533     if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) {
534         goto free_online;
535     }
536     if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) {
537         goto free_dlo_mask;
538     }
539 
540 #ifdef HAVE_RT_PUSH_IPI
541     rd->rto_cpu = -1;
542     raw_spin_lock_init(&rd->rto_lock);
543     init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
544 #endif
545 
546     init_dl_bw(&rd->dl_bw);
547     if (cpudl_init(&rd->cpudl) != 0) {
548         goto free_rto_mask;
549     }
550 
551     if (cpupri_init(&rd->cpupri) != 0) {
552         goto free_cpudl;
553     }
554 
555 #ifdef CONFIG_SCHED_RT_CAS
556     rd->max_cap_orig_cpu = -1;
557 #endif
558     return 0;
559 
560 free_cpudl:
561     cpudl_cleanup(&rd->cpudl);
562 free_rto_mask:
563     free_cpumask_var(rd->rto_mask);
564 free_dlo_mask:
565     free_cpumask_var(rd->dlo_mask);
566 free_online:
567     free_cpumask_var(rd->online);
568 free_span:
569     free_cpumask_var(rd->span);
570 out:
571     return -ENOMEM;
572 }
573 
574 /*
575  * By default the system creates a single root-domain with all CPUs as
576  * members (mimicking the global state we have today).
577  */
578 struct root_domain def_root_domain;
579 
580 void init_defrootdomain(void)
581 {
582     init_rootdomain(&def_root_domain);
583 
584     atomic_set(&def_root_domain.refcount, 1);
585 }
586 
587 static struct root_domain *alloc_rootdomain(void)
588 {
589     struct root_domain *rd;
590 
591     rd = kzalloc(sizeof(*rd), GFP_KERNEL);
592     if (!rd) {
593         return NULL;
594     }
595 
596     if (init_rootdomain(rd) != 0) {
597         kfree(rd);
598         return NULL;
599     }
600 
601     return rd;
602 }
603 
604 static void free_sched_groups(struct sched_group *sg, int free_sgc)
605 {
606     struct sched_group *tmp, *first;
607 
608     if (!sg) {
609         return;
610     }
611 
612     first = sg;
613     do {
614         tmp = sg->next;
615 
616         if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) {
617             kfree(sg->sgc);
618         }
619 
620         if (atomic_dec_and_test(&sg->ref)) {
621             kfree(sg);
622         }
623         sg = tmp;
624     } while (sg != first);
625 }
626 
627 static void destroy_sched_domain(struct sched_domain *sd)
628 {
629     /*
630      * A normal sched domain may have multiple group references, an
631      * overlapping domain, having private groups, only one.  Iterate,
632      * dropping group/capacity references, freeing where none remain.
633      */
634     free_sched_groups(sd->groups, 1);
635 
636     if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
637         kfree(sd->shared);
638     }
639     kfree(sd);
640 }
641 
642 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
643 {
644     struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
645 
646     while (sd) {
647         struct sched_domain *parent = sd->parent;
648         destroy_sched_domain(sd);
649         sd = parent;
650     }
651 }
652 
653 static void destroy_sched_domains(struct sched_domain *sd)
654 {
655     if (sd) {
656         call_rcu(&sd->rcu, destroy_sched_domains_rcu);
657     }
658 }
659 
660 /*
661  * Keep a special pointer to the highest sched_domain that has
662  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
663  * allows us to avoid some pointer chasing in select_idle_sibling().
664  *
665  * Also keep a unique ID per domain (we use the first CPU number in
666  * the cpumask of the domain), this allows us to quickly tell if
667  * two CPUs are in the same cache domain, see cpus_share_cache().
668  */
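
/*
 * For reference, cpus_share_cache() (kernel/sched/core.c) essentially
 * reduces to comparing the per-CPU LLC ids cached below:
 *
 *     return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 */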
669 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
670 DEFINE_PER_CPU(int, sd_llc_size);
671 DEFINE_PER_CPU(int, sd_llc_id);
672 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
673 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
674 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
675 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
676 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
677 
678 static void update_top_cache_domain(int cpu)
679 {
680     struct sched_domain_shared *sds = NULL;
681     struct sched_domain *sd;
682     int id = cpu;
683     int size = 1;
684 
685     sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
686     if (sd) {
687         id = cpumask_first(sched_domain_span(sd));
688         size = cpumask_weight(sched_domain_span(sd));
689         sds = sd->shared;
690     }
691 
692     rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
693     per_cpu(sd_llc_size, cpu) = size;
694     per_cpu(sd_llc_id, cpu) = id;
695     rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
696 
697     sd = lowest_flag_domain(cpu, SD_NUMA);
698     rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
699 
700     sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
701     rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
702 
703     sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
704     rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
705 }
706 
707 /*
708  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
709  * hold the hotplug lock.
710  */
711 static void cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
712 {
713     struct rq *rq = cpu_rq(cpu);
714     struct sched_domain *tmp;
715     int numa_distance = 0;
716 
717     /* Remove the sched domains which do not contribute to scheduling. */
718     for (tmp = sd; tmp;) {
719         struct sched_domain *parent = tmp->parent;
720         if (!parent) {
721             break;
722         }
723 
724         if (sd_parent_degenerate(tmp, parent)) {
725             tmp->parent = parent->parent;
726             if (parent->parent) {
727                 parent->parent->child = tmp;
728             }
729             /*
730              * Transfer SD_PREFER_SIBLING down in case of a
731              * degenerate parent; the spans match for this
732              * so the property transfers.
733              */
734             if (parent->flags & SD_PREFER_SIBLING) {
735                 tmp->flags |= SD_PREFER_SIBLING;
736             }
737             destroy_sched_domain(parent);
738         } else {
739             tmp = tmp->parent;
740         }
741     }
742 
743     if (sd && sd_degenerate(sd)) {
744         tmp = sd;
745         sd = sd->parent;
746         destroy_sched_domain(tmp);
747         if (sd) {
748             sd->child = NULL;
749         }
750     }
751 
752     for (tmp = sd; tmp; tmp = tmp->parent) {
753         numa_distance += !!(tmp->flags & SD_NUMA);
754     }
755 
756     sched_domain_debug(sd, cpu);
757 
758     rq_attach_root(rq, rd);
759     tmp = rq->sd;
760     rcu_assign_pointer(rq->sd, sd);
761     dirty_sched_domain_sysctl(cpu);
762     destroy_sched_domains(tmp);
763 
764     update_top_cache_domain(cpu);
765 }
766 
767 struct s_data {
768     struct sched_domain *__percpu *sd;
769     struct root_domain *rd;
770 };
771 
772 enum s_alloc {
773     sa_rootdomain,
774     sa_sd,
775     sa_sd_storage,
776     sa_none,
777 };
778 
779 /*
780  * Return the canonical balance CPU for this group, this is the first CPU
781  * of this group that's also in the balance mask.
782  *
783  * The balance mask are all those CPUs that could actually end up at this
784  * group. See build_balance_mask().
785  *
786  * Also see should_we_balance().
787  */
788 int group_balance_cpu(struct sched_group *sg)
789 {
790     return cpumask_first(group_balance_mask(sg));
791 }
792 
793 /*
794  * NUMA topology (first read the regular topology blurb below)
795  *
796  * Given a node-distance table, for example:
797  *
798  *   node   0   1   2   3
799  *     0:  10  20  30  20
800  *     1:  20  10  20  30
801  *     2:  30  20  10  20
802  *     3:  20  30  20  10
803  *
804  * which represents a 4 node ring topology like:
805  *
806  *   0 ----- 1
807  *   |       |
808  *   |       |
809  *   |       |
810  *   3 ----- 2
811  *
812  * We want to construct domains and groups to represent this. The way we go
813  * about doing this is to build the domains on 'hops'. For each NUMA level we
814  * construct the mask of all nodes reachable in @level hops.
815  *
816  * For the above NUMA topology that gives 3 levels:
817  *
818  * NUMA-2    0-3        0-3        0-3        0-3
819  *  groups:    {0-1,3},{1-3}    {0-2},{0,2-3}    {1-3},{0-1,3}    {0,2-3},{0-2}
820  *
821  * NUMA-1    0-1,3        0-2        1-3        0,2-3
822  *  groups:    {0},{1},{3}    {0},{1},{2}    {1},{2},{3}    {0},{2},{3}
823  *
824  * NUMA-0    0        1        2        3
825  *
826  *
827  * As can be seen; things don't nicely line up as with the regular topology.
828  * When we iterate a domain in child domain chunks some nodes can be
829  * represented multiple times -- hence the "overlap" naming for this part of
830  * the topology.
831  *
832  * In order to minimize this overlap, we only build enough groups to cover the
833  * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
834  *
835  * Because
836  *
837  *  - the first group of each domain is its child domain; this
838  *    gets us the first 0-1,3
839  *  - the only uncovered node is 2, whose child domain is 1-3.
840  *
841  * However, because of the overlap, computing a unique CPU for each group is
842  * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
843  * groups include the CPUs of Node-0, while those CPUs would not in fact ever
844  * end up at those groups (they would end up in group: 0-1,3).
845  *
846  * To correct this we have to introduce the group balance mask. This mask
847  * will contain those CPUs in the group that can reach this group given the
848  * (child) domain tree.
849  *
850  * With this we can once again compute balance_cpu and sched_group_capacity
851  * relations.
852  *
853  * XXX include words on how balance_cpu is unique and therefore can be
854  * used for sched_group_capacity links.
855  *
856  *
857  * Another 'interesting' topology is
858  *
859  *   node   0   1   2   3
860  *     0:  10  20  20  30
861  *     1:  20  10  20  20
862  *     2:  20  20  10  20
863  *     3:  30  20  20  10
864  *
865  * Which looks a little like
866  *
867  *   0 ----- 1
868  *   |     / |
869  *   |   /   |
870  *   | /     |
871  *   2 ----- 3
872  *
873  * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
874  * are not.
875  *
876  * This leads to a few particularly weird cases where the number of
877  * sched_domains is not the same for each CPU. Consider:
878  *
879  * NUMA-2    0-3                        0-3
880  *  groups:    {0-2},{1-3}                    {1-3},{0-2}
881  *
882  * NUMA-1    0-2        0-3        0-3        1-3
883  *
884  * NUMA-0    0        1        2        3
885  *
886  */
887 
888 /*
889  * Build the balance mask; it contains only those CPUs that can arrive at this
890  * group and should be considered to continue balancing.
891  *
892  * We do this during the group creation pass, therefore the group information
893  * isn't complete yet, however since each group represents a (child) domain we
894  * can fully construct this using the sched_domain bits (which are already
895  * complete).
896  */
897 static void build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
898 {
899     const struct cpumask *sg_span = sched_group_span(sg);
900     struct sd_data *sdd = sd->private;
901     struct sched_domain *sibling;
902     int i;
903 
904     cpumask_clear(mask);
905 
906     for_each_cpu(i, sg_span)
907     {
908         sibling = *per_cpu_ptr(sdd->sd, i);
909         /*
910          * Can happen in the asymmetric case, where these siblings are
911          * unused. The mask will not be empty because those CPUs that
912          * do have the top domain _should_ span the domain.
913          */
914         if (!sibling->child) {
915             continue;
916         }
917 
918         /* If we would not end up here, we can't continue from here */
919         if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) {
920             continue;
921         }
922 
923         cpumask_set_cpu(i, mask);
924     }
925 
926     /* We must not have empty masks here */
927     WARN_ON_ONCE(cpumask_empty(mask));
928 }
929 
930 /*
931  * XXX: This creates per-node group entries; since the load-balancer will
932  * immediately access remote memory to construct this group's load-balance
933  * statistics having the groups node local is of dubious benefit.
934  */
935 static struct sched_group *build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
936 {
937     struct sched_group *sg;
938     struct cpumask *sg_span;
939 
940     sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu));
941     if (!sg) {
942         return NULL;
943     }
944 
945     sg_span = sched_group_span(sg);
946     if (sd->child) {
947         cpumask_copy(sg_span, sched_domain_span(sd->child));
948     } else {
949         cpumask_copy(sg_span, sched_domain_span(sd));
950     }
951 
952     atomic_inc(&sg->ref);
953     return sg;
954 }
955 
956 static void init_overlap_sched_group(struct sched_domain *sd, struct sched_group *sg)
957 {
958     struct cpumask *mask = sched_domains_tmpmask2;
959     struct sd_data *sdd = sd->private;
960     struct cpumask *sg_span;
961     int cpu;
962 
963     build_balance_mask(sd, sg, mask);
964     cpu = cpumask_first_and(sched_group_span(sg), mask);
965 
966     sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
967     if (atomic_inc_return(&sg->sgc->ref) == 1) {
968         cpumask_copy(group_balance_mask(sg), mask);
969     } else {
970         WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
971     }
972 
973     /*
974      * Initialize sgc->capacity such that even if we mess up the
975      * domains and no possible iteration will get us here, we won't
976      * die on a /0 trap.
977      */
978     sg_span = sched_group_span(sg);
979     sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
980     sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
981     sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
982 }
983 
984 static struct sched_domain *find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
985 {
986     /*
987      * The proper descendant would be the one whose child won't span out
988      * of sd
989      */
990     while (sibling->child && !cpumask_subset(sched_domain_span(sibling->child), sched_domain_span(sd))) {
991         sibling = sibling->child;
992     }
993 
994     /*
995      * As we are referencing sgc across different topology levels, we need
996      * to go down to skip those sched_domains which don't contribute to
997      * scheduling because they will be degenerated in cpu_attach_domain
998      */
999     while (sibling->child && cpumask_equal(sched_domain_span(sibling->child), sched_domain_span(sibling))) {
1000         sibling = sibling->child;
1001     }
1002 
1003     return sibling;
1004 }
1005 
1006 static int build_overlap_sched_groups(struct sched_domain *sd, int cpu)
1007 {
1008     struct sched_group *first = NULL, *last = NULL, *sg;
1009     const struct cpumask *span = sched_domain_span(sd);
1010     struct cpumask *covered = sched_domains_tmpmask;
1011     struct sd_data *sdd = sd->private;
1012     struct sched_domain *sibling;
1013     int i;
1014 
1015     cpumask_clear(covered);
1016 
1017     for_each_cpu_wrap(i, span, cpu)
1018     {
1019         struct cpumask *sg_span;
1020 
1021         if (cpumask_test_cpu(i, covered)) {
1022             continue;
1023         }
1024 
1025         sibling = *per_cpu_ptr(sdd->sd, i);
1026         /*
1027          * Asymmetric node setups can result in situations where the
1028          * domain tree is of unequal depth, make sure to skip domains
1029          * that already cover the entire range.
1030          *
1031          * In that case build_sched_domains() will have terminated the
1032          * iteration early and our sibling sd spans will be empty.
1033          * Domains should always include the CPU they're built on, so
1034          * check that.
1035          */
1036         if (!cpumask_test_cpu(i, sched_domain_span(sibling))) {
1037             continue;
1038         }
1039 
1040         /*
1041          * Usually we build sched_group by sibling's child sched_domain
1042          * But for machines whose NUMA diameter is 3 or above, we move
1043          * to build sched_group by sibling's proper descendant's child
1044          * domain because sibling's child sched_domain will span out of
1045          * the sched_domain being built as below.
1046          *
1047          * Smallest diameter=3 topology is:
1048          *
1049          *   node   0   1   2   3
1050          *     0:  10  20  30  40
1051          *     1:  20  10  20  30
1052          *     2:  30  20  10  20
1053          *     3:  40  30  20  10
1054          *
1055          *   0 --- 1 --- 2 --- 3
1056          *
1057          * NUMA-3       0-3             N/A             N/A             0-3
1058          *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
1059          *
1060          * NUMA-2       0-2             0-3             0-3             1-3
1061          *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
1062          *
1063          * NUMA-1       0-1             0-2             1-3             2-3
1064          *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
1065          *
1066          * NUMA-0       0               1               2               3
1067          *
1068          * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
1069          * group span isn't a subset of the domain span.
1070          */
1071         if (sibling->child && !cpumask_subset(sched_domain_span(sibling->child), span)) {
1072             sibling = find_descended_sibling(sd, sibling);
1073         }
1074 
1075         sg = build_group_from_child_sched_domain(sibling, cpu);
1076         if (!sg) {
1077             goto fail;
1078         }
1079 
1080         sg_span = sched_group_span(sg);
1081         cpumask_or(covered, covered, sg_span);
1082 
1083         init_overlap_sched_group(sibling, sg);
1084 
1085         if (!first) {
1086             first = sg;
1087         }
1088         if (last) {
1089             last->next = sg;
1090         }
1091         last = sg;
1092         last->next = first;
1093     }
1094     sd->groups = first;
1095 
1096     return 0;
1097 
1098 fail:
1099     free_sched_groups(first, 0);
1100 
1101     return -ENOMEM;
1102 }
1103 
1104 /*
1105  * Package topology (also see the load-balance blurb in fair.c)
1106  *
1107  * The scheduler builds a tree structure to represent a number of important
1108  * topology features. By default (default_topology[]) these include:
1109  *
1110  *  - Simultaneous multithreading (SMT)
1111  *  - Multi-Core Cache (MC)
1112  *  - Package (DIE)
1113  *
1114  * Where the last one more or less denotes everything up to a NUMA node.
1115  *
1116  * The tree consists of 3 primary data structures:
1117  *
1118  *    sched_domain -> sched_group -> sched_group_capacity
1119  *        ^ ^             ^ ^
1120  *          `-'             `-'
1121  *
1122  * The sched_domains are per-CPU and have a two way link (parent & child) and
1123  * denote the ever growing mask of CPUs belonging to that level of topology.
1124  *
1125  * Each sched_domain has a circular (double) linked list of sched_group's, each
1126  * denoting the domains of the level below (or individual CPUs in case of the
1127  * first domain level). The sched_group linked by a sched_domain includes the
1128  * CPU of that sched_domain [*].
1129  *
1130  * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
1131  *
1132  * CPU   0   1   2   3   4   5   6   7
1133  *
1134  * DIE  [                             ]
1135  * MC   [             ] [             ]
1136  * SMT  [     ] [     ] [     ] [     ]
1137  *
1138  *  - or -
1139  *
1140  * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
1141  * MC    0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
1142  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
1143  *
1144  * CPU   0   1   2   3   4   5   6   7
1145  *
1146  * One way to think about it is: sched_domain moves you up and down among these
1147  * topology levels, while sched_group moves you sideways through it, at child
1148  * domain granularity.
1149  *
1150  * sched_group_capacity ensures each unique sched_group has shared storage.
1151  *
1152  * There are two related construction problems, both of which require a CPU
1153  * that uniquely identifies each group (for a given domain):
1154  *
1155  *  - The first is the balance_cpu (see should_we_balance() and the
1156  *    load-balance blurb in fair.c); for each group we only want 1 CPU to
1157  *    continue balancing at a higher domain.
1158  *
1159  *  - The second is the sched_group_capacity; we want all identical groups
1160  *    to share a single sched_group_capacity.
1161  *
1162  * Since these topologies are exclusive by construction, it is
1163  * impossible for an SMT thread to belong to multiple cores, and for cores to
1164  * be part of multiple caches. There is a very clear and unique location
1165  * for each CPU in the hierarchy.
1166  *
1167  * Therefore computing a unique CPU for each group is trivial (the iteration
1168  * mask is redundant and set to all 1s; all CPUs in a group will end up at _that_
1169  * group), so we can simply pick the first CPU in each group.
1170  *
1171  *
1172  * [*] in other words, the first group of each domain is its child domain.
1173  */
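
/*
 * Illustrative sketch only (not part of the original file): walking the
 * circular sched_group list hanging off a CPU's base domain, as described
 * above. The function name is made up and callers would need to hold
 * rcu_read_lock().
 */
static void __maybe_unused sched_group_walk_example(int cpu)
{
    struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
    struct sched_group *sg, *first;

    if (!sd) {
        return;
    }

    first = sg = sd->groups;
    do {
        /* The first group of each domain spans its child domain. */
        printk(KERN_DEBUG "CPU%d: group span=%*pbl\n", cpu, cpumask_pr_args(sched_group_span(sg)));
        sg = sg->next;
    } while (sg != first);
}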
1174 
1175 static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1176 {
1177     struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1178     struct sched_domain *child = sd->child;
1179     struct sched_group *sg;
1180     bool already_visited;
1181 
1182     if (child) {
1183         cpu = cpumask_first(sched_domain_span(child));
1184     }
1185 
1186     sg = *per_cpu_ptr(sdd->sg, cpu);
1187     sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1188 
1189     /* Increase refcounts for claim_allocations: */
1190     already_visited = atomic_inc_return(&sg->ref) > 1;
1191     /* sgc visits should follow a similar trend as sg */
1192     WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1193 
1194     /* If we have already visited that group, it's already initialized. */
1195     if (already_visited) {
1196         return sg;
1197     }
1198 
1199     if (child) {
1200         cpumask_copy(sched_group_span(sg), sched_domain_span(child));
1201         cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
1202     } else {
1203         cpumask_set_cpu(cpu, sched_group_span(sg));
1204         cpumask_set_cpu(cpu, group_balance_mask(sg));
1205     }
1206 
1207     sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
1208     sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
1209     sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
1210 
1211     return sg;
1212 }
1213 
1214 /*
1215  * build_sched_groups will build a circular linked list of the groups
1216  * covered by the given span, will set each group's ->cpumask correctly,
1217  * and will initialize their ->sgc.
1218  *
1219  * Assumes the sched_domain tree is fully constructed
1220  */
1221 static int build_sched_groups(struct sched_domain *sd, int cpu)
1222 {
1223     struct sched_group *first = NULL, *last = NULL;
1224     struct sd_data *sdd = sd->private;
1225     const struct cpumask *span = sched_domain_span(sd);
1226     struct cpumask *covered;
1227     int i;
1228 
1229     lockdep_assert_held(&sched_domains_mutex);
1230     covered = sched_domains_tmpmask;
1231 
1232     cpumask_clear(covered);
1233 
1234     for_each_cpu_wrap(i, span, cpu)
1235     {
1236         struct sched_group *sg;
1237 
1238         if (cpumask_test_cpu(i, covered)) {
1239             continue;
1240         }
1241 
1242         sg = get_group(i, sdd);
1243 
1244         cpumask_or(covered, covered, sched_group_span(sg));
1245 
1246         if (!first) {
1247             first = sg;
1248         }
1249         if (last) {
1250             last->next = sg;
1251         }
1252         last = sg;
1253     }
1254     last->next = first;
1255     sd->groups = first;
1256 
1257     return 0;
1258 }
1259 
1260 /*
1261  * Initialize sched groups cpu_capacity.
1262  *
1263  * cpu_capacity indicates the capacity of sched group, which is used while
1264  * distributing the load between different sched groups in a sched domain.
1265  * Typically cpu_capacity for all the groups in a sched domain will be same
1266  * unless there are asymmetries in the topology. If there are asymmetries,
1267  * group having more cpu_capacity will pickup more load compared to the
1268  * group having less cpu_capacity.
1269  */
1270 void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
1271 {
1272     struct sched_group *sg = sd->groups;
1273 #ifdef CONFIG_CPU_ISOLATION_OPT
1274     cpumask_t avail_mask;
1275 #endif
1276 
1277     WARN_ON(!sg);
1278 
1279     do {
1280         int cpu, max_cpu = -1;
1281 
1282 #ifdef CONFIG_CPU_ISOLATION_OPT
1283         cpumask_andnot(&avail_mask, sched_group_span(sg), cpu_isolated_mask);
1284         sg->group_weight = cpumask_weight(&avail_mask);
1285 #else
1286         sg->group_weight = cpumask_weight(sched_group_span(sg));
1287 #endif
1288 
1289         if (!(sd->flags & SD_ASYM_PACKING)) {
1290             goto next;
1291         }
1292 
1293         for_each_cpu(cpu, sched_group_span(sg))
1294         {
1295             if (max_cpu < 0) {
1296                 max_cpu = cpu;
1297             } else if (sched_asym_prefer(cpu, max_cpu)) {
1298                 max_cpu = cpu;
1299             }
1300         }
1301         sg->asym_prefer_cpu = max_cpu;
1302 
1303     next:
1304         sg = sg->next;
1305     } while (sg != sd->groups);
1306 
1307     if (cpu != group_balance_cpu(sg)) {
1308         return;
1309     }
1310 
1311     update_group_capacity(sd, cpu);
1312 }
1313 
1314 /*
1315  * Initializers for schedule domains
1316  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
1317  */
1318 
1319 static int default_relax_domain_level = -1;
1320 int sched_domain_level_max;
1321 
1322 static int __init setup_relax_domain_level(char *str)
1323 {
1324     if (kstrtoint(str, 0, &default_relax_domain_level)) {
1325         pr_warn("Unable to set relax_domain_level\n");
1326     }
1327 
1328     return 1;
1329 }
1330 __setup("relax_domain_level=", setup_relax_domain_level);
1331 
1332 static void set_domain_attribute(struct sched_domain *sd, struct sched_domain_attr *attr)
1333 {
1334     int request;
1335 
1336     if (!attr || attr->relax_domain_level < 0) {
1337         if (default_relax_domain_level < 0) {
1338             return;
1339         }
1340         request = default_relax_domain_level;
1341     } else {
1342         request = attr->relax_domain_level;
1343     }
1344 
1345     if (sd->level > request) {
1346         /* Turn off idle balance on this domain: */
1347         sd->flags &= ~(SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE);
1348     }
1349 }
1350 
1351 static void __sdt_free(const struct cpumask *cpu_map);
1352 static int __sdt_alloc(const struct cpumask *cpu_map);
1353 
1354 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map)
1355 {
1356     switch (what) {
1357         case sa_rootdomain:
1358             if (!atomic_read(&d->rd->refcount)) {
1359                 free_rootdomain(&d->rd->rcu);
1360             }
1361             fallthrough;
1362         case sa_sd:
1363             free_percpu(d->sd);
1364             fallthrough;
1365         case sa_sd_storage:
1366             __sdt_free(cpu_map);
1367             fallthrough;
1368         case sa_none:
1369             break;
1370     }
1371 }
1372 
1373 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1374 {
1375     memset(d, 0, sizeof(*d));
1376 
1377     if (__sdt_alloc(cpu_map)) {
1378         return sa_sd_storage;
1379     }
1380     d->sd = alloc_percpu(struct sched_domain *);
1381     if (!d->sd) {
1382         return sa_sd_storage;
1383     }
1384     d->rd = alloc_rootdomain();
1385     if (!d->rd) {
1386         return sa_sd;
1387     }
1388 
1389     return sa_rootdomain;
1390 }
1391 
1392 /*
1393  * NULL the sd_data elements we've used to build the sched_domain and
1394  * sched_group structure so that the subsequent __free_domain_allocs()
1395  * will not free the data we're using.
1396  */
1397 static void claim_allocations(int cpu, struct sched_domain *sd)
1398 {
1399     struct sd_data *sdd = sd->private;
1400 
1401     WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1402     *per_cpu_ptr(sdd->sd, cpu) = NULL;
1403 
1404     if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) {
1405         *per_cpu_ptr(sdd->sds, cpu) = NULL;
1406     }
1407 
1408     if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) {
1409         *per_cpu_ptr(sdd->sg, cpu) = NULL;
1410     }
1411 
1412     if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) {
1413         *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1414     }
1415 }
1416 
1417 #ifdef CONFIG_NUMA
1418 enum numa_topology_type sched_numa_topology_type;
1419 
1420 static int sched_domains_numa_levels;
1421 static int sched_domains_curr_level;
1422 
1423 int sched_max_numa_distance;
1424 static int *sched_domains_numa_distance;
1425 static struct cpumask ***sched_domains_numa_masks;
1426 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
1427 #endif
1428 
1429 /*
1430  * SD_flags allowed in topology descriptions.
1431  *
1432  * These flags are purely descriptive of the topology and do not prescribe
1433  * behaviour. Behaviour is artificial and mapped in the below sd_init()
1434  * function:
1435  *
1436  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
1437  *   SD_SHARE_PKG_RESOURCES - describes shared caches
1438  *   SD_NUMA                - describes NUMA topologies
1439  *
1440  * Odd one out, which beside describing the topology has a quirk also
1441  * prescribes the desired behaviour that goes along with it:
1442  *
1443  *   SD_ASYM_PACKING        - describes SMT quirks
1444  */
1445 #define TOPOLOGY_SD_FLAGS (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_NUMA | SD_ASYM_PACKING)
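
/*
 * For example, the stock cpu_smt_flags() helper in
 * include/linux/sched/topology.h returns
 * SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES for the SMT level, and
 * cpu_core_flags() returns SD_SHARE_PKG_RESOURCES for the MC level.
 */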
1446 
1447 static struct sched_domain *sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map,
1448                                     struct sched_domain *child, int dflags, int cpu)
1449 {
1450     struct sd_data *sdd = &tl->data;
1451     struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1452     int sd_id, sd_weight, sd_flags = 0;
1453 
1454 #ifdef CONFIG_NUMA
1455     /*
1456      * Ugly hack to pass state to sd_numa_mask()...
1457      */
1458     sched_domains_curr_level = tl->numa_level;
1459 #endif
1460 
1461     sd_weight = cpumask_weight(tl->mask(cpu));
1462 
1463     if (tl->sd_flags) {
1464         sd_flags = (*tl->sd_flags)();
1465     }
1466     if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) {
1467         sd_flags &= TOPOLOGY_SD_FLAGS;
1468     }
1469 
1470     /* Apply detected topology flags */
1471     sd_flags |= dflags;
1472 
1473     *sd = (struct sched_domain) {
1474         .min_interval = sd_weight,
1475         .max_interval = 2 * sd_weight,
1476         .busy_factor = 16,
1477         .imbalance_pct = 117,
1478 
1479         .cache_nice_tries = 0,
1480 
1481         .flags = 1 * SD_BALANCE_NEWIDLE | 1 * SD_BALANCE_EXEC | 1 * SD_BALANCE_FORK | 0 * SD_BALANCE_WAKE |
1482                  1 * SD_WAKE_AFFINE | 0 * SD_SHARE_CPUCAPACITY | 0 * SD_SHARE_PKG_RESOURCES | 0 * SD_SERIALIZE |
1483                  1 * SD_PREFER_SIBLING | 0 * SD_NUMA | sd_flags,
1484 
1485         .last_balance = jiffies,
1486         .balance_interval = sd_weight,
1487         .max_newidle_lb_cost = 0,
1488         .next_decay_max_lb_cost = jiffies,
1489         .child = child,
1490 #ifdef CONFIG_SCHED_DEBUG
1491         .name = tl->name,
1492 #endif
1493     };
1494 
1495     cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1496     sd_id = cpumask_first(sched_domain_span(sd));
1497 
1498     /*
1499      * Convert topological properties into behaviour.
1500      */
1501 
1502     /* Don't attempt to spread across CPUs of different capacities. */
1503     if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) {
1504         sd->child->flags &= ~SD_PREFER_SIBLING;
1505     }
1506 
1507     if (sd->flags & SD_SHARE_CPUCAPACITY) {
1508         sd->imbalance_pct = IMBALANCE_SD_SHARE_CPUCAPACITY;
1509     } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1510         sd->imbalance_pct = IMBALANCE_SD_SHARE_PKG;
1511         sd->cache_nice_tries = 1;
1512 
1513 #ifdef CONFIG_NUMA
1514     } else if (sd->flags & SD_NUMA) {
1515         sd->cache_nice_tries = IMBALANCE_SD_NUMA;
1516 
1517         sd->flags &= ~SD_PREFER_SIBLING;
1518         sd->flags |= SD_SERIALIZE;
1519         if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1520             sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE);
1521         }
1522 
1523 #endif
1524     } else {
1525         sd->cache_nice_tries = 1;
1526     }
1527 
1528     /*
1529      * For all levels sharing cache; connect a sched_domain_shared
1530      * instance.
1531      */
1532     if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1533         sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1534         atomic_inc(&sd->shared->ref);
1535         atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1536     }
1537 
1538     sd->private = sdd;
1539 
1540     return sd;
1541 }
1542 
1543 /*
1544  * Topology list, bottom-up.
1545  */
1546 static struct sched_domain_topology_level default_topology[] = {
1547 #ifdef CONFIG_SCHED_SMT
1548     {cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)},
1549 #endif
1550 #ifdef CONFIG_SCHED_MC
1551     {cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC)},
1552 #endif
1553     {cpu_cpu_mask, SD_INIT_NAME(DIE)},
1554     {
1555         NULL,
1556     },
1557 };
1558 
1559 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
1560 
1561 #define for_each_sd_topology(tl) for (tl = sched_domain_topology; (tl)->mask; (tl)++)
1562 
1563 void set_sched_topology(struct sched_domain_topology_level *tl)
1564 {
1565     if (WARN_ON_ONCE(sched_smp_initialized)) {
1566         return;
1567     }
1568 
1569     sched_domain_topology = tl;
1570 }
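
/*
 * Illustrative sketch only (not part of the original file): architecture
 * code overrides default_topology[] by passing its own bottom-up table
 * before SMP bring-up, roughly:
 *
 *     static struct sched_domain_topology_level arch_topology[] = {
 *         {cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC)},
 *         {cpu_cpu_mask, SD_INIT_NAME(DIE)},
 *         {
 *             NULL,
 *         },
 *     };
 *
 *     set_sched_topology(arch_topology);
 *
 * "arch_topology" is a made-up name here; see the existing callers of
 * set_sched_topology() in arch code (e.g. x86's smpboot.c) for real users.
 */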
1571 
1572 #ifdef CONFIG_NUMA
1573 
1574 static const struct cpumask *sd_numa_mask(int cpu)
1575 {
1576     return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1577 }
1578 
1579 static void sched_numa_warn(const char *str)
1580 {
1581     static int done = false;
1582     int i, j;
1583 
1584     if (done) {
1585         return;
1586     }
1587 
1588     done = true;
1589 
1590     printk(KERN_WARNING "ERROR: %s\n\n", str);
1591 
1592     for (i = 0; i < nr_node_ids; i++) {
1593         printk(KERN_WARNING "  ");
1594         for (j = 0; j < nr_node_ids; j++) {
1595             printk(KERN_CONT "%02d ", node_distance(i, j));
1596         }
1597         printk(KERN_CONT "\n");
1598     }
1599     printk(KERN_WARNING "\n");
1600 }
1601 
1602 bool find_numa_distance(int distance)
1603 {
1604     int i;
1605 
1606     if (distance == node_distance(0, 0)) {
1607         return true;
1608     }
1609 
1610     for (i = 0; i < sched_domains_numa_levels; i++) {
1611         if (sched_domains_numa_distance[i] == distance) {
1612             return true;
1613         }
1614     }
1615 
1616     return false;
1617 }
1618 
1619 /*
1620  * A system can have three types of NUMA topology:
1621  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1622  * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1623  * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1624  *
1625  * The difference between a glueless mesh topology and a backplane
1626  * topology lies in whether communication between not directly
1627  * connected nodes goes through intermediary nodes (where programs
1628  * could run), or through backplane controllers. This affects
1629  * placement of programs.
1630  *
1631  * The type of topology can be discerned with the following tests:
1632  * - If the maximum distance between any nodes is 1 hop, the system
1633  *   is directly connected.
1634  * - If for two nodes A and B, located N > 1 hops away from each other,
1635  *   there is an intermediary node C, which is < N hops away from both
1636  *   nodes A and B, the system is a glueless mesh.
1637  */
1638 static void init_numa_topology_type(void)
1639 {
1640     int a, b, c, n;
1641 
1642     n = sched_max_numa_distance;
1643 
1644     if (sched_domains_numa_levels <= IMBALANCE_SD_NUMA_DIRECT) {
1645         sched_numa_topology_type = NUMA_DIRECT;
1646         return;
1647     }
1648 
1649     for_each_online_node(a)
1650     {
1651         for_each_online_node(b)
1652         {
1653             /* Find two nodes furthest removed from each other. */
1654             if (node_distance(a, b) < n) {
1655                 continue;
1656             }
1657 
1658             /* Is there an intermediary node between a and b? */
1659             for_each_online_node(c)
1660             {
1661                 if (node_distance(a, c) < n && node_distance(b, c) < n) {
1662                     sched_numa_topology_type = NUMA_GLUELESS_MESH;
1663                     return;
1664                 }
1665             }
1666 
1667             sched_numa_topology_type = NUMA_BACKPLANE;
1668             return;
1669         }
1670     }
1671 }
1672 
1673 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
1674 
void sched_init_numa(void)
1676 {
1677     struct sched_domain_topology_level *tl;
1678     unsigned long *distance_map;
1679     int nr_levels = 0;
1680     int i, j;
1681 
    /*
     * O(nr_nodes^2) pass over the node_distance() table, using a bitmap to
     * deduplicate the entries and find the unique distance values.
     */
1686     distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
1687     if (!distance_map) {
1688         return;
1689     }
1690 
1691     bitmap_zero(distance_map, NR_DISTANCE_VALUES);
1692     for (i = 0; i < nr_node_ids; i++) {
1693         for (j = 0; j < nr_node_ids; j++) {
1694             int distance = node_distance(i, j);
            if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
                sched_numa_warn("Invalid distance value range");
                bitmap_free(distance_map);
                return;
            }
1699 
1700             bitmap_set(distance_map, distance, 1);
1701         }
1702     }
1703     /*
1704      * We can now figure out how many unique distance values there are and
1705      * allocate memory accordingly.
1706      */
1707     nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
1708 
1709     sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
1710     if (!sched_domains_numa_distance) {
1711         bitmap_free(distance_map);
1712         return;
1713     }
1714 
1715     for (i = 0, j = 0; i < nr_levels; i++, j++) {
1716         j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
1717         sched_domains_numa_distance[i] = j;
1718     }
1719 
1720     bitmap_free(distance_map);
1721 
1722     /*
1723      * 'nr_levels' contains the number of unique distances
1724      *
1725      * The sched_domains_numa_distance[] array includes the actual distance
1726      * numbers.
1727      */
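
    /*
     * Worked example (hypothetical): on a machine whose distance table only
     * contains the values 10, 16 and 22, exactly those three bits end up set
     * in distance_map, so:
     *
     *     nr_levels = 3;
     *     sched_domains_numa_distance[] = { 10, 16, 22 };
     */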
1728 
    /*
     * Temporarily reset sched_domains_numa_levels to 0 here: if the
     * allocation of sched_domains_numa_masks[][] below fails, that array
     * will contain fewer than 'nr_levels' members, which would be dangerous
     * for any code that iterates it based on sched_domains_numa_levels.
     *
     * It is set back to 'nr_levels' at the end of this function.
     */
1738     sched_domains_numa_levels = 0;
1739 
1740     sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
1741     if (!sched_domains_numa_masks) {
1742         return;
1743     }
1744 
    /*
     * Now for each level, construct a mask per node which contains all
     * CPUs of nodes that are within that distance of us.
     */
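    /*
     * Continuing the hypothetical { 10, 16, 22 } example: node 0's mask at
     * level 0 covers only node 0's own CPUs, at level 1 it additionally
     * covers every node within distance 16, and at the last level it covers
     * all nodes.  Each level's mask is a superset of the previous one.
     */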
1749     for (i = 0; i < nr_levels; i++) {
1750         sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1751         if (!sched_domains_numa_masks[i]) {
1752             return;
1753         }
1754 
1755         for (j = 0; j < nr_node_ids; j++) {
1756             struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1757             int k;
1758 
1759             if (!mask) {
1760                 return;
1761             }
1762 
1763             sched_domains_numa_masks[i][j] = mask;
1764 
1765             for_each_node(k)
1766             {
1767                 if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) {
1768                     sched_numa_warn("Node-distance not symmetric");
1769                 }
1770 
1771                 if (node_distance(j, k) > sched_domains_numa_distance[i]) {
1772                     continue;
1773                 }
1774 
1775                 cpumask_or(mask, mask, cpumask_of_node(k));
1776             }
1777         }
1778     }
1779 
1780     /* Compute default topology size */
1781     for (i = 0; sched_domain_topology[i].mask; i++) {
1782         ;
1783     }
1784 
1785     tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1786     if (!tl) {
1787         return;
1788     }
1789 
1790     /*
1791      * Copy the default topology bits..
1792      */
1793     for (i = 0; sched_domain_topology[i].mask; i++) {
1794         tl[i] = sched_domain_topology[i];
1795     }
1796 
1797     /*
1798      * Add the NUMA identity distance, aka single NODE.
1799      */
1800     tl[i++] = (struct sched_domain_topology_level) {.mask = sd_numa_mask, .numa_level = 0, SD_INIT_NAME(NODE)};
1801 
1802     /*
1803      * .. and append 'j' levels of NUMA goodness.
1804      */
1805     for (j = 1; j < nr_levels; i++, j++) {
1806         tl[i] = (struct sched_domain_topology_level) {
1807             .mask = sd_numa_mask,
1808             .sd_flags = cpu_numa_flags,
1809             .flags = SDTL_OVERLAP,
1810             .numa_level = j,
1811             SD_INIT_NAME(NUMA)};
1812     }
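
    /*
     * For the hypothetical three-distance example on top of the default
     * SMT/MC/DIE topology, the resulting table would look roughly like:
     *
     *     SMT, MC, DIE, NODE, NUMA(level 1), NUMA(level 2), NULL
     *
     * i.e. the identity NODE level plus one overlapping NUMA level for each
     * remaining unique distance.
     */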
1813 
1814     sched_domain_topology = tl;
1815 
1816     sched_domains_numa_levels = nr_levels;
1817     sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
1818 
1819     init_numa_topology_type();
1820 }
1821 
void sched_domains_numa_masks_set(unsigned int cpu)
1823 {
1824     int node = cpu_to_node(cpu);
1825     int i, j;
1826 
1827     for (i = 0; i < sched_domains_numa_levels; i++) {
1828         for (j = 0; j < nr_node_ids; j++) {
1829             if (node_distance(j, node) <= sched_domains_numa_distance[i]) {
1830                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1831             }
1832         }
1833     }
1834 }
1835 
void sched_domains_numa_masks_clear(unsigned int cpu)
1837 {
1838     int i, j;
1839 
1840     for (i = 0; i < sched_domains_numa_levels; i++) {
1841         for (j = 0; j < nr_node_ids; j++) {
1842             cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1843         }
1844     }
1845 }
1846 
/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 *                             closest to @cpu from @cpus.
 * @cpus: cpumask to pick a cpu from
 * @cpu: cpu to be close to
 *
 * Returns: the chosen cpu, or nr_cpu_ids when nothing suitable is found.
 */
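/*
 * Example walk (hypothetical): with @cpu on node 0 of a two-node machine
 * and @cpus containing only CPUs of node 1, the level-0 (local) mask for
 * node 0 yields no match, but the next level's mask also spans node 1, so
 * one of node 1's CPUs is returned.  With an empty @cpus, every level
 * misses and nr_cpu_ids is returned.
 */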
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1856 {
1857     int i, j = cpu_to_node(cpu);
1858 
1859     for (i = 0; i < sched_domains_numa_levels; i++) {
1860         cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1861         if (cpu < nr_cpu_ids) {
1862             return cpu;
1863         }
1864     }
1865     return nr_cpu_ids;
1866 }
1867 
1868 #endif /* CONFIG_NUMA */
1869 
static int __sdt_alloc(const struct cpumask *cpu_map)
1871 {
1872     struct sched_domain_topology_level *tl;
1873     int j;
1874 
1875     for (tl = sched_domain_topology; (tl)->mask; (tl)++) {
1876         struct sd_data *sdd = &tl->data;
1877 
1878         sdd->sd = alloc_percpu(struct sched_domain *);
1879         if (!sdd->sd) {
1880             return -ENOMEM;
1881         }
1882 
1883         sdd->sds = alloc_percpu(struct sched_domain_shared *);
1884         if (!sdd->sds) {
1885             return -ENOMEM;
1886         }
1887 
1888         sdd->sg = alloc_percpu(struct sched_group *);
1889         if (!sdd->sg) {
1890             return -ENOMEM;
1891         }
1892 
1893         sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1894         if (!sdd->sgc) {
1895             return -ENOMEM;
1896         }
1897 
1898         for_each_cpu(j, cpu_map)
1899         {
1900             struct sched_domain *sd;
1901             struct sched_domain_shared *sds;
1902             struct sched_group *sg;
1903             struct sched_group_capacity *sgc;
1904 
1905             sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1906             if (!sd) {
1907                 return -ENOMEM;
1908             }
1909 
1910             *per_cpu_ptr(sdd->sd, j) = sd;
1911 
1912             sds = kzalloc_node(sizeof(struct sched_domain_shared), GFP_KERNEL, cpu_to_node(j));
1913             if (!sds) {
1914                 return -ENOMEM;
1915             }
1916 
1917             *per_cpu_ptr(sdd->sds, j) = sds;
1918 
1919             sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1920             if (!sg) {
1921                 return -ENOMEM;
1922             }
1923 
1924             sg->next = sg;
1925 
1926             *per_cpu_ptr(sdd->sg, j) = sg;
1927 
1928             sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1929             if (!sgc) {
1930                 return -ENOMEM;
1931             }
1932 
1933 #ifdef CONFIG_SCHED_DEBUG
1934             sgc->id = j;
1935 #endif
1936 
1937             *per_cpu_ptr(sdd->sgc, j) = sgc;
1938         }
1939     }
1940 
1941     return 0;
1942 }
1943 
static void __sdt_free(const struct cpumask *cpu_map)
1945 {
1946     struct sched_domain_topology_level *tl;
1947     int j;
1948 
1949     for (tl = sched_domain_topology; (tl)->mask; (tl)++) {
1950         struct sd_data *sdd = &tl->data;
1951 
1952         for_each_cpu(j, cpu_map) {
1953             struct sched_domain *sd;
1954 
1955             if (sdd->sd) {
1956                 sd = *per_cpu_ptr(sdd->sd, j);
1957                 if (sd && (sd->flags & SD_OVERLAP)) {
1958                     free_sched_groups(sd->groups, 0);
1959                 }
1960                 kfree(*per_cpu_ptr(sdd->sd, j));
1961             }
1962 
1963             if (sdd->sds) {
1964                 kfree(*per_cpu_ptr(sdd->sds, j));
1965             }
1966             if (sdd->sg) {
1967                 kfree(*per_cpu_ptr(sdd->sg, j));
1968             }
1969             if (sdd->sgc) {
1970                 kfree(*per_cpu_ptr(sdd->sgc, j));
1971             }
1972         }
1973         free_percpu(sdd->sd);
1974         sdd->sd = NULL;
1975         free_percpu(sdd->sds);
1976         sdd->sds = NULL;
1977         free_percpu(sdd->sg);
1978         sdd->sg = NULL;
1979         free_percpu(sdd->sgc);
1980         sdd->sgc = NULL;
1981     }
1982 }
1983 
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map,
1985                                                struct sched_domain_attr *attr, struct sched_domain *child, int dflags,
1986                                                int cpu)
1987 {
1988     struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
1989 
1990     if (child) {
1991         sd->level = child->level + 1;
1992         sched_domain_level_max = max(sched_domain_level_max, sd->level);
1993         child->parent = sd;
1994 
1995         if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
1996             pr_err("BUG: arch topology borken\n");
1997 #ifdef CONFIG_SCHED_DEBUG
1998             pr_err("     the %s domain not a subset of the %s domain\n", child->name, sd->name);
1999 #endif
2000             /* Fixup, ensure @sd has at least @child CPUs. */
2001             cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child));
2002         }
2003     }
2004     set_domain_attribute(sd, attr);
2005 
2006     return sd;
2007 }
2008 
2009 /*
2010  * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
2011  * any two given CPUs at this (non-NUMA) topology level.
2012  */
static bool topology_span_sane(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, int cpu)
2014 {
2015     int i;
2016 
2017     /* NUMA levels are allowed to overlap */
2018     if (tl->flags & SDTL_OVERLAP) {
2019         return true;
2020     }
2021 
2022     /*
2023      * Non-NUMA levels cannot partially overlap - they must be either
2024      * completely equal or completely disjoint. Otherwise we can end up
2025      * breaking the sched_group lists - i.e. a later get_group() pass
2026      * breaks the linking done for an earlier span.
2027      */
2028     for_each_cpu(i, cpu_map)
2029     {
2030         if (i == cpu) {
2031             continue;
2032         }
2033         /*
2034          * We should 'and' all those masks with 'cpu_map' to exactly
2035          * match the topology we're about to build, but that can only
2036          * remove CPUs, which only lessens our ability to detect
2037          * overlaps
2038          */
2039         if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && cpumask_intersects(tl->mask(cpu), tl->mask(i))) {
2040             return false;
2041         }
2042     }
2043 
2044     return true;
2045 }
2046 
2047 /*
2048  * Find the sched_domain_topology_level where all CPU capacities are visible
2049  * for all CPUs.
2050  */
static struct sched_domain_topology_level *asym_cpu_capacity_level(const struct cpumask *cpu_map)
2052 {
2053     int i, j, asym_level = 0;
2054     bool asym = false;
2055     struct sched_domain_topology_level *tl, *asym_tl = NULL;
2056     unsigned long cap;
2057 
2058     /* Is there any asymmetry? */
2059     cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
2060 
2061     for_each_cpu(i, cpu_map)
2062     {
2063         if (arch_scale_cpu_capacity(i) != cap) {
2064             asym = true;
2065             break;
2066         }
2067     }
2068 
2069     if (!asym) {
2070         return NULL;
2071     }
2072 
    /*
     * Examine the topology from each CPU's point of view to detect the
     * lowest sched_domain_topology_level at which the highest capacity
     * CPU is visible to everyone.
     */
2078     for_each_cpu(i, cpu_map)
2079     {
2080         unsigned long max_capacity = arch_scale_cpu_capacity(i);
2081         int tl_id = 0;
2082 
2083         for (tl = sched_domain_topology; (tl)->mask; (tl)++) {
2084             if (tl_id < asym_level) {
2085                 goto next_level;
2086             }
2087 
2088             for_each_cpu_and(j, tl->mask(i), cpu_map) {
2089                 unsigned long capacity;
2090 
2091                 capacity = arch_scale_cpu_capacity(j);
2092                 if (capacity <= max_capacity) {
2093                     continue;
2094                 }
2095 
2096                 max_capacity = capacity;
2097                 asym_level = tl_id;
2098                 asym_tl = tl;
2099             }
2100         next_level:
2101             tl_id++;
2102         }
2103     }
2104 
2105     return asym_tl;
2106 }
2107 
2108 /*
2109  * Build sched domains for a given set of CPUs and attach the sched domains
2110  * to the individual CPUs
2111  */
static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2113 {
2114     enum s_alloc alloc_state = sa_none;
2115     struct sched_domain *sd;
2116     struct s_data d;
2117     struct rq *rq = NULL;
2118     int i, ret = -ENOMEM;
2119     struct sched_domain_topology_level *tl_asym;
2120     bool has_asym = false;
2121 
2122     if (WARN_ON(cpumask_empty(cpu_map))) {
2123         goto error;
2124     }
2125 
2126     alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2127     if (alloc_state != sa_rootdomain) {
2128         goto error;
2129     }
2130 
2131     tl_asym = asym_cpu_capacity_level(cpu_map);
2132 
2133     /* Set up domains for CPUs specified by the cpu_map: */
2134     for_each_cpu(i, cpu_map)
2135     {
2136         struct sched_domain_topology_level *tl;
2137         int dflags = 0;
2138 
2139         sd = NULL;
2140         for (tl = sched_domain_topology; (tl)->mask; (tl)++) {
2141             if (tl == tl_asym) {
2142                 dflags |= SD_ASYM_CPUCAPACITY;
2143                 has_asym = true;
2144             }
2145 
2146             if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) {
2147                 goto error;
2148             }
2149 
2150             sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
2151 
2152             if (tl == sched_domain_topology) {
2153                 *per_cpu_ptr(d.sd, i) = sd;
2154             }
2155             if (tl->flags & SDTL_OVERLAP) {
2156                 sd->flags |= SD_OVERLAP;
2157             }
2158             if (cpumask_equal(cpu_map, sched_domain_span(sd))) {
2159                 break;
2160             }
2161         }
2162     }
2163 
2164     /* Build the groups for the domains */
2165     for_each_cpu(i, cpu_map)
2166     {
2167         for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2168             sd->span_weight = cpumask_weight(sched_domain_span(sd));
2169             if (sd->flags & SD_OVERLAP) {
2170                 if (build_overlap_sched_groups(sd, i)) {
2171                     goto error;
2172                 }
2173             } else {
2174                 if (build_sched_groups(sd, i)) {
2175                     goto error;
2176                 }
2177             }
2178         }
2179     }
2180 
2181     /* Calculate CPU capacity for physical packages and nodes */
2182     for (i = nr_cpumask_bits - 1; i >= 0; i--) {
2183         if (!cpumask_test_cpu(i, cpu_map)) {
2184             continue;
2185         }
2186 
2187         for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2188             claim_allocations(i, sd);
2189             init_sched_groups_capacity(i, sd);
2190         }
2191     }
2192 
2193     /* Attach the domains */
2194     rcu_read_lock();
2195     for_each_cpu(i, cpu_map)
2196     {
2197 #ifdef CONFIG_SCHED_RT_CAS
2198         int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
2199 #endif
2200 
2201         rq = cpu_rq(i);
2202         sd = *per_cpu_ptr(d.sd, i);
2203 
2204 #ifdef CONFIG_SCHED_RT_CAS
2205         if (max_cpu < 0 || arch_scale_cpu_capacity(i) > arch_scale_cpu_capacity(max_cpu)) {
2206             WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
2207         }
2208 #endif
2209 
2210         /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
2211         if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) {
2212             WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2213         }
2214 
2215         cpu_attach_domain(sd, d.rd, i);
2216     }
2217     rcu_read_unlock();
2218 
2219     if (has_asym) {
2220         static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
2221     }
2222 
2223     if (rq && sched_debug_enabled) {
2224         pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map),
2225                 rq->rd->max_cpu_capacity);
2226     }
2227 
2228     ret = 0;
2229 error:
2230     __free_domain_allocs(&d, alloc_state, cpu_map);
2231 
2232     return ret;
2233 }
2234 
2235 /* Current sched domains: */
2236 static cpumask_var_t *doms_cur;
2237 
2238 /* Number of sched domains in 'doms_cur': */
2239 static int ndoms_cur;
2240 
/* Attributes of custom domains in 'doms_cur' */
2242 static struct sched_domain_attr *dattr_cur;
2243 
2244 /*
2245  * Special case: If a kmalloc() of a doms_cur partition (array of
2246  * cpumask) fails, then fallback to a single sched domain,
2247  * as determined by the single cpumask fallback_doms.
2248  */
2249 static cpumask_var_t fallback_doms;
2250 
2251 /*
2252  * arch_update_cpu_topology lets virtualized architectures update the
2253  * CPU core maps. It is supposed to return 1 if the topology changed
2254  * or 0 if it stayed the same.
2255  */
int __weak arch_update_cpu_topology(void)
2257 {
2258     return 0;
2259 }
2260 
cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2262 {
2263     int i;
2264     cpumask_var_t *doms;
2265 
2266     doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2267     if (!doms) {
2268         return NULL;
2269     }
2270     for (i = 0; i < ndoms; i++) {
2271         if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2272             free_sched_domains(doms, i);
2273             return NULL;
2274         }
2275     }
2276     return doms;
2277 }
2278 
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2280 {
2281     unsigned int i;
2282     for (i = 0; i < ndoms; i++) {
2283         free_cpumask_var(doms[i]);
2284     }
2285     kfree(doms);
2286 }
2287 
2288 /*
2289  * Set up scheduler domains and groups.  For now this just excludes isolated
2290  * CPUs, but could be used to exclude other special cases in the future.
2291  */
int sched_init_domains(const struct cpumask *cpu_map)
2293 {
2294     int err;
2295 
2296     zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
2297     zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2298     zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2299 
2300     arch_update_cpu_topology();
2301     ndoms_cur = 1;
2302     doms_cur = alloc_sched_domains(ndoms_cur);
2303     if (!doms_cur) {
2304         doms_cur = &fallback_doms;
2305     }
2306     cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
2307     err = build_sched_domains(doms_cur[0], NULL);
2308     register_sched_domain_sysctl();
2309 
2310     return err;
2311 }
2312 
/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
2318 {
2319     unsigned int cpu = cpumask_any(cpu_map);
2320     int i;
2321 
2322     if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) {
2323         static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2324     }
2325 
2326     rcu_read_lock();
2327     for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i);
2328     rcu_read_unlock();
2329 }
2330 
2331 /* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, struct sched_domain_attr *new, int idx_new)
2333 {
2334     struct sched_domain_attr tmp;
2335 
2336     /* Fast path: */
2337     if (!new && !cur) {
2338         return 1;
2339     }
2340 
2341     tmp = SD_ATTR_INIT;
2342 
2343     return !memcmp(cur ? (cur + idx_cur) : &tmp, new ? (new + idx_new) : &tmp, sizeof(struct sched_domain_attr));
2344 }
2345 
/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[]. This compares doms_new[] to
 * the current sched domain partitioning, doms_cur[]. It destroys
 * each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap). We should set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; this also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock and sched_domains_mutex held.
 */
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new)
2373 {
2374     bool __maybe_unused has_eas = false;
2375     int i, j, n;
2376     int new_topology;
2377 
2378     lockdep_assert_held(&sched_domains_mutex);
2379 
2380     /* Always unregister in case we don't destroy any domains: */
2381     unregister_sched_domain_sysctl();
2382 
2383     /* Let the architecture update CPU core mappings: */
2384     new_topology = arch_update_cpu_topology();
2385 
2386     if (!doms_new) {
2387         WARN_ON_ONCE(dattr_new);
2388         n = 0;
2389         doms_new = alloc_sched_domains(1);
2390         if (doms_new) {
2391             n = 1;
2392             cpumask_and(doms_new[0], cpu_active_mask, housekeeping_cpumask(HK_FLAG_DOMAIN));
2393         }
2394     } else {
2395         n = ndoms_new;
2396     }
2397 
2398     /* Destroy deleted domains: */
2399     for (i = 0; i < ndoms_cur; i++) {
2400         for (j = 0; j < n && !new_topology; j++) {
2401             if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) {
2402                 struct root_domain *rd;
2403 
2404                 /*
2405                  * This domain won't be destroyed and as such
2406                  * its dl_bw->total_bw needs to be cleared.  It
2407                  * will be recomputed in function
2408                  * update_tasks_root_domain().
2409                  */
2410                 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2411                 dl_clear_root_domain(rd);
2412                 goto match1;
2413             }
2414         }
2415         /* No match - a current sched domain not in new doms_new[] */
2416         detach_destroy_domains(doms_cur[i]);
2417     match1:;
2418     }
2419 
2420     n = ndoms_cur;
2421     if (!doms_new) {
2422         n = 0;
2423         doms_new = &fallback_doms;
2424         cpumask_and(doms_new[0], cpu_active_mask, housekeeping_cpumask(HK_FLAG_DOMAIN));
2425     }
2426 
2427     /* Build new domains: */
2428     for (i = 0; i < ndoms_new; i++) {
2429         for (j = 0; j < n && !new_topology; j++) {
2430             if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) {
2431                 goto match2;
2432             }
2433         }
2434         /* No match - add a new doms_new */
2435         build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2436     match2:;
2437     }
2438 
2439 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2440     /* Build perf. domains: */
2441     for (i = 0; i < ndoms_new; i++) {
2442         for (j = 0; j < n && !sched_energy_update; j++) {
2443             if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2444                 has_eas = true;
2445                 goto match3;
2446             }
2447         }
2448         /* No match - add perf. domains for a new rd */
2449         has_eas |= build_perf_domains(doms_new[i]);
2450     match3:;
2451     }
2452     sched_energy_set(has_eas);
2453 #endif
2454 
2455     /* Remember the new sched domains: */
2456     if (doms_cur != &fallback_doms) {
2457         free_sched_domains(doms_cur, ndoms_cur);
2458     }
2459 
2460     kfree(dattr_cur);
2461     doms_cur = doms_new;
2462     dattr_cur = dattr_new;
2463     ndoms_cur = ndoms_new;
2464 
2465     register_sched_domain_sysctl();
2466 }
2467 
2468 /*
2469  * Call with hotplug lock held
2470  */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new)
2472 {
2473     mutex_lock(&sched_domains_mutex);
2474     partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2475     mutex_unlock(&sched_domains_mutex);
2476 }
2477