1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Scheduler topology setup/handling methods
4 */
5#include "sched.h"
6
7DEFINE_MUTEX(sched_domains_mutex);
8#ifdef CONFIG_LOCKDEP
9EXPORT_SYMBOL_GPL(sched_domains_mutex);
10#endif
11
12/* Protected by sched_domains_mutex: */
13static cpumask_var_t sched_domains_tmpmask;
14static cpumask_var_t sched_domains_tmpmask2;
15
16#define IMBALANCE_SD_SHARE_CPUCAPACITY 110
17#define IMBALANCE_SD_SHARE_PKG 117
18#define IMBALANCE_SD_NUMA 2
19#define IMBALANCE_SD_NUMA_DIRECT 2
20
21#ifdef CONFIG_SCHED_DEBUG
22
23static int __init sched_debug_setup(char *str)
24{
25    sched_debug_enabled = true;
26
27    return 0;
28}
29early_param("sched_debug", sched_debug_setup);
30
31static inline bool sched_debug(void)
32{
33    return sched_debug_enabled;
34}
35
36#define SD_FLAG(_name, mflags) [__##_name] = {.meta_flags = mflags, .name = #_name},
37const struct sd_flag_debug sd_flag_debug[] = {
38#include <linux/sched/sd_flags.h>
39};
40#undef SD_FLAG
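
/*
 * Illustrative expansion (not part of the original source): each SD_FLAG()
 * entry in <linux/sched/sd_flags.h> turns into one sd_flag_debug[] slot.
 * Assuming an entry of the form SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD),
 * the table above would contain roughly:
 *
 *   [__SD_BALANCE_NEWIDLE] = {
 *           .meta_flags = SDF_SHARED_CHILD,
 *           .name       = "SD_BALANCE_NEWIDLE",
 *   },
 *
 * which is what sched_domain_debug_one() below indexes by bit number to
 * print flag names and check the SDF_SHARED_* parent/child invariants.
 */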
41
42static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask)
43{
44    struct sched_group *group = sd->groups;
45    unsigned long flags = sd->flags;
46    unsigned int idx;
47
48    cpumask_clear(groupmask);
49
50    printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
51    printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name);
52
53    if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
54        printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
55    }
56    if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
57        printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58    }
59
60    for_each_set_bit(idx, &flags, __SD_FLAG_CNT)
61    {
62        unsigned int flag = BIT(idx);
63        unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
64
65        if ((meta_flags & SDF_SHARED_CHILD) && sd->child && !(sd->child->flags & flag)) {
66            printk(KERN_ERR "ERROR: flag %s set here but not in child\n", sd_flag_debug[idx].name);
67        }
68
69        if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && !(sd->parent->flags & flag)) {
70            printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", sd_flag_debug[idx].name);
71        }
72    }
73
74    printk(KERN_DEBUG "%*s groups:", level + 1, "");
75    do {
76        if (!group) {
77            printk("\n");
78            printk(KERN_ERR "ERROR: group is NULL\n");
79            break;
80        }
81
82        if (!cpumask_weight(sched_group_span(group))) {
83            printk(KERN_CONT "\n");
84            printk(KERN_ERR "ERROR: empty group\n");
85            break;
86        }
87
88        if (!(sd->flags & SD_OVERLAP) && cpumask_intersects(groupmask, sched_group_span(group))) {
89            printk(KERN_CONT "\n");
90            printk(KERN_ERR "ERROR: repeated CPUs\n");
91            break;
92        }
93
94        cpumask_or(groupmask, groupmask, sched_group_span(group));
95
96        printk(KERN_CONT " %d:{ span=%*pbl", group->sgc->id, cpumask_pr_args(sched_group_span(group)));
97
98        if ((sd->flags & SD_OVERLAP) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
99            printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group)));
100        }
101
102        if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
103            printk(KERN_CONT " cap=%lu", group->sgc->capacity);
104        }
105
106        if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), sched_group_span(group))) {
107            printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
108        }
109
110        printk(KERN_CONT " }");
111
112        group = group->next;
113
114        if (group != sd->groups) {
115            printk(KERN_CONT ",");
116        }
117    } while (group != sd->groups);
118    printk(KERN_CONT "\n");
119
120    if (!cpumask_equal(sched_domain_span(sd), groupmask)) {
121        printk(KERN_ERR "ERROR: groups don't span domain->span\n");
122    }
123
124    if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) {
125        printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
126    }
127    return 0;
128}
129
130static void sched_domain_debug(struct sched_domain *sd, int cpu)
131{
132    int level = 0;
133
134    if (!sched_debug_enabled) {
135        return;
136    }
137
138    if (!sd) {
139        printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
140        return;
141    }
142
143    printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
144
145    for (;;) {
146        if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) {
147            break;
148        }
149        level++;
150        sd = sd->parent;
151        if (!sd) {
152            break;
153        }
154    }
155}
156#else /* !CONFIG_SCHED_DEBUG */
157
158#define sched_debug_enabled 0
159#define sched_domain_debug(sd, cpu)                                                                                    \
160    do {                                                                                                               \
161    } while (0)
162static inline bool sched_debug(void)
163{
164    return false;
165}
166#endif /* CONFIG_SCHED_DEBUG */
167
168/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
169#define SD_FLAG(name, mflags) ((name) * !!((mflags)&SDF_NEEDS_GROUPS)) |
170static const unsigned int SD_DEGENERATE_GROUPS_MASK =
171#include <linux/sched/sd_flags.h>
172    0;
173#undef SD_FLAG
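
/*
 * Illustrative expansion (not from the original source): SD_FLAG() above
 * maps every entry of <linux/sched/sd_flags.h> to either its flag value
 * (when the entry carries SDF_NEEDS_GROUPS) or to 0, all OR'ed together and
 * terminated by the trailing "0;". Schematically:
 *
 *   static const unsigned int SD_DEGENERATE_GROUPS_MASK =
 *           (SD_FLAG_A * !!(mflags_A & SDF_NEEDS_GROUPS)) |
 *           (SD_FLAG_B * !!(mflags_B & SDF_NEEDS_GROUPS)) |
 *           ...
 *           0;
 *
 * so sd_degenerate()/sd_parent_degenerate() can test "does this domain set
 * any flag that only makes sense with more than one group" with a single AND.
 */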
174
175static int sd_degenerate(struct sched_domain *sd)
176{
177    if (cpumask_weight(sched_domain_span(sd)) == 1) {
178        return 1;
179    }
180
181    /* Following flags need at least 2 groups */
182    if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && (sd->groups != sd->groups->next)) {
183        return 0;
184    }
185
186    /* Following flags don't use groups */
187    if (sd->flags & (SD_WAKE_AFFINE)) {
188        return 0;
189    }
190
191    return 1;
192}
193
194static int sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
195{
196    unsigned long cflags = sd->flags, pflags = parent->flags;
197
198    if (sd_degenerate(parent)) {
199        return 1;
200    }
201
202    if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) {
203        return 0;
204    }
205
206    /* Flags needing groups don't count if only 1 group in parent */
207    if (parent->groups == parent->groups->next) {
208        pflags &= ~SD_DEGENERATE_GROUPS_MASK;
209    }
210
211    if (~cflags & pflags) {
212        return 0;
213    }
214
215    return 1;
216}
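
/*
 * Worked example (illustration only): a parent domain that spans the same
 * CPUs as its child, has a single group (so the SD_DEGENERATE_GROUPS_MASK
 * flags are masked off) and sets no flag the child does not already set
 * yields "~cflags & pflags" == 0 and is reported as degenerate:
 *
 *   cflags = SD_BALANCE_NEWIDLE | SD_WAKE_AFFINE | SD_PREFER_SIBLING
 *   pflags = SD_BALANCE_NEWIDLE | SD_WAKE_AFFINE
 *   ~cflags & pflags == 0          -> parent adds nothing, drop it
 *
 * A parent that additionally set e.g. SD_SERIALIZE would leave that bit in
 * "~cflags & pflags" and be kept.
 */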
217
218#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
219DEFINE_STATIC_KEY_FALSE(sched_energy_present);
220unsigned int sysctl_sched_energy_aware = 1;
221DEFINE_MUTEX(sched_energy_mutex);
222bool sched_energy_update;
223
224#ifdef CONFIG_PROC_SYSCTL
225int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos)
226{
227    int ret, state;
228
229    if (write && !capable(CAP_SYS_ADMIN)) {
230        return -EPERM;
231    }
232
233    ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
234    if (!ret && write) {
235        state = static_branch_unlikely(&sched_energy_present);
236        if (state != sysctl_sched_energy_aware) {
237            mutex_lock(&sched_energy_mutex);
238            sched_energy_update = 1;
239            rebuild_sched_domains();
240            sched_energy_update = 0;
241            mutex_unlock(&sched_energy_mutex);
242        }
243    }
244
245    return ret;
246}
247#endif
248
249static void free_pd(struct perf_domain *pd)
250{
251    struct perf_domain *tmp;
252
253    while (pd) {
254        tmp = pd->next;
255        kfree(pd);
256        pd = tmp;
257    }
258}
259
260static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
261{
262    while (pd) {
263        if (cpumask_test_cpu(cpu, perf_domain_span(pd))) {
264            return pd;
265        }
266        pd = pd->next;
267    }
268
269    return NULL;
270}
271
272static struct perf_domain *pd_init(int cpu)
273{
274    struct em_perf_domain *obj = em_cpu_get(cpu);
275    struct perf_domain *pd;
276
277    if (!obj) {
278        if (sched_debug()) {
279            pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
280        }
281        return NULL;
282    }
283
284    pd = kzalloc(sizeof(*pd), GFP_KERNEL);
285    if (!pd) {
286        return NULL;
287    }
288    pd->em_pd = obj;
289
290    return pd;
291}
292
293static void perf_domain_debug(const struct cpumask *cpu_map, struct perf_domain *pd)
294{
295    if (!sched_debug() || !pd) {
296        return;
297    }
298
299    printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
300
301    while (pd) {
302        printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)),
303               cpumask_pr_args(perf_domain_span(pd)), em_pd_nr_perf_states(pd->em_pd));
304        pd = pd->next;
305    }
306
307    printk(KERN_CONT "\n");
308}
309
310static void destroy_perf_domain_rcu(struct rcu_head *rp)
311{
312    struct perf_domain *pd;
313
314    pd = container_of(rp, struct perf_domain, rcu);
315    free_pd(pd);
316}
317
318static void sched_energy_set(bool has_eas)
319{
320    if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
321        if (sched_debug()) {
322            pr_info("%s: stopping EAS\n", __func__);
323        }
324        static_branch_disable_cpuslocked(&sched_energy_present);
325    } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
326        if (sched_debug()) {
327            pr_info("%s: starting EAS\n", __func__);
328        }
329        static_branch_enable_cpuslocked(&sched_energy_present);
330    }
331}
332
333/*
334 * EAS can be used on a root domain if it meets all the following conditions:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. the EM complexity is low enough to keep scheduling overheads low;
 *    5. schedutil is driving the frequency of all CPUs of the rd.
340 *
341 * The complexity of the Energy Model is defined as:
342 *
343 *              C = nr_pd * (nr_cpus + nr_ps)
344 *
345 * with parameters defined as:
346 *  - nr_pd:    the number of performance domains
347 *  - nr_cpus:  the number of CPUs
348 *  - nr_ps:    the sum of the number of performance states of all performance
349 *              domains (for example, on a system with 2 performance domains,
350 *              with 10 performance states each, nr_ps = 2 * 10 = 20).
351 *
352 * It is generally not a good idea to use such a model in the wake-up path on
353 * very complex platforms because of the associated scheduling overheads. The
354 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
355 * with per-CPU DVFS and less than 8 performance states each, for example.
356 */
357#define EM_MAX_COMPLEXITY 2048
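
/*
 * Worked example (numbers are hypothetical): an 8-CPU big.LITTLE system with
 * 2 performance domains of 4 CPUs each and 10 performance states per domain
 * gives
 *
 *   C = nr_pd * (nr_cpus + nr_ps) = 2 * (8 + 20) = 56 <= 2048
 *
 * so EAS may be enabled. A 64-CPU platform with per-CPU DVFS and 32 states
 * per domain would give 64 * (64 + 2048) = 135168 and be rejected by the
 * complexity check in build_perf_domains() below.
 */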
358
359static bool build_perf_domains(const struct cpumask *cpu_map)
360{
361    int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
362    struct perf_domain *pd = NULL, *tmp;
363    int cpu = cpumask_first(cpu_map);
364    struct root_domain *rd = cpu_rq(cpu)->rd;
365    struct cpufreq_policy *policy;
366    struct cpufreq_governor *gov;
367
368    if (!sysctl_sched_energy_aware) {
369        goto free;
370    }
371
372    /* EAS is enabled for asymmetric CPU capacity topologies. */
373    if (!per_cpu(sd_asym_cpucapacity, cpu)) {
374        if (sched_debug()) {
375            pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", cpumask_pr_args(cpu_map));
376        }
377        goto free;
378    }
379
380    /* EAS definitely does *not* handle SMT */
381    if (sched_smt_active()) {
382        pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", cpumask_pr_args(cpu_map));
383        goto free;
384    }
385
386    for_each_cpu(i, cpu_map)
387    {
388        /* Skip already covered CPUs. */
389        if (find_pd(pd, i)) {
390            continue;
391        }
392
393        /* Do not attempt EAS if schedutil is not being used. */
394        policy = cpufreq_cpu_get(i);
395        if (!policy) {
396            goto free;
397        }
398        gov = policy->governor;
399        cpufreq_cpu_put(policy);
400        if (gov != &schedutil_gov) {
401            if (rd->pd) {
402                pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", cpumask_pr_args(cpu_map));
403            }
404            goto free;
405        }
406
407        /* Create the new pd and add it to the local list. */
408        tmp = pd_init(i);
409        if (!tmp) {
410            goto free;
411        }
412        tmp->next = pd;
413        pd = tmp;
414
415        /*
416         * Count performance domains and performance states for the
417         * complexity check.
418         */
419        nr_pd++;
420        nr_ps += em_pd_nr_perf_states(pd->em_pd);
421    }
422
423    /* Bail out if the Energy Model complexity is too high. */
424    if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
425        WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map));
426        goto free;
427    }
428
429    perf_domain_debug(cpu_map, pd);
430
431    /* Attach the new list of performance domains to the root domain. */
432    tmp = rd->pd;
433    rcu_assign_pointer(rd->pd, pd);
434    if (tmp) {
435        call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
436    }
437
438    return !!pd;
439
440free:
441    free_pd(pd);
442    tmp = rd->pd;
443    rcu_assign_pointer(rd->pd, NULL);
444    if (tmp) {
445        call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
446    }
447
448    return false;
449}
450#else
451static void free_pd(struct perf_domain *pd)
452{
453}
454#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
455
456static void free_rootdomain(struct rcu_head *rcu)
457{
458    struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
459
460    cpupri_cleanup(&rd->cpupri);
461    cpudl_cleanup(&rd->cpudl);
462    free_cpumask_var(rd->dlo_mask);
463    free_cpumask_var(rd->rto_mask);
464    free_cpumask_var(rd->online);
465    free_cpumask_var(rd->span);
466    free_pd(rd->pd);
467    kfree(rd);
468}
469
470void rq_attach_root(struct rq *rq, struct root_domain *rd)
471{
472    struct root_domain *old_rd = NULL;
473    unsigned long flags;
474
475    raw_spin_lock_irqsave(&rq->lock, flags);
476
477    if (rq->rd) {
478        old_rd = rq->rd;
479
480        if (cpumask_test_cpu(rq->cpu, old_rd->online)) {
481            set_rq_offline(rq);
482        }
483
484        cpumask_clear_cpu(rq->cpu, old_rd->span);
485
486        /*
         * If we don't want to free the old_rd yet then
488         * set old_rd to NULL to skip the freeing later
489         * in this function:
490         */
491        if (!atomic_dec_and_test(&old_rd->refcount)) {
492            old_rd = NULL;
493        }
494    }
495
496    atomic_inc(&rd->refcount);
497    rq->rd = rd;
498
499    cpumask_set_cpu(rq->cpu, rd->span);
500    if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) {
501        set_rq_online(rq);
502    }
503
504    raw_spin_unlock_irqrestore(&rq->lock, flags);
505
506    if (old_rd) {
507        call_rcu(&old_rd->rcu, free_rootdomain);
508    }
509}
510
511void sched_get_rd(struct root_domain *rd)
512{
513    atomic_inc(&rd->refcount);
514}
515
516void sched_put_rd(struct root_domain *rd)
517{
518    if (!atomic_dec_and_test(&rd->refcount)) {
519        return;
520    }
521
522    call_rcu(&rd->rcu, free_rootdomain);
523}
524
525static int init_rootdomain(struct root_domain *rd)
526{
527    if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) {
528        goto out;
529    }
530    if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) {
531        goto free_span;
532    }
533    if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) {
534        goto free_online;
535    }
536    if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) {
537        goto free_dlo_mask;
538    }
539
540#ifdef HAVE_RT_PUSH_IPI
541    rd->rto_cpu = -1;
542    raw_spin_lock_init(&rd->rto_lock);
543    init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
544#endif
545
546    init_dl_bw(&rd->dl_bw);
547    if (cpudl_init(&rd->cpudl) != 0) {
548        goto free_rto_mask;
549    }
550
551    if (cpupri_init(&rd->cpupri) != 0) {
552        goto free_cpudl;
553    }
554
555#ifdef CONFIG_SCHED_RT_CAS
556    rd->max_cap_orig_cpu = -1;
557#endif
558    return 0;
559
560free_cpudl:
561    cpudl_cleanup(&rd->cpudl);
562free_rto_mask:
563    free_cpumask_var(rd->rto_mask);
564free_dlo_mask:
565    free_cpumask_var(rd->dlo_mask);
566free_online:
567    free_cpumask_var(rd->online);
568free_span:
569    free_cpumask_var(rd->span);
570out:
571    return -ENOMEM;
572}
573
574/*
575 * By default the system creates a single root-domain with all CPUs as
576 * members (mimicking the global state we have today).
577 */
578struct root_domain def_root_domain;
579
580void init_defrootdomain(void)
581{
582    init_rootdomain(&def_root_domain);
583
584    atomic_set(&def_root_domain.refcount, 1);
585}
586
587static struct root_domain *alloc_rootdomain(void)
588{
589    struct root_domain *rd;
590
591    rd = kzalloc(sizeof(*rd), GFP_KERNEL);
592    if (!rd) {
593        return NULL;
594    }
595
596    if (init_rootdomain(rd) != 0) {
597        kfree(rd);
598        return NULL;
599    }
600
601    return rd;
602}
603
604static void free_sched_groups(struct sched_group *sg, int free_sgc)
605{
606    struct sched_group *tmp, *first;
607
608    if (!sg) {
609        return;
610    }
611
612    first = sg;
613    do {
614        tmp = sg->next;
615
616        if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) {
617            kfree(sg->sgc);
618        }
619
620        if (atomic_dec_and_test(&sg->ref)) {
621            kfree(sg);
622        }
623        sg = tmp;
624    } while (sg != first);
625}
626
627static void destroy_sched_domain(struct sched_domain *sd)
628{
629    /*
     * A normal sched domain may have multiple group references; an
     * overlapping domain, having private groups, has only one. Iterate,
632     * dropping group/capacity references, freeing where none remain.
633     */
634    free_sched_groups(sd->groups, 1);
635
636    if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
637        kfree(sd->shared);
638    }
639    kfree(sd);
640}
641
642static void destroy_sched_domains_rcu(struct rcu_head *rcu)
643{
644    struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
645
646    while (sd) {
647        struct sched_domain *parent = sd->parent;
648        destroy_sched_domain(sd);
649        sd = parent;
650    }
651}
652
653static void destroy_sched_domains(struct sched_domain *sd)
654{
655    if (sd) {
656        call_rcu(&sd->rcu, destroy_sched_domains_rcu);
657    }
658}
659
660/*
661 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
 * allows us to avoid some pointer chasing in select_idle_sibling().
664 *
665 * Also keep a unique ID per domain (we use the first CPU number in
666 * the cpumask of the domain), this allows us to quickly tell if
667 * two CPUs are in the same cache domain, see cpus_share_cache().
668 */
669DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
670DEFINE_PER_CPU(int, sd_llc_size);
671DEFINE_PER_CPU(int, sd_llc_id);
672DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
673DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
674DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
675DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
676DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
677
678static void update_top_cache_domain(int cpu)
679{
680    struct sched_domain_shared *sds = NULL;
681    struct sched_domain *sd;
682    int id = cpu;
683    int size = 1;
684
685    sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
686    if (sd) {
687        id = cpumask_first(sched_domain_span(sd));
688        size = cpumask_weight(sched_domain_span(sd));
689        sds = sd->shared;
690    }
691
692    rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
693    per_cpu(sd_llc_size, cpu) = size;
694    per_cpu(sd_llc_id, cpu) = id;
695    rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
696
697    sd = lowest_flag_domain(cpu, SD_NUMA);
698    rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
699
700    sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
701    rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
702
703    sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
704    rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
705}
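
/*
 * Example of how the cached IDs are consumed (sketch, not from this file):
 * cpus_share_cache() in core.c boils down to comparing the per-CPU LLC IDs
 * set up above, roughly:
 *
 *   bool cpus_share_cache(int this_cpu, int that_cpu)
 *   {
 *           return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 *   }
 *
 * which is why sd_llc_id is chosen as the first CPU of the LLC domain span:
 * two CPUs under the same last-level cache end up with the same ID.
 */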
706
707/*
708 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
709 * hold the hotplug lock.
710 */
711static void cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
712{
713    struct rq *rq = cpu_rq(cpu);
714    struct sched_domain *tmp;
715    int numa_distance = 0;
716
717    /* Remove the sched domains which do not contribute to scheduling. */
718    for (tmp = sd; tmp;) {
719        struct sched_domain *parent = tmp->parent;
720        if (!parent) {
721            break;
722        }
723
724        if (sd_parent_degenerate(tmp, parent)) {
725            tmp->parent = parent->parent;
726            if (parent->parent) {
727                parent->parent->child = tmp;
728            }
729            /*
730             * Transfer SD_PREFER_SIBLING down in case of a
731             * degenerate parent; the spans match for this
732             * so the property transfers.
733             */
734            if (parent->flags & SD_PREFER_SIBLING) {
735                tmp->flags |= SD_PREFER_SIBLING;
736            }
737            destroy_sched_domain(parent);
738        } else {
739            tmp = tmp->parent;
740        }
741    }
742
743    if (sd && sd_degenerate(sd)) {
744        tmp = sd;
745        sd = sd->parent;
746        destroy_sched_domain(tmp);
747        if (sd) {
748            sd->child = NULL;
749        }
750    }
751
752    for (tmp = sd; tmp; tmp = tmp->parent) {
753        numa_distance += !!(tmp->flags & SD_NUMA);
754    }
755
756    sched_domain_debug(sd, cpu);
757
758    rq_attach_root(rq, rd);
759    tmp = rq->sd;
760    rcu_assign_pointer(rq->sd, sd);
761    dirty_sched_domain_sysctl(cpu);
762    destroy_sched_domains(tmp);
763
764    update_top_cache_domain(cpu);
765}
766
767struct s_data {
768    struct sched_domain *__percpu *sd;
769    struct root_domain *rd;
770};
771
772enum s_alloc {
773    sa_rootdomain,
774    sa_sd,
775    sa_sd_storage,
776    sa_none,
777};
778
779/*
780 * Return the canonical balance CPU for this group, this is the first CPU
781 * of this group that's also in the balance mask.
782 *
 * The balance mask contains all those CPUs that could actually end up at this
784 * group. See build_balance_mask().
785 *
786 * Also see should_we_balance().
787 */
788int group_balance_cpu(struct sched_group *sg)
789{
790    return cpumask_first(group_balance_mask(sg));
791}
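
/*
 * Note (illustration): for the common non-overlapping case, get_group() below
 * copies the group span straight into the balance mask, so
 * group_balance_cpu(sg) is simply the first CPU of the group. Only SD_OVERLAP
 * (NUMA) domains get a narrower mask, built by build_balance_mask().
 */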
792
793/*
794 * NUMA topology (first read the regular topology blurb below)
795 *
796 * Given a node-distance table, for example:
797 *
798 *   node   0   1   2   3
799 *     0:  10  20  30  20
800 *     1:  20  10  20  30
801 *     2:  30  20  10  20
802 *     3:  20  30  20  10
803 *
804 * which represents a 4 node ring topology like:
805 *
806 *   0 ----- 1
807 *   |       |
808 *   |       |
809 *   |       |
810 *   3 ----- 2
811 *
812 * We want to construct domains and groups to represent this. The way we go
813 * about doing this is to build the domains on 'hops'. For each NUMA level we
814 * construct the mask of all nodes reachable in @level hops.
815 *
816 * For the above NUMA topology that gives 3 levels:
817 *
818 * NUMA-2    0-3        0-3        0-3        0-3
819 *  groups:    {0-1,3},{1-3}    {0-2},{0,2-3}    {1-3},{0-1,3}    {0,2-3},{0-2}
820 *
821 * NUMA-1    0-1,3        0-2        1-3        0,2-3
822 *  groups:    {0},{1},{3}    {0},{1},{2}    {1},{2},{3}    {0},{2},{3}
823 *
824 * NUMA-0    0        1        2        3
825 *
826 *
 * As can be seen, things don't nicely line up as with the regular topology.
828 * When we iterate a domain in child domain chunks some nodes can be
829 * represented multiple times -- hence the "overlap" naming for this part of
830 * the topology.
831 *
832 * In order to minimize this overlap, we only build enough groups to cover the
833 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
834 *
835 * Because
836 *
837 *  - the first group of each domain is its child domain; this
838 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
840 *
841 * However, because of the overlap, computing a unique CPU for each group is
842 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
843 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
844 * end up at those groups (they would end up in group: 0-1,3).
845 *
846 * To correct this we have to introduce the group balance mask. This mask
847 * will contain those CPUs in the group that can reach this group given the
848 * (child) domain tree.
849 *
850 * With this we can once again compute balance_cpu and sched_group_capacity
851 * relations.
852 *
853 * XXX include words on how balance_cpu is unique and therefore can be
854 * used for sched_group_capacity links.
855 *
856 *
857 * Another 'interesting' topology is
858 *
859 *   node   0   1   2   3
860 *     0:  10  20  20  30
861 *     1:  20  10  20  20
862 *     2:  20  20  10  20
863 *     3:  30  20  20  10
864 *
865 * Which looks a little like
866 *
867 *   0 ----- 1
868 *   |     / |
869 *   |   /   |
870 *   | /     |
871 *   2 ----- 3
872 *
873 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
874 * are not.
875 *
 * This leads to a few particularly weird cases where the number of
 * sched_domains is not the same for each CPU. Consider:
878 *
879 * NUMA-2    0-3                        0-3
880 *  groups:    {0-2},{1-3}                    {1-3},{0-2}
881 *
882 * NUMA-1    0-2        0-3        0-3        1-3
883 *
884 * NUMA-0    0        1        2        3
885 *
886 */
887
888/*
889 * Build the balance mask; it contains only those CPUs that can arrive at this
890 * group and should be considered to continue balancing.
891 *
892 * We do this during the group creation pass, therefore the group information
893 * isn't complete yet, however since each group represents a (child) domain we
894 * can fully construct this using the sched_domain bits (which are already
895 * complete).
896 */
897static void build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
898{
899    const struct cpumask *sg_span = sched_group_span(sg);
900    struct sd_data *sdd = sd->private;
901    struct sched_domain *sibling;
902    int i;
903
904    cpumask_clear(mask);
905
906    for_each_cpu(i, sg_span)
907    {
908        sibling = *per_cpu_ptr(sdd->sd, i);
909        /*
910         * Can happen in the asymmetric case, where these siblings are
911         * unused. The mask will not be empty because those CPUs that
912         * do have the top domain _should_ span the domain.
913         */
914        if (!sibling->child) {
915            continue;
916        }
917
918        /* If we would not end up here, we can't continue from here */
919        if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) {
920            continue;
921        }
922
923        cpumask_set_cpu(i, mask);
924    }
925
926    /* We must not have empty masks here */
927    WARN_ON_ONCE(cpumask_empty(mask));
928}
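
/*
 * Worked example (using the 4-node ring described above): consider node 1's
 * NUMA-2 domain, whose first group spans nodes 0-2. Iterating the CPUs of
 * that span:
 *
 *   - node 0 CPUs: their NUMA-1 domain spans 0-1,3 != 0-2  -> not set
 *   - node 1 CPUs: their NUMA-1 domain spans 0-2   == 0-2  -> set
 *   - node 2 CPUs: their NUMA-1 domain spans 1-3   != 0-2  -> not set
 *
 * so the balance mask of that group contains only node 1's CPUs: only they
 * can actually arrive at this group when walking up their own domain tree.
 */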
929
930/*
931 * XXX: This creates per-node group entries; since the load-balancer will
932 * immediately access remote memory to construct this group's load-balance
933 * statistics having the groups node local is of dubious benefit.
934 */
935static struct sched_group *build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
936{
937    struct sched_group *sg;
938    struct cpumask *sg_span;
939
940    sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu));
941    if (!sg) {
942        return NULL;
943    }
944
945    sg_span = sched_group_span(sg);
946    if (sd->child) {
947        cpumask_copy(sg_span, sched_domain_span(sd->child));
948    } else {
949        cpumask_copy(sg_span, sched_domain_span(sd));
950    }
951
952    atomic_inc(&sg->ref);
953    return sg;
954}
955
956static void init_overlap_sched_group(struct sched_domain *sd, struct sched_group *sg)
957{
958    struct cpumask *mask = sched_domains_tmpmask2;
959    struct sd_data *sdd = sd->private;
960    struct cpumask *sg_span;
961    int cpu;
962
963    build_balance_mask(sd, sg, mask);
964    cpu = cpumask_first_and(sched_group_span(sg), mask);
965
966    sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
967    if (atomic_inc_return(&sg->sgc->ref) == 1) {
968        cpumask_copy(group_balance_mask(sg), mask);
969    } else {
970        WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
971    }
972
973    /*
974     * Initialize sgc->capacity such that even if we mess up the
975     * domains and no possible iteration will get us here, we won't
976     * die on a /0 trap.
977     */
978    sg_span = sched_group_span(sg);
979    sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
980    sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
981    sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
982}
983
984static struct sched_domain *find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
985{
986    /*
987     * The proper descendant would be the one whose child won't span out
988     * of sd
989     */
990    while (sibling->child && !cpumask_subset(sched_domain_span(sibling->child), sched_domain_span(sd))) {
991        sibling = sibling->child;
992    }
993
994    /*
     * As we are referencing sgc across different topology levels, we need
996     * to go down to skip those sched_domains which don't contribute to
997     * scheduling because they will be degenerated in cpu_attach_domain
998     */
999    while (sibling->child && cpumask_equal(sched_domain_span(sibling->child), sched_domain_span(sibling))) {
1000        sibling = sibling->child;
1001    }
1002
1003    return sibling;
1004}
1005
1006static int build_overlap_sched_groups(struct sched_domain *sd, int cpu)
1007{
1008    struct sched_group *first = NULL, *last = NULL, *sg;
1009    const struct cpumask *span = sched_domain_span(sd);
1010    struct cpumask *covered = sched_domains_tmpmask;
1011    struct sd_data *sdd = sd->private;
1012    struct sched_domain *sibling;
1013    int i;
1014
1015    cpumask_clear(covered);
1016
1017    for_each_cpu_wrap(i, span, cpu)
1018    {
1019        struct cpumask *sg_span;
1020
1021        if (cpumask_test_cpu(i, covered)) {
1022            continue;
1023        }
1024
1025        sibling = *per_cpu_ptr(sdd->sd, i);
1026        /*
1027         * Asymmetric node setups can result in situations where the
1028         * domain tree is of unequal depth, make sure to skip domains
1029         * that already cover the entire range.
1030         *
1031         * In that case build_sched_domains() will have terminated the
1032         * iteration early and our sibling sd spans will be empty.
1033         * Domains should always include the CPU they're built on, so
1034         * check that.
1035         */
1036        if (!cpumask_test_cpu(i, sched_domain_span(sibling))) {
1037            continue;
1038        }
1039
1040        /*
1041         * Usually we build sched_group by sibling's child sched_domain
         * But for machines whose NUMA diameter is 3 or above, we move
1043         * to build sched_group by sibling's proper descendant's child
1044         * domain because sibling's child sched_domain will span out of
1045         * the sched_domain being built as below.
1046         *
1047         * Smallest diameter=3 topology is:
1048         *
1049         *   node   0   1   2   3
1050         *     0:  10  20  30  40
1051         *     1:  20  10  20  30
1052         *     2:  30  20  10  20
1053         *     3:  40  30  20  10
1054         *
1055         *   0 --- 1 --- 2 --- 3
1056         *
1057         * NUMA-3       0-3             N/A             N/A             0-3
1058         *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
1059         *
1060         * NUMA-2       0-2             0-3             0-3             1-3
1061         *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
1062         *
1063         * NUMA-1       0-1             0-2             1-3             2-3
1064         *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
1065         *
1066         * NUMA-0       0               1               2               3
1067         *
1068         * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
1069         * group span isn't a subset of the domain span.
1070         */
1071        if (sibling->child && !cpumask_subset(sched_domain_span(sibling->child), span)) {
1072            sibling = find_descended_sibling(sd, sibling);
1073        }
1074
1075        sg = build_group_from_child_sched_domain(sibling, cpu);
1076        if (!sg) {
1077            goto fail;
1078        }
1079
1080        sg_span = sched_group_span(sg);
1081        cpumask_or(covered, covered, sg_span);
1082
1083        init_overlap_sched_group(sibling, sg);
1084
1085        if (!first) {
1086            first = sg;
1087        }
1088        if (last) {
1089            last->next = sg;
1090        }
1091        last = sg;
1092        last->next = first;
1093    }
1094    sd->groups = first;
1095
1096    return 0;
1097
1098fail:
1099    free_sched_groups(first, 0);
1100
1101    return -ENOMEM;
1102}
1103
1104/*
1105 * Package topology (also see the load-balance blurb in fair.c)
1106 *
1107 * The scheduler builds a tree structure to represent a number of important
1108 * topology features. By default (default_topology[]) these include:
1109 *
1110 *  - Simultaneous multithreading (SMT)
1111 *  - Multi-Core Cache (MC)
1112 *  - Package (DIE)
1113 *
1114 * Where the last one more or less denotes everything up to a NUMA node.
1115 *
1116 * The tree consists of 3 primary data structures:
1117 *
1118 *    sched_domain -> sched_group -> sched_group_capacity
1119 *        ^ ^             ^ ^
1120 *          `-'             `-'
1121 *
1122 * The sched_domains are per-CPU and have a two way link (parent & child) and
1123 * denote the ever growing mask of CPUs belonging to that level of topology.
1124 *
1125 * Each sched_domain has a circular (double) linked list of sched_group's, each
1126 * denoting the domains of the level below (or individual CPUs in case of the
1127 * first domain level). The sched_group linked by a sched_domain includes the
1128 * CPU of that sched_domain [*].
1129 *
1130 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
1131 *
1132 * CPU   0   1   2   3   4   5   6   7
1133 *
1134 * DIE  [                             ]
1135 * MC   [             ] [             ]
1136 * SMT  [     ] [     ] [     ] [     ]
1137 *
1138 *  - or -
1139 *
1140 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
1141 * MC    0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
1142 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
1143 *
1144 * CPU   0   1   2   3   4   5   6   7
1145 *
1146 * One way to think about it is: sched_domain moves you up and down among these
1147 * topology levels, while sched_group moves you sideways through it, at child
1148 * domain granularity.
1149 *
1150 * sched_group_capacity ensures each unique sched_group has shared storage.
1151 *
 * There are two related construction problems, both of which require a CPU
 * that uniquely identifies each group (for a given domain):
1154 *
1155 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
1157 *    continue balancing at a higher domain.
1158 *
1159 *  - The second is the sched_group_capacity; we want all identical groups
1160 *    to share a single sched_group_capacity.
1161 *
 * These topologies are exclusive by construction: it is impossible for an
 * SMT thread to belong to multiple cores, and for cores to be part of
 * multiple caches. There is a very clear and unique location
1165 * for each CPU in the hierarchy.
1166 *
1167 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set to all 1s; all CPUs in a group will end up at
 * _that_ group): we can simply pick the first CPU in each group.
1170 *
1171 *
1172 * [*] in other words, the first group of each domain is its child domain.
1173 */
1174
1175static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1176{
1177    struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1178    struct sched_domain *child = sd->child;
1179    struct sched_group *sg;
1180    bool already_visited;
1181
1182    if (child) {
1183        cpu = cpumask_first(sched_domain_span(child));
1184    }
1185
1186    sg = *per_cpu_ptr(sdd->sg, cpu);
1187    sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1188
1189    /* Increase refcounts for claim_allocations: */
1190    already_visited = atomic_inc_return(&sg->ref) > 1;
1191    /* sgc visits should follow a similar trend as sg */
1192    WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1193
1194    /* If we have already visited that group, it's already initialized. */
1195    if (already_visited) {
1196        return sg;
1197    }
1198
1199    if (child) {
1200        cpumask_copy(sched_group_span(sg), sched_domain_span(child));
1201        cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
1202    } else {
1203        cpumask_set_cpu(cpu, sched_group_span(sg));
1204        cpumask_set_cpu(cpu, group_balance_mask(sg));
1205    }
1206
1207    sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
1208    sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
1209    sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
1210
1211    return sg;
1212}
1213
1214/*
1215 * build_sched_groups will build a circular linked list of the groups
1216 * covered by the given span, will set each group's ->cpumask correctly,
1217 * and will initialize their ->sgc.
1218 *
1219 * Assumes the sched_domain tree is fully constructed
1220 */
1221static int build_sched_groups(struct sched_domain *sd, int cpu)
1222{
1223    struct sched_group *first = NULL, *last = NULL;
1224    struct sd_data *sdd = sd->private;
1225    const struct cpumask *span = sched_domain_span(sd);
1226    struct cpumask *covered;
1227    int i;
1228
1229    lockdep_assert_held(&sched_domains_mutex);
1230    covered = sched_domains_tmpmask;
1231
1232    cpumask_clear(covered);
1233
1234    for_each_cpu_wrap(i, span, cpu)
1235    {
1236        struct sched_group *sg;
1237
1238        if (cpumask_test_cpu(i, covered)) {
1239            continue;
1240        }
1241
1242        sg = get_group(i, sdd);
1243
1244        cpumask_or(covered, covered, sched_group_span(sg));
1245
1246        if (!first) {
1247            first = sg;
1248        }
1249        if (last) {
1250            last->next = sg;
1251        }
1252        last = sg;
1253    }
1254    last->next = first;
1255    sd->groups = first;
1256
1257    return 0;
1258}
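
/*
 * Example (based on the 8-CPU topology sketched above): for CPU0's MC domain
 * spanning 0-3, the loop above visits the SMT children and produces the ring
 *
 *   {0-1} -> {2-3} -> {0-1} -> ...
 *
 * i.e. sd->groups points at the group containing the domain's own child span
 * and ->next walks the remaining sibling groups before wrapping around.
 */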
1259
1260/*
1261 * Initialize sched groups cpu_capacity.
1262 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be the
 * same unless there are asymmetries in the topology. If there are
 * asymmetries, the group having more cpu_capacity will pick up more load
 * compared to the group having less cpu_capacity.
1269 */
1270void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
1271{
1272    struct sched_group *sg = sd->groups;
1273#ifdef CONFIG_CPU_ISOLATION_OPT
1274    cpumask_t avail_mask;
1275#endif
1276
1277    WARN_ON(!sg);
1278
1279    do {
1280        int cpu, max_cpu = -1;
1281
1282#ifdef CONFIG_CPU_ISOLATION_OPT
1283        cpumask_andnot(&avail_mask, sched_group_span(sg), cpu_isolated_mask);
1284        sg->group_weight = cpumask_weight(&avail_mask);
1285#else
1286        sg->group_weight = cpumask_weight(sched_group_span(sg));
1287#endif
1288
1289        if (!(sd->flags & SD_ASYM_PACKING)) {
1290            goto next;
1291        }
1292
1293        for_each_cpu(cpu, sched_group_span(sg))
1294        {
1295            if (max_cpu < 0) {
1296                max_cpu = cpu;
1297            } else if (sched_asym_prefer(cpu, max_cpu)) {
1298                max_cpu = cpu;
1299            }
1300        }
1301        sg->asym_prefer_cpu = max_cpu;
1302
1303    next:
1304        sg = sg->next;
1305    } while (sg != sd->groups);
1306
1307    if (cpu != group_balance_cpu(sg)) {
1308        return;
1309    }
1310
1311    update_group_capacity(sd, cpu);
1312}
1313
1314/*
1315 * Initializers for schedule domains
1316 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
1317 */
1318
1319static int default_relax_domain_level = -1;
1320int sched_domain_level_max;
1321
1322static int __init setup_relax_domain_level(char *str)
1323{
1324    if (kstrtoint(str, 0, &default_relax_domain_level)) {
1325        pr_warn("Unable to set relax_domain_level\n");
1326    }
1327
1328    return 1;
1329}
1330__setup("relax_domain_level=", setup_relax_domain_level);
1331
1332static void set_domain_attribute(struct sched_domain *sd, struct sched_domain_attr *attr)
1333{
1334    int request;
1335
1336    if (!attr || attr->relax_domain_level < 0) {
1337        if (default_relax_domain_level < 0) {
1338            return;
1339        }
1340        request = default_relax_domain_level;
1341    } else {
1342        request = attr->relax_domain_level;
1343    }
1344
1345    if (sd->level > request) {
1346        /* Turn off idle balance on this domain: */
1347        sd->flags &= ~(SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE);
1348    }
1349}
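
/*
 * Usage example (hypothetical values): booting with "relax_domain_level=1"
 * sets default_relax_domain_level to 1, so every domain with ->level > 1
 * (e.g. DIE and NUMA levels on a typical SMT+MC system) gets
 * SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE cleared, i.e. wake-up and new-idle
 * balancing are confined to the lower, cheaper levels. The same knob can be
 * set per cpuset via the sched_relax_domain_level attribute.
 */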
1350
1351static void __sdt_free(const struct cpumask *cpu_map);
1352static int __sdt_alloc(const struct cpumask *cpu_map);
1353
1354static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map)
1355{
1356    switch (what) {
1357        case sa_rootdomain:
1358            if (!atomic_read(&d->rd->refcount)) {
1359                free_rootdomain(&d->rd->rcu);
1360            }
1361            fallthrough;
1362        case sa_sd:
1363            free_percpu(d->sd);
1364            fallthrough;
1365        case sa_sd_storage:
1366            __sdt_free(cpu_map);
1367            fallthrough;
1368        case sa_none:
1369            break;
1370    }
1371}
1372
1373static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1374{
1375    memset(d, 0, sizeof(*d));
1376
1377    if (__sdt_alloc(cpu_map)) {
1378        return sa_sd_storage;
1379    }
1380    d->sd = alloc_percpu(struct sched_domain *);
1381    if (!d->sd) {
1382        return sa_sd_storage;
1383    }
1384    d->rd = alloc_rootdomain();
1385    if (!d->rd) {
1386        return sa_sd;
1387    }
1388
1389    return sa_rootdomain;
1390}
1391
1392/*
1393 * NULL the sd_data elements we've used to build the sched_domain and
1394 * sched_group structure so that the subsequent __free_domain_allocs()
1395 * will not free the data we're using.
1396 */
1397static void claim_allocations(int cpu, struct sched_domain *sd)
1398{
1399    struct sd_data *sdd = sd->private;
1400
1401    WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1402    *per_cpu_ptr(sdd->sd, cpu) = NULL;
1403
1404    if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) {
1405        *per_cpu_ptr(sdd->sds, cpu) = NULL;
1406    }
1407
1408    if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) {
1409        *per_cpu_ptr(sdd->sg, cpu) = NULL;
1410    }
1411
1412    if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) {
1413        *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1414    }
1415}
1416
1417#ifdef CONFIG_NUMA
1418enum numa_topology_type sched_numa_topology_type;
1419
1420static int sched_domains_numa_levels;
1421static int sched_domains_curr_level;
1422
1423int sched_max_numa_distance;
1424static int *sched_domains_numa_distance;
1425static struct cpumask ***sched_domains_numa_masks;
1426int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
1427#endif
1428
1429/*
1430 * SD_flags allowed in topology descriptions.
1431 *
1432 * These flags are purely descriptive of the topology and do not prescribe
1433 * behaviour. Behaviour is artificial and mapped in the below sd_init()
1434 * function:
1435 *
1436 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
1437 *   SD_SHARE_PKG_RESOURCES - describes shared caches
1438 *   SD_NUMA                - describes NUMA topologies
1439 *
1440 * Odd one out, which beside describing the topology has a quirk also
1441 * prescribes the desired behaviour that goes along with it:
1442 *
1443 *   SD_ASYM_PACKING        - describes SMT quirks
1444 */
1445#define TOPOLOGY_SD_FLAGS (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_NUMA | SD_ASYM_PACKING)
1446
1447static struct sched_domain *sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map,
1448                                    struct sched_domain *child, int dflags, int cpu)
1449{
1450    struct sd_data *sdd = &tl->data;
1451    struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1452    int sd_id, sd_weight, sd_flags = 0;
1453
1454#ifdef CONFIG_NUMA
1455    /*
1456     * Ugly hack to pass state to sd_numa_mask()...
1457     */
1458    sched_domains_curr_level = tl->numa_level;
1459#endif
1460
1461    sd_weight = cpumask_weight(tl->mask(cpu));
1462
1463    if (tl->sd_flags) {
1464        sd_flags = (*tl->sd_flags)();
1465    }
1466    if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) {
1467        sd_flags &= TOPOLOGY_SD_FLAGS;
1468    }
1469
1470    /* Apply detected topology flags */
1471    sd_flags |= dflags;
1472
1473    *sd = (struct sched_domain) {
1474        .min_interval = sd_weight,
1475        .max_interval = 2 * sd_weight,
1476        .busy_factor = 16,
1477        .imbalance_pct = 117,
1478
1479        .cache_nice_tries = 0,
1480
1481        .flags = 1 * SD_BALANCE_NEWIDLE | 1 * SD_BALANCE_EXEC | 1 * SD_BALANCE_FORK | 0 * SD_BALANCE_WAKE |
1482                 1 * SD_WAKE_AFFINE | 0 * SD_SHARE_CPUCAPACITY | 0 * SD_SHARE_PKG_RESOURCES | 0 * SD_SERIALIZE |
1483                 1 * SD_PREFER_SIBLING | 0 * SD_NUMA | sd_flags,
1484
1485        .last_balance = jiffies,
1486        .balance_interval = sd_weight,
1487        .max_newidle_lb_cost = 0,
1488        .next_decay_max_lb_cost = jiffies,
1489        .child = child,
1490#ifdef CONFIG_SCHED_DEBUG
1491        .name = tl->name,
1492#endif
1493    };
1494
1495    cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1496    sd_id = cpumask_first(sched_domain_span(sd));
1497
1498    /*
1499     * Convert topological properties into behaviour.
1500     */
1501
1502    /* Don't attempt to spread across CPUs of different capacities. */
1503    if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) {
1504        sd->child->flags &= ~SD_PREFER_SIBLING;
1505    }
1506
1507    if (sd->flags & SD_SHARE_CPUCAPACITY) {
1508        sd->imbalance_pct = IMBALANCE_SD_SHARE_CPUCAPACITY;
1509    } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1510        sd->imbalance_pct = IMBALANCE_SD_SHARE_PKG;
1511        sd->cache_nice_tries = 1;
1512
1513#ifdef CONFIG_NUMA
1514    } else if (sd->flags & SD_NUMA) {
1515        sd->cache_nice_tries = IMBALANCE_SD_NUMA;
1516
1517        sd->flags &= ~SD_PREFER_SIBLING;
1518        sd->flags |= SD_SERIALIZE;
1519        if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1520            sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE);
1521        }
1522
1523#endif
1524    } else {
1525        sd->cache_nice_tries = 1;
1526    }
1527
1528    /*
1529     * For all levels sharing cache; connect a sched_domain_shared
1530     * instance.
1531     */
1532    if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1533        sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1534        atomic_inc(&sd->shared->ref);
1535        atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1536    }
1537
1538    sd->private = sdd;
1539
1540    return sd;
1541}
1542
1543/*
1544 * Topology list, bottom-up.
1545 */
1546static struct sched_domain_topology_level default_topology[] = {
1547#ifdef CONFIG_SCHED_SMT
1548    {cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)},
1549#endif
1550#ifdef CONFIG_SCHED_MC
1551    {cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC)},
1552#endif
1553    {cpu_cpu_mask, SD_INIT_NAME(DIE)},
1554    {
1555        NULL,
1556    },
1557};
1558
1559static struct sched_domain_topology_level *sched_domain_topology = default_topology;
1560
1561#define for_each_sd_topology(tl) for (tl = sched_domain_topology; (tl)->mask; (tl)++)
1562
1563void set_sched_topology(struct sched_domain_topology_level *tl)
1564{
1565    if (WARN_ON_ONCE(sched_smp_initialized)) {
1566        return;
1567    }
1568
1569    sched_domain_topology = tl;
1570}
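
/*
 * Sketch of an architecture override (hypothetical, for illustration only):
 * an arch that wants an extra cluster level between SMT and DIE could build
 * its own bottom-up table and install it early, before SMP bring-up, where
 * cpu_clustergroup_mask would be an arch-provided mask helper:
 *
 *   static struct sched_domain_topology_level my_topology[] = {
 *   #ifdef CONFIG_SCHED_SMT
 *           { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *   #endif
 *           { cpu_clustergroup_mask, SD_INIT_NAME(CLS) },
 *           { cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *           { NULL, },
 *   };
 *
 *   set_sched_topology(my_topology);
 *
 * Each ->mask() must be monotonic: every level's span must contain the spans
 * of the levels below it, exactly like default_topology[] above.
 */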
1571
1572#ifdef CONFIG_NUMA
1573
1574static const struct cpumask *sd_numa_mask(int cpu)
1575{
1576    return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1577}
1578
1579static void sched_numa_warn(const char *str)
1580{
1581    static int done = false;
1582    int i, j;
1583
1584    if (done) {
1585        return;
1586    }
1587
1588    done = true;
1589
1590    printk(KERN_WARNING "ERROR: %s\n\n", str);
1591
1592    for (i = 0; i < nr_node_ids; i++) {
1593        printk(KERN_WARNING "  ");
1594        for (j = 0; j < nr_node_ids; j++) {
1595            printk(KERN_CONT "%02d ", node_distance(i, j));
1596        }
1597        printk(KERN_CONT "\n");
1598    }
1599    printk(KERN_WARNING "\n");
1600}
1601
1602bool find_numa_distance(int distance)
1603{
1604    int i;
1605
1606    if (distance == node_distance(0, 0)) {
1607        return true;
1608    }
1609
1610    for (i = 0; i < sched_domains_numa_levels; i++) {
1611        if (sched_domains_numa_distance[i] == distance) {
1612            return true;
1613        }
1614    }
1615
1616    return false;
1617}
1618
1619/*
1620 * A system can have three types of NUMA topology:
1621 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1622 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1623 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1624 *
1625 * The difference between a glueless mesh topology and a backplane
1626 * topology lies in whether communication between not directly
1627 * connected nodes goes through intermediary nodes (where programs
1628 * could run), or through backplane controllers. This affects
1629 * placement of programs.
1630 *
1631 * The type of topology can be discerned with the following tests:
1632 * - If the maximum distance between any nodes is 1 hop, the system
1633 *   is directly connected.
1634 * - If for two nodes A and B, located N > 1 hops away from each other,
1635 *   there is an intermediary node C, which is < N hops away from both
1636 *   nodes A and B, the system is a glueless mesh.
1637 */
1638static void init_numa_topology_type(void)
1639{
1640    int a, b, c, n;
1641
1642    n = sched_max_numa_distance;
1643
1644    if (sched_domains_numa_levels <= IMBALANCE_SD_NUMA_DIRECT) {
1645        sched_numa_topology_type = NUMA_DIRECT;
1646        return;
1647    }
1648
1649    for_each_online_node(a)
1650    {
1651        for_each_online_node(b)
1652        {
1653            /* Find two nodes furthest removed from each other. */
1654            if (node_distance(a, b) < n) {
1655                continue;
1656            }
1657
1658            /* Is there an intermediary node between a and b? */
1659            for_each_online_node(c)
1660            {
1661                if (node_distance(a, c) < n && node_distance(b, c) < n) {
1662                    sched_numa_topology_type = NUMA_GLUELESS_MESH;
1663                    return;
1664                }
1665            }
1666
1667            sched_numa_topology_type = NUMA_BACKPLANE;
1668            return;
1669        }
1670    }
1671}
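
/*
 * Worked classification (illustration, using the example tables further
 * above): the 4-node ring with distances 10/20/30 has 3 NUMA levels, so it
 * is not NUMA_DIRECT. Its two furthest nodes are 0 and 2 (distance 30), and
 * node 1 sits at distance 20 < 30 from both, so the loop above classifies it
 * as NUMA_GLUELESS_MESH. If no such intermediary node existed, the fallback
 * would be NUMA_BACKPLANE.
 */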
1672
1673#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
1674
1675void sched_init_numa(void)
1676{
1677    struct sched_domain_topology_level *tl;
1678    unsigned long *distance_map;
1679    int nr_levels = 0;
1680    int i, j;
1681
1682    /*
1683     * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1684     * unique distances in the node_distance() table.
1685     */
1686    distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
1687    if (!distance_map) {
1688        return;
1689    }
1690
1691    bitmap_zero(distance_map, NR_DISTANCE_VALUES);
1692    for (i = 0; i < nr_node_ids; i++) {
1693        for (j = 0; j < nr_node_ids; j++) {
1694            int distance = node_distance(i, j);
1695            if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
1696                sched_numa_warn("Invalid distance value range");
1697                return;
1698            }
1699
1700            bitmap_set(distance_map, distance, 1);
1701        }
1702    }
1703    /*
1704     * We can now figure out how many unique distance values there are and
1705     * allocate memory accordingly.
1706     */
1707    nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
1708
1709    sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
1710    if (!sched_domains_numa_distance) {
1711        bitmap_free(distance_map);
1712        return;
1713    }
1714
1715    for (i = 0, j = 0; i < nr_levels; i++, j++) {
1716        j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
1717        sched_domains_numa_distance[i] = j;
1718    }
1719
1720    bitmap_free(distance_map);
1721
1722    /*
1723     * 'nr_levels' contains the number of unique distances
1724     *
1725     * The sched_domains_numa_distance[] array includes the actual distance
1726     * numbers.
1727     */
1728
1729    /*
1730     * Here, we should temporarily reset sched_domains_numa_levels to 0.
     * If we fail to allocate memory for the sched_domains_numa_masks[][]
     * array, it will contain fewer than 'nr_levels' members. This could be
     * dangerous when we use it to iterate the sched_domains_numa_masks[][]
     * array in other functions.
1735     *
1736     * We reset it to 'nr_levels' at the end of this function.
1737     */
1738    sched_domains_numa_levels = 0;
1739
1740    sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
1741    if (!sched_domains_numa_masks) {
1742        return;
1743    }
1744
1745    /*
1746     * Now for each level, construct a mask per node which contains all
1747     * CPUs of nodes that are that many hops away from us.
1748     */
1749    for (i = 0; i < nr_levels; i++) {
1750        sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1751        if (!sched_domains_numa_masks[i]) {
1752            return;
1753        }
1754
1755        for (j = 0; j < nr_node_ids; j++) {
1756            struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1757            int k;
1758
1759            if (!mask) {
1760                return;
1761            }
1762
1763            sched_domains_numa_masks[i][j] = mask;
1764
1765            for_each_node(k)
1766            {
1767                if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) {
1768                    sched_numa_warn("Node-distance not symmetric");
1769                }
1770
1771                if (node_distance(j, k) > sched_domains_numa_distance[i]) {
1772                    continue;
1773                }
1774
1775                cpumask_or(mask, mask, cpumask_of_node(k));
1776            }
1777        }
1778    }
1779
1780    /* Compute default topology size */
1781    for (i = 0; sched_domain_topology[i].mask; i++) {
1782        ;
1783    }
1784
1785    tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1786    if (!tl) {
1787        return;
1788    }
1789
1790    /*
1791     * Copy the default topology bits..
1792     */
1793    for (i = 0; sched_domain_topology[i].mask; i++) {
1794        tl[i] = sched_domain_topology[i];
1795    }
1796
1797    /*
1798     * Add the NUMA identity distance, aka single NODE.
1799     */
1800    tl[i++] = (struct sched_domain_topology_level) {.mask = sd_numa_mask, .numa_level = 0, SD_INIT_NAME(NODE)};
1801
1802    /*
1803     * .. and append 'j' levels of NUMA goodness.
1804     */
1805    for (j = 1; j < nr_levels; i++, j++) {
1806        tl[i] = (struct sched_domain_topology_level) {
1807            .mask = sd_numa_mask,
1808            .sd_flags = cpu_numa_flags,
1809            .flags = SDTL_OVERLAP,
1810            .numa_level = j,
1811            SD_INIT_NAME(NUMA)};
1812    }
1813
1814    sched_domain_topology = tl;
1815
1816    sched_domains_numa_levels = nr_levels;
1817    sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
1818
1819    init_numa_topology_type();
1820}
1821
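/*
 * Called when @cpu comes online: set its bit in every per-level, per-node
 * NUMA mask whose node lies within that level's distance of the CPU's node,
 * mirroring the mask construction above.
 */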
1822void sched_domains_numa_masks_set(unsigned int cpu)
1823{
1824    int node = cpu_to_node(cpu);
1825    int i, j;
1826
1827    for (i = 0; i < sched_domains_numa_levels; i++) {
1828        for (j = 0; j < nr_node_ids; j++) {
1829            if (node_distance(j, node) <= sched_domains_numa_distance[i]) {
1830                cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1831            }
1832        }
1833    }
1834}
1835
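/* Called when @cpu goes offline: remove it from every per-level, per-node NUMA mask. */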
1836void sched_domains_numa_masks_clear(unsigned int cpu)
1837{
1838    int i, j;
1839
1840    for (i = 0; i < sched_domains_numa_levels; i++) {
1841        for (j = 0; j < nr_node_ids; j++) {
1842            cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1843        }
1844    }
1845}
1846
1847/*
1848 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 *                             closest to @cpu from @cpus.
 * @cpus: cpumask to find a cpu from
 * @cpu: cpu to be close to
 *
 * Return: cpu, or nr_cpu_ids when nothing found.
1854 */
1855int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1856{
1857    int i, j = cpu_to_node(cpu);
1858
1859    for (i = 0; i < sched_domains_numa_levels; i++) {
1860        cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1861        if (cpu < nr_cpu_ids) {
1862            return cpu;
1863        }
1864    }
1865    return nr_cpu_ids;
1866}
1867
1868#endif /* CONFIG_NUMA */
1869
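/*
 * Allocate the per-CPU storage backing every topology level: percpu pointer
 * arrays for sched_domain, sched_domain_shared, sched_group and
 * sched_group_capacity, plus node-local objects for each CPU in @cpu_map.
 * Partial allocations left behind on failure are released by __sdt_free().
 */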
1870static int __sdt_alloc(const struct cpumask *cpu_map)
1871{
1872    struct sched_domain_topology_level *tl;
1873    int j;
1874
    for (tl = sched_domain_topology; tl->mask; tl++) {
1876        struct sd_data *sdd = &tl->data;
1877
1878        sdd->sd = alloc_percpu(struct sched_domain *);
1879        if (!sdd->sd) {
1880            return -ENOMEM;
1881        }
1882
1883        sdd->sds = alloc_percpu(struct sched_domain_shared *);
1884        if (!sdd->sds) {
1885            return -ENOMEM;
1886        }
1887
1888        sdd->sg = alloc_percpu(struct sched_group *);
1889        if (!sdd->sg) {
1890            return -ENOMEM;
1891        }
1892
1893        sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1894        if (!sdd->sgc) {
1895            return -ENOMEM;
1896        }
1897
1898        for_each_cpu(j, cpu_map)
1899        {
1900            struct sched_domain *sd;
1901            struct sched_domain_shared *sds;
1902            struct sched_group *sg;
1903            struct sched_group_capacity *sgc;
1904
1905            sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1906            if (!sd) {
1907                return -ENOMEM;
1908            }
1909
1910            *per_cpu_ptr(sdd->sd, j) = sd;
1911
1912            sds = kzalloc_node(sizeof(struct sched_domain_shared), GFP_KERNEL, cpu_to_node(j));
1913            if (!sds) {
1914                return -ENOMEM;
1915            }
1916
1917            *per_cpu_ptr(sdd->sds, j) = sds;
1918
1919            sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1920            if (!sg) {
1921                return -ENOMEM;
1922            }
1923
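            /* sched_groups form a circular list; a lone group points to itself. */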
1924            sg->next = sg;
1925
1926            *per_cpu_ptr(sdd->sg, j) = sg;
1927
1928            sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j));
1929            if (!sgc) {
1930                return -ENOMEM;
1931            }
1932
1933#ifdef CONFIG_SCHED_DEBUG
1934            sgc->id = j;
1935#endif
1936
1937            *per_cpu_ptr(sdd->sgc, j) = sgc;
1938        }
1939    }
1940
1941    return 0;
1942}
1943
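/*
 * Undo __sdt_alloc(): free any per-CPU domain/group objects that were not
 * claimed by a successfully built domain tree, then release the percpu
 * pointer arrays themselves.
 */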
1944static void __sdt_free(const struct cpumask *cpu_map)
1945{
1946    struct sched_domain_topology_level *tl;
1947    int j;
1948
    for (tl = sched_domain_topology; tl->mask; tl++) {
1950        struct sd_data *sdd = &tl->data;
1951
1952        for_each_cpu(j, cpu_map) {
1953            struct sched_domain *sd;
1954
1955            if (sdd->sd) {
1956                sd = *per_cpu_ptr(sdd->sd, j);
1957                if (sd && (sd->flags & SD_OVERLAP)) {
1958                    free_sched_groups(sd->groups, 0);
1959                }
1960                kfree(*per_cpu_ptr(sdd->sd, j));
1961            }
1962
1963            if (sdd->sds) {
1964                kfree(*per_cpu_ptr(sdd->sds, j));
1965            }
1966            if (sdd->sg) {
1967                kfree(*per_cpu_ptr(sdd->sg, j));
1968            }
1969            if (sdd->sgc) {
1970                kfree(*per_cpu_ptr(sdd->sgc, j));
1971            }
1972        }
1973        free_percpu(sdd->sd);
1974        sdd->sd = NULL;
1975        free_percpu(sdd->sds);
1976        sdd->sds = NULL;
1977        free_percpu(sdd->sg);
1978        sdd->sg = NULL;
1979        free_percpu(sdd->sgc);
1980        sdd->sgc = NULL;
1981    }
1982}
1983
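/*
 * Initialize a sched_domain for @cpu at topology level @tl and hook it up as
 * the parent of @child, sanity-checking that the child's span is contained
 * within the new domain's span.
 */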
1984static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map,
1985                                               struct sched_domain_attr *attr, struct sched_domain *child, int dflags,
1986                                               int cpu)
1987{
1988    struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
1989
1990    if (child) {
1991        sd->level = child->level + 1;
1992        sched_domain_level_max = max(sched_domain_level_max, sd->level);
1993        child->parent = sd;
1994
1995        if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
1996            pr_err("BUG: arch topology borken\n");
1997#ifdef CONFIG_SCHED_DEBUG
1998            pr_err("     the %s domain not a subset of the %s domain\n", child->name, sd->name);
1999#endif
2000            /* Fixup, ensure @sd has at least @child CPUs. */
2001            cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child));
2002        }
2003    }
2004    set_domain_attribute(sd, attr);
2005
2006    return sd;
2007}
2008
2009/*
2010 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
2011 * any two given CPUs at this (non-NUMA) topology level.
2012 */
2013static bool topology_span_sane(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, int cpu)
2014{
2015    int i;
2016
2017    /* NUMA levels are allowed to overlap */
2018    if (tl->flags & SDTL_OVERLAP) {
2019        return true;
2020    }
2021
2022    /*
2023     * Non-NUMA levels cannot partially overlap - they must be either
2024     * completely equal or completely disjoint. Otherwise we can end up
2025     * breaking the sched_group lists - i.e. a later get_group() pass
2026     * breaks the linking done for an earlier span.
2027     */
2028    for_each_cpu(i, cpu_map)
2029    {
2030        if (i == cpu) {
2031            continue;
2032        }
2033        /*
2034         * We should 'and' all those masks with 'cpu_map' to exactly
2035         * match the topology we're about to build, but that can only
2036         * remove CPUs, which only lessens our ability to detect
2037         * overlaps
2038         */
2039        if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && cpumask_intersects(tl->mask(cpu), tl->mask(i))) {
2040            return false;
2041        }
2042    }
2043
2044    return true;
2045}
2046
2047/*
2048 * Find the sched_domain_topology_level where all CPU capacities are visible
2049 * for all CPUs.
2050 */
2051static struct sched_domain_topology_level *asym_cpu_capacity_level(const struct cpumask *cpu_map)
2052{
2053    int i, j, asym_level = 0;
2054    bool asym = false;
2055    struct sched_domain_topology_level *tl, *asym_tl = NULL;
2056    unsigned long cap;
2057
2058    /* Is there any asymmetry? */
2059    cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
2060
2061    for_each_cpu(i, cpu_map)
2062    {
2063        if (arch_scale_cpu_capacity(i) != cap) {
2064            asym = true;
2065            break;
2066        }
2067    }
2068
2069    if (!asym) {
2070        return NULL;
2071    }
2072
2073    /*
     * Examine the topology from every CPU's point of view to detect the
     * lowest sched_domain_topology_level at which a highest-capacity CPU is
     * visible to everyone.
2077     */
2078    for_each_cpu(i, cpu_map)
2079    {
2080        unsigned long max_capacity = arch_scale_cpu_capacity(i);
2081        int tl_id = 0;
2082
        for (tl = sched_domain_topology; tl->mask; tl++) {
2084            if (tl_id < asym_level) {
2085                goto next_level;
2086            }
2087
2088            for_each_cpu_and(j, tl->mask(i), cpu_map) {
2089                unsigned long capacity;
2090
2091                capacity = arch_scale_cpu_capacity(j);
2092                if (capacity <= max_capacity) {
2093                    continue;
2094                }
2095
2096                max_capacity = capacity;
2097                asym_level = tl_id;
2098                asym_tl = tl;
2099            }
2100        next_level:
2101            tl_id++;
2102        }
2103    }
2104
2105    return asym_tl;
2106}
2107
2108/*
2109 * Build sched domains for a given set of CPUs and attach the sched domains
2110 * to the individual CPUs
2111 */
2112static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2113{
2114    enum s_alloc alloc_state = sa_none;
2115    struct sched_domain *sd;
2116    struct s_data d;
2117    struct rq *rq = NULL;
2118    int i, ret = -ENOMEM;
2119    struct sched_domain_topology_level *tl_asym;
2120    bool has_asym = false;
2121
2122    if (WARN_ON(cpumask_empty(cpu_map))) {
2123        goto error;
2124    }
2125
2126    alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2127    if (alloc_state != sa_rootdomain) {
2128        goto error;
2129    }
2130
2131    tl_asym = asym_cpu_capacity_level(cpu_map);
2132
2133    /* Set up domains for CPUs specified by the cpu_map: */
2134    for_each_cpu(i, cpu_map)
2135    {
2136        struct sched_domain_topology_level *tl;
2137        int dflags = 0;
2138
2139        sd = NULL;
        for (tl = sched_domain_topology; tl->mask; tl++) {
2141            if (tl == tl_asym) {
2142                dflags |= SD_ASYM_CPUCAPACITY;
2143                has_asym = true;
2144            }
2145
2146            if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) {
2147                goto error;
2148            }
2149
2150            sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
2151
2152            if (tl == sched_domain_topology) {
2153                *per_cpu_ptr(d.sd, i) = sd;
2154            }
2155            if (tl->flags & SDTL_OVERLAP) {
2156                sd->flags |= SD_OVERLAP;
2157            }
2158            if (cpumask_equal(cpu_map, sched_domain_span(sd))) {
2159                break;
2160            }
2161        }
2162    }
2163
2164    /* Build the groups for the domains */
2165    for_each_cpu(i, cpu_map)
2166    {
2167        for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2168            sd->span_weight = cpumask_weight(sched_domain_span(sd));
2169            if (sd->flags & SD_OVERLAP) {
2170                if (build_overlap_sched_groups(sd, i)) {
2171                    goto error;
2172                }
2173            } else {
2174                if (build_sched_groups(sd, i)) {
2175                    goto error;
2176                }
2177            }
2178        }
2179    }
2180
2181    /* Calculate CPU capacity for physical packages and nodes */
2182    for (i = nr_cpumask_bits - 1; i >= 0; i--) {
2183        if (!cpumask_test_cpu(i, cpu_map)) {
2184            continue;
2185        }
2186
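        /*
         * claim_allocations() marks the per-CPU data now referenced by this
         * domain tree so that __free_domain_allocs() will not free it.
         */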
2187        for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2188            claim_allocations(i, sd);
2189            init_sched_groups_capacity(i, sd);
2190        }
2191    }
2192
2193    /* Attach the domains */
2194    rcu_read_lock();
2195    for_each_cpu(i, cpu_map)
2196    {
2197#ifdef CONFIG_SCHED_RT_CAS
2198        int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
2199#endif
2200
2201        rq = cpu_rq(i);
2202        sd = *per_cpu_ptr(d.sd, i);
2203
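        /* CONFIG_SCHED_RT_CAS: remember the CPU with the largest original capacity in this root domain. */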
2204#ifdef CONFIG_SCHED_RT_CAS
2205        if (max_cpu < 0 || arch_scale_cpu_capacity(i) > arch_scale_cpu_capacity(max_cpu)) {
2206            WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
2207        }
2208#endif
2209
2210        /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
2211        if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) {
2212            WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2213        }
2214
2215        cpu_attach_domain(sd, d.rd, i);
2216    }
2217    rcu_read_unlock();
2218
2219    if (has_asym) {
2220        static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
2221    }
2222
2223    if (rq && sched_debug_enabled) {
2224        pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map),
2225                rq->rd->max_cpu_capacity);
2226    }
2227
2228    ret = 0;
2229error:
2230    __free_domain_allocs(&d, alloc_state, cpu_map);
2231
2232    return ret;
2233}
2234
2235/* Current sched domains: */
2236static cpumask_var_t *doms_cur;
2237
2238/* Number of sched domains in 'doms_cur': */
2239static int ndoms_cur;
2240
/* Attributes of custom domains in 'doms_cur' */
2242static struct sched_domain_attr *dattr_cur;
2243
2244/*
2245 * Special case: If a kmalloc() of a doms_cur partition (array of
2246 * cpumask) fails, then fallback to a single sched domain,
2247 * as determined by the single cpumask fallback_doms.
2248 */
2249static cpumask_var_t fallback_doms;
2250
2251/*
2252 * arch_update_cpu_topology lets virtualized architectures update the
2253 * CPU core maps. It is supposed to return 1 if the topology changed
2254 * or 0 if it stayed the same.
2255 */
2256int __weak arch_update_cpu_topology(void)
2257{
2258    return 0;
2259}
2260
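/*
 * Allocate an array of @ndoms cpumasks for use as sched-domain partitions.
 * Returns NULL if any allocation fails, freeing whatever was already
 * allocated.
 */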
2261cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2262{
2263    int i;
2264    cpumask_var_t *doms;
2265
2266    doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2267    if (!doms) {
2268        return NULL;
2269    }
2270    for (i = 0; i < ndoms; i++) {
2271        if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2272            free_sched_domains(doms, i);
2273            return NULL;
2274        }
2275    }
2276    return doms;
2277}
2278
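/* Free a cpumask array previously allocated with alloc_sched_domains(). */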
2279void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2280{
2281    unsigned int i;
2282    for (i = 0; i < ndoms; i++) {
2283        free_cpumask_var(doms[i]);
2284    }
2285    kfree(doms);
2286}
2287
2288/*
2289 * Set up scheduler domains and groups.  For now this just excludes isolated
2290 * CPUs, but could be used to exclude other special cases in the future.
2291 */
2292int sched_init_domains(const struct cpumask *cpu_map)
2293{
2294    int err;
2295
2296    zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
2297    zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2298    zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2299
2300    arch_update_cpu_topology();
2301    ndoms_cur = 1;
2302    doms_cur = alloc_sched_domains(ndoms_cur);
2303    if (!doms_cur) {
2304        doms_cur = &fallback_doms;
2305    }
2306    cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
2307    err = build_sched_domains(doms_cur[0], NULL);
2308    register_sched_domain_sysctl();
2309
2310    return err;
2311}
2312
2313/*
2314 * Detach sched domains from a group of CPUs specified in cpu_map
2315 * These CPUs will now be attached to the NULL domain
2316 */
2317static void detach_destroy_domains(const struct cpumask *cpu_map)
2318{
2319    unsigned int cpu = cpumask_any(cpu_map);
2320    int i;
2321
2322    if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) {
2323        static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2324    }
2325
2326    rcu_read_lock();
    for_each_cpu(i, cpu_map) {
        cpu_attach_domain(NULL, &def_root_domain, i);
    }
2328    rcu_read_unlock();
2329}
2330
/* Handle NULL as "default" */
2332static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, struct sched_domain_attr *new, int idx_new)
2333{
2334    struct sched_domain_attr tmp;
2335
2336    /* Fast path: */
2337    if (!new && !cur) {
2338        return 1;
2339    }
2340
2341    tmp = SD_ATTR_INIT;
2342
2343    return !memcmp(cur ? (cur + idx_cur) : &tmp, new ? (new + idx_new) : &tmp, sizeof(struct sched_domain_attr));
2344}
2345
2346/*
2347 * Partition sched domains as specified by the 'ndoms_new'
2348 * cpumasks in the array doms_new[] of cpumasks. This compares
2349 * doms_new[] to the current sched domain partitioning, doms_cur[].
2350 * It destroys each deleted domain and builds each new domain.
2351 *
2352 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
2353 * The masks don't intersect (don't overlap.) We should setup one
2354 * sched domain for each mask. CPUs not in any of the cpumasks will
2355 * not be load balanced. If the same cpumask appears both in the
2356 * current 'doms_cur' domains and in the new 'doms_new', we can leave
2357 * it as it is.
2358 *
2359 * The passed in 'doms_new' should be allocated using
2360 * alloc_sched_domains.  This routine takes ownership of it and will
2361 * free_sched_domains it when done with it. If the caller failed the
2362 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; this also forces the domains to be rebuilt.
2365 *
2366 * If doms_new == NULL it will be replaced with cpu_online_mask.
2367 * ndoms_new == 0 is a special case for destroying existing domains,
2368 * and it will not create the default domain.
2369 *
2370 * Call with hotplug lock and sched_domains_mutex held
2371 */
2372void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new)
2373{
2374    bool __maybe_unused has_eas = false;
2375    int i, j, n;
2376    int new_topology;
2377
2378    lockdep_assert_held(&sched_domains_mutex);
2379
2380    /* Always unregister in case we don't destroy any domains: */
2381    unregister_sched_domain_sysctl();
2382
2383    /* Let the architecture update CPU core mappings: */
2384    new_topology = arch_update_cpu_topology();
2385
2386    if (!doms_new) {
2387        WARN_ON_ONCE(dattr_new);
2388        n = 0;
2389        doms_new = alloc_sched_domains(1);
2390        if (doms_new) {
2391            n = 1;
2392            cpumask_and(doms_new[0], cpu_active_mask, housekeeping_cpumask(HK_FLAG_DOMAIN));
2393        }
2394    } else {
2395        n = ndoms_new;
2396    }
2397
2398    /* Destroy deleted domains: */
2399    for (i = 0; i < ndoms_cur; i++) {
2400        for (j = 0; j < n && !new_topology; j++) {
2401            if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) {
2402                struct root_domain *rd;
2403
2404                /*
2405                 * This domain won't be destroyed and as such
                 * its dl_bw->total_bw needs to be cleared. It will be
                 * recomputed by update_tasks_root_domain().
2409                 */
2410                rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2411                dl_clear_root_domain(rd);
2412                goto match1;
2413            }
2414        }
2415        /* No match - a current sched domain not in new doms_new[] */
2416        detach_destroy_domains(doms_cur[i]);
2417    match1:;
2418    }
2419
2420    n = ndoms_cur;
2421    if (!doms_new) {
2422        n = 0;
2423        doms_new = &fallback_doms;
2424        cpumask_and(doms_new[0], cpu_active_mask, housekeeping_cpumask(HK_FLAG_DOMAIN));
2425    }
2426
2427    /* Build new domains: */
2428    for (i = 0; i < ndoms_new; i++) {
2429        for (j = 0; j < n && !new_topology; j++) {
2430            if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) {
2431                goto match2;
2432            }
2433        }
2434        /* No match - add a new doms_new */
2435        build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2436    match2:;
2437    }
2438
2439#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2440    /* Build perf. domains: */
2441    for (i = 0; i < ndoms_new; i++) {
2442        for (j = 0; j < n && !sched_energy_update; j++) {
2443            if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2444                has_eas = true;
2445                goto match3;
2446            }
2447        }
2448        /* No match - add perf. domains for a new rd */
2449        has_eas |= build_perf_domains(doms_new[i]);
2450    match3:;
2451    }
2452    sched_energy_set(has_eas);
2453#endif
2454
2455    /* Remember the new sched domains: */
2456    if (doms_cur != &fallback_doms) {
2457        free_sched_domains(doms_cur, ndoms_cur);
2458    }
2459
2460    kfree(dattr_cur);
2461    doms_cur = doms_new;
2462    dattr_cur = dattr_new;
2463    ndoms_cur = ndoms_new;
2464
2465    register_sched_domain_sysctl();
2466}
2467
2468/*
2469 * Call with hotplug lock held
2470 */
2471void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new)
2472{
2473    mutex_lock(&sched_domains_mutex);
2474    partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2475    mutex_unlock(&sched_domains_mutex);
2476}
2477