// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

/*
 * Move up to @cpus_per_grp CPUs from @nmsk into @irqmsk, preferring to keep
 * SMT siblings of an already picked CPU in the same group.
 */
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs on this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * 'Active CPUs' means, for each node, the CPUs in
 * '@cpu_mask AND @node_to_cpumask[]'.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups to each node according to the ratio of the
	 * node's CPU count to the remaining un-assigned CPUs. Here
	 * 'numgrps' is bigger than the number of active NUMA nodes, and
	 * the allocation always starts from the node with the fewest
	 * CPUs.
	 *
	 * This guarantees that every active node gets at least one
	 * group: over-allocation can only happen for a node that ends
	 * up with exactly one group, and because 'numgrps' exceeds the
	 * number of NUMA nodes, the remaining nodes are still left with
	 * >= 1 group each.
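	 *
	 * A purely illustrative walk through the loop below (made-up
	 * numbers): with numgrps = 8 and two active nodes, one with 2
	 * CPUs and one with 14 CPUs, the smaller node is handled first
	 * and gets max(1, 8 * 2 / 16) = 1 group; after updating the
	 * remainders the larger node gets max(1, 7 * 14 / 14) = 7
	 * groups, so all 8 groups are handed out and neither node gets
	 * more groups than it has CPUs.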
	 *
	 * One useful invariant is that the number of groups allocated
	 * to each node is <= the CPU count of that node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N; suppose
	 *	G = N - delta, with 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) proof that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *	round_down(G * ncpu(A) / N) =
	 *	round_down((N - delta) * ncpu(A) / N) =
	 *	round_down((N * ncpu(A) - delta * ncpu(A)) / N)	 >=
	 *	round_down((N * ncpu(A) - delta * N) / N)	 =
	 *	ncpu(A) - delta
	 *
	 * then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A) = N - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes the argument reduces to one node versus
	 * the combination of all the others, which is exactly how this
	 * algorithm proceeds: 'remaining_ncpus' and 'numgrps' are
	 * re-calculated after every node, so finally, for each node X:
	 * grps(X) <= ncpu(X).
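	 *
	 * (A quick check of the two-node bound with made-up numbers:
	 * ncpu(A) = 2, ncpu(B) = 6, so N = 8; with G = 5 this gives
	 * grps(A) = max(1, round_down(5 * 2 / 8)) = 1 and
	 * grps(B) = 5 - 1 = 4, both within their nodes' CPU counts.)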
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, just spread the groups across the nodes.
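	 * (E.g., with startgrp == 0, 2 groups and 4 active nodes: the
	 * 1st and 3rd nodes' CPUs land in group 0 and the 2nd and 4th
	 * nodes' CPUs in group 1, because 'curgrp' wraps around.)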
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* Allocate a group count for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread the allocated groups over the CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate for rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * Wrapping has to be considered, given that
			 * 'startgrp' may be any group.
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise.
 * Each element of the array holds the CPUs assigned to that group.
 *
 * Try to put CPUs that are close in terms of CPU and NUMA locality into
 * the same group, and run a two-stage grouping:
 *	1) allocate present CPUs to these groups evenly first
 *	2) allocate the other possible CPUs to these groups evenly
 *
 * The resulting grouping guarantees that all CPUs are covered and that
 * no CPU is assigned to more than one group.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask' so that the two spreading
	 * stages observe a consistent view of it without holding the CPU
	 * hotplug lock, which reduces the deadlock risk against the CPU
	 * hotplug code.
	 *
	 * A CPU may be hotplugged while 'cpu_present_mask' is being read;
	 * that is acceptable because it only determines whether the
	 * hotplugged CPU is handled in the 1st or the 2nd stage, and either
	 * way is correct from the API user's viewpoint since the two-stage
	 * spread is merely an optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* Group the present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate the non-present CPUs starting from the next group to be
	 * handled. If the grouping of the present CPUs already exhausted
	 * the group space, assign the non-present CPUs to the groups that
	 * were already allocated.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* Assign all CPUs (i.e. CPU 0 on UP) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);
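
/*
 * Illustrative usage sketch (not part of this file; the driver-side helper
 * 'my_setup_queue_affinity' and the 'nr_queues' value are hypothetical).
 * A typical caller asks for one group per queue/vector and then walks the
 * returned array, which it owns and must kfree():
 *
 *	struct cpumask *masks;
 *	unsigned int i, nr_queues = 4;
 *
 *	masks = group_cpus_evenly(nr_queues);
 *	if (!masks)
 *		return -ENOMEM;
 *	for (i = 0; i < nr_queues; i++)
 *		my_setup_queue_affinity(i, &masks[i]);
 *	kfree(masks);
 *
 * Every possible CPU shows up in exactly one of the 'nr_queues' masks, with
 * CPUs that share a NUMA node or SMT core kept together where possible.
 */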