/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}
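
/*
 * Illustrative walk-through (not driver code) of the helpers above for a
 * cpu_mask_set whose mask is {2,3}:
 *
 *   cpu_mask_set_get_first() -> 2    used = {2},   gen = 0
 *   cpu_mask_set_get_first() -> 3    used = {2,3}, gen = 0
 *   cpu_mask_set_get_first() -> 2    gen bumps to 1, used resets to {2}
 *   cpu_mask_set_put(set, 2)         used drains, so gen drops back to 0 and
 *                                    used is refilled from mask ({2,3}) to
 *                                    keep accounting for the previous
 *                                    generation's borrowers
 */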

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}
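
/*
 * Worked example for init_real_cpu_mask() (topology is assumed, for
 * illustration only): with 8 online CPUs and 2 HT siblings per core
 * enumerated as 0-3 (first threads) and 4-7 (second threads), possible = 8
 * and ht = 2.  Step 1 walks past CPUs 0-3 and step 2 clears CPUs 4-7,
 * leaving real_cpu_mask = 0-3, i.e. one HW thread per physical core.
 */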

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	pci_dev_put(dev);

	return 0;
}
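
/*
 * For example (hypothetical system): two HFI devices on NUMA node 0 and one
 * on node 1 leave hfi1_per_node_cntr = {2, 1}.  If the BIOS reports no PCI
 * NUMA information (pcibus_to_node() < 0), the fallback above assumes one
 * device per node instead.
 */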

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
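
/*
 * Illustration (counter values are hypothetical): with per-CPU
 * comp_vect_affinity counts of {cpu0: 1, cpu1: 0, cpu2: 2},
 * per_cpu_affinity_get() picks cpu1 (lowest count) and bumps it to 1, while
 * per_cpu_affinity_put_max() picks cpu2 (highest count) and drops it to 1.
 * Paired get/put calls therefore keep completion-vector CPUs roughly evenly
 * loaded.
 */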

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}
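
/*
 * Sketch of the selection order above (CPU numbers are illustrative): if the
 * device's comp_vect mask is {8-11}, CPUs 8-9 are already used and CPU 10
 * also services an SDMA interrupt, then available_cpus = {10,11},
 * non_intr_cpus = {11} and CPU 11 is returned.  Only when every remaining
 * CPU also services an interrupt does the code fall back to one of those.
 */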

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}
/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}
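
/*
 * Typical use (illustrative sketch): a caller such as rdmavt's completion
 * vector CPU lookup passes a vector index and gets back the CPU that vector
 * was pinned to, e.g.
 *
 *	cpu = hfi1_comp_vect_mappings_lookup(rdi, comp_vect);
 *	if (cpu < 0)
 *		return cpu;	(no mapping table, or comp_vect out of range)
 *
 * The table itself is built by _dev_comp_vect_mappings_create() above.
 */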

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available is the number of
	 * available CPUs divided by the number of devices on the local
	 * NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
				       hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among the devices, the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}
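
	/*
	 * Worked example (numbers are illustrative): with 12 CPUs in
	 * comp_vect_mask and 2 devices on the node, each device reserves
	 * 12 / 2 = 6 CPUs.  With 13 CPUs, 13 % 2 != 0, so the first device
	 * initialized reserves 7 while the second reserves 6.
	 */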

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 */
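/*
 * Example layout (assumed, for illustration): on a node whose "real" CPUs
 * are 0-7, with one device and three kernel receive queues, CPU 0 becomes
 * the general/control interrupt CPU, CPUs 1-2 go to rcv_intr for the two
 * non-control receive contexts, and CPUs 3-7 stay in def_intr for the SDMA
 * engines.  comp_vect_mask is then the node's real CPUs minus the rcv_intr
 * and general CPUs, i.e. 3-7 as well.
 */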
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_weight(&entry->comp_vect_mask) == 0)
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}
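
/*
 * Example for find_hw_thread_mask() (topology is assumed): 32 online CPUs,
 * 2 sockets and 2 HT siblings per core give num_cores_per_socket =
 * 32 / 2 / 2 = 8.  For hw_thread_no = 0 the first 16 CPUs (one thread per
 * physical core) are kept; for hw_thread_no = 1 the mask is shifted left by
 * 16 so it covers the second HW thread on those cores, assuming sibling
 * threads are enumerated in the upper half of the CPU ID space.
 */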

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then the second set of HT
	 *     cores, and so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */
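
	/*
	 * Concrete example (CPU numbers are hypothetical): for a device on
	 * NUMA node 0 with node CPUs 0-7, where CPUs 0-1 run this device's
	 * IRQ handlers and CPUs 2-3 are already used by other processes,
	 * the search below recommends CPU 4 first (same node, no IRQ
	 * handler), then falls back to CPUs 0-1 (same node, IRQ handler),
	 * and only then to CPUs on other NUMA nodes.
	 */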

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will be non-empty at least once in this loop
			 * because the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs available that
	 *    are not running interrupt handlers, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}
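
/*
 * Usage sketch (illustrative only, not part of the driver): a caller that
 * recommends a CPU for a new user context and later returns it would do
 * roughly the following; error handling is elided.
 *
 *	int cpu = hfi1_get_proc_affinity(dd->node);
 *
 *	(bind the process and/or record cpu in the context; cpu may be -1)
 *
 *	hfi1_put_proc_affinity(cpu);	(safe even when cpu == -1)
 */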