// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"

MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");
/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if the hardware iWARP peer is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if the peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;

/* Attach siw also with loopback devices */
const bool loopback_enabled = true;

/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;

/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;

/* Selects MPA P2P mode (additional handshake during connection
 * setup), if true.
 */
const bool peer_to_peer;

struct task_struct *siw_tx_thread[NR_CPUS];
struct crypto_shash *siw_crypto_shash;

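/*
 * siw_device_register() - Register @sdev with the RDMA core under @name.
 * A driver-local, monotonically increasing vendor_part_id is assigned
 * to tell siw devices apart.
 */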
static int siw_device_register(struct siw_device *sdev, const char *name)
{
	struct ib_device *base_dev = &sdev->base_dev;
	static int dev_id = 1;
	int rv;

	sdev->vendor_part_id = dev_id++;

	rv = ib_register_device(base_dev, name, NULL);
	if (rv) {
		pr_warn("siw: device registration error %d\n", rv);
		return rv;
	}

	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid);
	return 0;
}

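/*
 * siw_device_cleanup() - Final device teardown, invoked by the RDMA core
 * through the .dealloc_driver hook. Destroys the QP and memory object
 * ID spaces of the device.
 */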
static void siw_device_cleanup(struct ib_device *base_dev)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	xa_destroy(&sdev->qp_xa);
	xa_destroy(&sdev->mem_xa);
}

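/*
 * siw_dev_qualified() - Check whether @netdev is eligible for siw
 * attachment. Returns 1 for supported link layer types, 0 otherwise.
 */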
static int siw_dev_qualified(struct net_device *netdev)
{
	/*
	 * Additional hardware support can be added here
	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
	 * <linux/if_arp.h> for type identifiers.
	 */
	if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
	    netdev->type == ARPHRD_NONE ||
	    (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
		return 1;

	return 0;
}

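/* Per-CPU counter of QPs currently assigned to the CPU's TX thread */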
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

static struct {
	struct cpumask **tx_valid_cpus;
	int num_nodes;
} siw_cpu_info;

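/*
 * siw_init_cpulist() - Build per-NUMA-node masks of CPUs eligible to
 * run a TX thread. All partially allocated state is released on failure.
 */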
static int siw_init_cpulist(void)
{
	int i, num_nodes = nr_node_ids;

	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));

	siw_cpu_info.num_nodes = num_nodes;

	siw_cpu_info.tx_valid_cpus =
		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
	if (!siw_cpu_info.tx_valid_cpus) {
		siw_cpu_info.num_nodes = 0;
		return -ENOMEM;
	}
	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
		siw_cpu_info.tx_valid_cpus[i] =
			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
		if (!siw_cpu_info.tx_valid_cpus[i])
			goto out_err;

		cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
	}
	for_each_possible_cpu(i)
		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);

	return 0;

out_err:
	siw_cpu_info.num_nodes = 0;
	while (--i >= 0)
		kfree(siw_cpu_info.tx_valid_cpus[i]);
	kfree(siw_cpu_info.tx_valid_cpus);
	siw_cpu_info.tx_valid_cpus = NULL;

	return -ENOMEM;
}

static void siw_destroy_cpulist(void)
{
	int i = 0;

	while (i < siw_cpu_info.num_nodes)
		kfree(siw_cpu_info.tx_valid_cpus[i++]);

	kfree(siw_cpu_info.tx_valid_cpus);
}

/*
 * Choose the CPU with the least number of active QPs from the NUMA
 * node of the TX interface.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
	const struct cpumask *tx_cpumask;
	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;

	if (node < 0)
		tx_cpumask = cpu_online_mask;
	else
		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];

	num_cpus = cpumask_weight(tx_cpumask);
	if (!num_cpus) {
		/* no CPU on this NUMA node */
		tx_cpumask = cpu_online_mask;
		num_cpus = cpumask_weight(tx_cpumask);
	}
	if (!num_cpus)
		goto out;

	cpu = cpumask_first(tx_cpumask);

	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
		int usage;

		/* Skip any cores which have no TX thread */
		if (!siw_tx_thread[cpu])
			continue;

		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
		if (usage <= min_use) {
			tx_cpu = cpu;
			min_use = usage;
		}
	}
	siw_dbg(&sdev->base_dev,
		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);

out:
	if (tx_cpu >= 0)
		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
	else
		pr_warn("siw: no tx cpu found\n");

	return tx_cpu;
}

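/*
 * siw_put_tx_cpu() - Drop the usage count on @cpu taken by a
 * successful siw_get_tx_cpu() call.
 */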
void siw_put_tx_cpu(int cpu)
{
	atomic_dec(&per_cpu(siw_use_cnt, cpu));
}

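/*
 * siw_get_base_qp() - Map a QP number to its ib_qp, as needed by the
 * iWARP connection manager (.iw_get_qp). Returns NULL if no QP with
 * that ID exists.
 */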
static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
{
	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);

	if (qp) {
		/*
		 * siw_qp_id2obj() increments object reference count
		 */
		siw_qp_put(qp);
		return &qp->base_qp;
	}
	return NULL;
}

static const struct ib_device_ops siw_device_ops = {
	.owner = THIS_MODULE,
	.uverbs_abi_ver = SIW_ABI_VERSION,
	.driver_id = RDMA_DRIVER_SIW,

	.alloc_mr = siw_alloc_mr,
	.alloc_pd = siw_alloc_pd,
	.alloc_ucontext = siw_alloc_ucontext,
	.create_cq = siw_create_cq,
	.create_qp = siw_create_qp,
	.create_srq = siw_create_srq,
	.dealloc_driver = siw_device_cleanup,
	.dealloc_pd = siw_dealloc_pd,
	.dealloc_ucontext = siw_dealloc_ucontext,
	.dereg_mr = siw_dereg_mr,
	.destroy_cq = siw_destroy_cq,
	.destroy_qp = siw_destroy_qp,
	.destroy_srq = siw_destroy_srq,
	.get_dma_mr = siw_get_dma_mr,
	.get_port_immutable = siw_get_port_immutable,
	.iw_accept = siw_accept,
	.iw_add_ref = siw_qp_get_ref,
	.iw_connect = siw_connect,
	.iw_create_listen = siw_create_listen,
	.iw_destroy_listen = siw_destroy_listen,
	.iw_get_qp = siw_get_base_qp,
	.iw_reject = siw_reject,
	.iw_rem_ref = siw_qp_put_ref,
	.map_mr_sg = siw_map_mr_sg,
	.mmap = siw_mmap,
	.mmap_free = siw_mmap_free,
	.modify_qp = siw_verbs_modify_qp,
	.modify_srq = siw_modify_srq,
	.poll_cq = siw_poll_cq,
	.post_recv = siw_post_receive,
	.post_send = siw_post_send,
	.post_srq_recv = siw_post_srq_recv,
	.query_device = siw_query_device,
	.query_gid = siw_query_gid,
	.query_port = siw_query_port,
	.query_qp = siw_query_qp,
	.query_srq = siw_query_srq,
	.req_notify_cq = siw_req_notify_cq,
	.reg_user_mr = siw_reg_user_mr,

	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
	INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};

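/*
 * siw_device_create() - Allocate and initialize a siw device bound to
 * @netdev. The GID is derived from the interface hardware address, or
 * randomly generated if the interface has none. Returns NULL on failure.
 */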
static struct siw_device *siw_device_create(struct net_device *netdev)
{
	struct siw_device *sdev = NULL;
	struct ib_device *base_dev;
	int rv;

	sdev = ib_alloc_device(siw_device, base_dev);
	if (!sdev)
		return NULL;

	base_dev = &sdev->base_dev;
	sdev->netdev = netdev;

	if (netdev->addr_len) {
		memcpy(sdev->raw_gid, netdev->dev_addr,
		       min_t(unsigned int, netdev->addr_len, ETH_ALEN));
	} else {
		/*
		 * This device does not have a HW address, but
		 * connection management requires a unique gid.
		 */
		eth_random_addr(sdev->raw_gid);
	}
	addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid);

	base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);

	base_dev->node_type = RDMA_NODE_RNIC;
	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
	       sizeof(SIW_NODE_DESC_COMMON));

	/*
	 * Current model (one-to-one device association):
	 * One SoftiWARP device per net_device or, equivalently,
	 * per physical port.
	 */
	base_dev->phys_port_cnt = 1;
	base_dev->num_comp_vectors = num_possible_cpus();

	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

	ib_set_device_ops(base_dev, &siw_device_ops);
	rv = ib_device_set_netdev(base_dev, netdev, 1);
	if (rv)
		goto error;

	memcpy(base_dev->iw_ifname, netdev->name,
	       sizeof(base_dev->iw_ifname));

	/* Disable TCP port mapping */
	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

	sdev->attrs.max_qp = SIW_MAX_QP;
	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
	sdev->attrs.max_sge = SIW_MAX_SGE;
	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
	sdev->attrs.max_cq = SIW_MAX_CQ;
	sdev->attrs.max_cqe = SIW_MAX_CQE;
	sdev->attrs.max_mr = SIW_MAX_MR;
	sdev->attrs.max_pd = SIW_MAX_PD;
	sdev->attrs.max_mw = SIW_MAX_MW;
	sdev->attrs.max_srq = SIW_MAX_SRQ;
	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
	sdev->attrs.max_srq_sge = SIW_MAX_SGE;

	INIT_LIST_HEAD(&sdev->cep_list);
	INIT_LIST_HEAD(&sdev->qp_list);

	atomic_set(&sdev->num_ctx, 0);
	atomic_set(&sdev->num_srq, 0);
	atomic_set(&sdev->num_qp, 0);
	atomic_set(&sdev->num_cq, 0);
	atomic_set(&sdev->num_mr, 0);
	atomic_set(&sdev->num_pd, 0);

	sdev->numa_node = dev_to_node(&netdev->dev);
	spin_lock_init(&sdev->lock);

	return sdev;
error:
	ib_dealloc_device(base_dev);

	return NULL;
}

/*
 * Network link becomes unavailable. Move all
 * affected QPs into ERROR state.
 */
static void siw_netdev_down(struct work_struct *work)
{
	struct siw_device *sdev =
		container_of(work, struct siw_device, netdev_down);

	struct siw_qp_attrs qp_attrs;
	struct list_head *pos, *tmp;

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = SIW_QP_STATE_ERROR;

	list_for_each_safe(pos, tmp, &sdev->qp_list) {
		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);

		down_write(&qp->state_lock);
		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
		up_write(&qp->state_lock);
	}
	ib_device_put(&sdev->base_dev);
}

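/*
 * siw_device_goes_down() - Defer link-down handling to a workqueue.
 * The device reference taken here is released by siw_netdev_down().
 */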
static void siw_device_goes_down(struct siw_device *sdev)
{
	if (ib_device_try_get(&sdev->base_dev)) {
		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
		schedule_work(&sdev->netdev_down);
	}
}

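/*
 * siw_netdev_event() - Netdev notifier callback: translate net_device
 * state changes into RDMA port events on the matching siw device, if any.
 */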
static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *arg)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
	struct ib_device *base_dev;
	struct siw_device *sdev;

	dev_dbg(&netdev->dev, "siw: event %lu\n", event);

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (!base_dev)
		return NOTIFY_OK;

	sdev = to_siw_dev(base_dev);

	switch (event) {
	case NETDEV_UP:
		sdev->state = IB_PORT_ACTIVE;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
		break;

	case NETDEV_GOING_DOWN:
		siw_device_goes_down(sdev);
		break;

	case NETDEV_DOWN:
		sdev->state = IB_PORT_DOWN;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
		break;

	case NETDEV_REGISTER:
		/*
		 * Device registration is now handled only by
		 * rdma netlink commands, so it should be impossible
		 * to end up here with a valid siw device.
		 */
		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
		break;

	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&sdev->base_dev);
		break;

	case NETDEV_CHANGEADDR:
		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
		break;
	/*
	 * TODO: The netdev events below are currently not handled.
	 */
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
		break;

	default:
		break;
	}
	ib_device_put(&sdev->base_dev);

	return NOTIFY_OK;
}

static struct notifier_block siw_netdev_nb = {
	.notifier_call = siw_netdev_event,
};

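/*
 * siw_newlink() - Create a new siw device on top of @netdev, triggered
 * by an "rdma link add" netlink request, e.g.:
 *
 *	rdma link add siw0 type siw netdev eth0
 *
 * (link and interface names above are just examples). Fails with
 * -EEXIST if the interface already has a siw device attached.
 */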
static int siw_newlink(const char *basedev_name, struct net_device *netdev)
{
	struct ib_device *base_dev;
	struct siw_device *sdev = NULL;
	int rv = -ENOMEM;

	if (!siw_dev_qualified(netdev))
		return -EINVAL;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (base_dev) {
		ib_device_put(base_dev);
		return -EEXIST;
	}
	sdev = siw_device_create(netdev);
	if (sdev) {
		dev_dbg(&netdev->dev, "siw: new device\n");

		if (netif_running(netdev) && netif_carrier_ok(netdev))
			sdev->state = IB_PORT_ACTIVE;
		else
			sdev->state = IB_PORT_DOWN;

		rv = siw_device_register(sdev, basedev_name);
		if (rv)
			ib_dealloc_device(&sdev->base_dev);
	}
	return rv;
}

static struct rdma_link_ops siw_link_ops = {
	.type = "siw",
	.newlink = siw_newlink,
};

/*
 * siw_init_module - Initialize SoftiWARP module and register with netdev
 *                   subsystem.
 */
static __init int siw_init_module(void)
{
	int rv;

	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
		pr_info("siw: sendpage threshold too small: %d\n",
			(int)SENDPAGE_THRESH);
		rv = -EINVAL;
		goto out_error;
	}
	rv = siw_init_cpulist();
	if (rv)
		goto out_error;

	rv = siw_cm_init();
	if (rv)
		goto out_error;

	if (!siw_create_tx_threads()) {
		pr_info("siw: Could not start any TX thread\n");
		rv = -ENOMEM;
		goto out_error;
	}
	/*
	 * Locate CRC32 algorithm. If unsuccessful, fail
	 * loading siw only if CRC is required.
	 */
	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(siw_crypto_shash)) {
		pr_info("siw: Loading CRC32c failed: %ld\n",
			PTR_ERR(siw_crypto_shash));
		siw_crypto_shash = NULL;
		if (mpa_crc_required) {
			rv = -EOPNOTSUPP;
			goto out_error;
		}
	}
	rv = register_netdevice_notifier(&siw_netdev_nb);
	if (rv)
		goto out_error;

	rdma_link_register(&siw_link_ops);

	pr_info("SoftiWARP attached\n");
	return 0;

out_error:
	siw_stop_tx_threads();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP attach failed. Error: %d\n", rv);

	siw_cm_exit();
	siw_destroy_cpulist();

	return rv;
}

static void __exit siw_exit_module(void)
{
	siw_stop_tx_threads();

	unregister_netdevice_notifier(&siw_netdev_nb);
	rdma_link_unregister(&siw_link_ops);
	ib_unregister_driver(RDMA_DRIVER_SIW);

	siw_cm_exit();

	siw_destroy_cpulist();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP detached\n");
}

module_init(siw_init_module);
module_exit(siw_exit_module);

MODULE_ALIAS_RDMA_LINK("siw");