xref: /kernel/linux/linux-6.6/drivers/nvme/host/tcp.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP host.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/blk-mq.h>
15#include <crypto/hash.h>
16#include <net/busy_poll.h>
17#include <trace/events/sock.h>
18
19#include "nvme.h"
20#include "fabrics.h"
21
22struct nvme_tcp_queue;
23
24/* Define the socket priority to use for connections were it is desirable
25 * that the NIC consider performing optimized packet processing or filtering.
26 * A non-zero value being sufficient to indicate general consideration of any
27 * possible optimization.  Making it a module param allows for alternative
28 * values that may be unique for some NIC implementations.
29 */
30static int so_priority;
31module_param(so_priority, int, 0644);
32MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
33
34#ifdef CONFIG_DEBUG_LOCK_ALLOC
35/* lockdep can detect a circular dependency of the form
36 *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
37 * because dependencies are tracked for both nvme-tcp and user contexts. Using
38 * a separate class prevents lockdep from conflating nvme-tcp socket use with
39 * user-space socket API use.
40 */
41static struct lock_class_key nvme_tcp_sk_key[2];
42static struct lock_class_key nvme_tcp_slock_key[2];
43
44static void nvme_tcp_reclassify_socket(struct socket *sock)
45{
46	struct sock *sk = sock->sk;
47
48	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
49		return;
50
51	switch (sk->sk_family) {
52	case AF_INET:
53		sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
54					      &nvme_tcp_slock_key[0],
55					      "sk_lock-AF_INET-NVME",
56					      &nvme_tcp_sk_key[0]);
57		break;
58	case AF_INET6:
59		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
60					      &nvme_tcp_slock_key[1],
61					      "sk_lock-AF_INET6-NVME",
62					      &nvme_tcp_sk_key[1]);
63		break;
64	default:
65		WARN_ON_ONCE(1);
66	}
67}
68#else
69static void nvme_tcp_reclassify_socket(struct socket *sock) { }
70#endif
71
72enum nvme_tcp_send_state {
73	NVME_TCP_SEND_CMD_PDU = 0,
74	NVME_TCP_SEND_H2C_PDU,
75	NVME_TCP_SEND_DATA,
76	NVME_TCP_SEND_DDGST,
77};
78
79struct nvme_tcp_request {
80	struct nvme_request	req;
81	void			*pdu;
82	struct nvme_tcp_queue	*queue;
83	u32			data_len;
84	u32			pdu_len;
85	u32			pdu_sent;
86	u32			h2cdata_left;
87	u32			h2cdata_offset;
88	u16			ttag;
89	__le16			status;
90	struct list_head	entry;
91	struct llist_node	lentry;
92	__le32			ddgst;
93
94	struct bio		*curr_bio;
95	struct iov_iter		iter;
96
97	/* send state */
98	size_t			offset;
99	size_t			data_sent;
100	enum nvme_tcp_send_state state;
101};
102
103enum nvme_tcp_queue_flags {
104	NVME_TCP_Q_ALLOCATED	= 0,
105	NVME_TCP_Q_LIVE		= 1,
106	NVME_TCP_Q_POLLING	= 2,
107};
108
109enum nvme_tcp_recv_state {
110	NVME_TCP_RECV_PDU = 0,
111	NVME_TCP_RECV_DATA,
112	NVME_TCP_RECV_DDGST,
113};
114
115struct nvme_tcp_ctrl;
116struct nvme_tcp_queue {
117	struct socket		*sock;
118	struct work_struct	io_work;
119	int			io_cpu;
120
121	struct mutex		queue_lock;
122	struct mutex		send_mutex;
123	struct llist_head	req_list;
124	struct list_head	send_list;
125
126	/* recv state */
127	void			*pdu;
128	int			pdu_remaining;
129	int			pdu_offset;
130	size_t			data_remaining;
131	size_t			ddgst_remaining;
132	unsigned int		nr_cqe;
133
134	/* send state */
135	struct nvme_tcp_request *request;
136
137	u32			maxh2cdata;
138	size_t			cmnd_capsule_len;
139	struct nvme_tcp_ctrl	*ctrl;
140	unsigned long		flags;
141	bool			rd_enabled;
142
143	bool			hdr_digest;
144	bool			data_digest;
145	struct ahash_request	*rcv_hash;
146	struct ahash_request	*snd_hash;
147	__le32			exp_ddgst;
148	__le32			recv_ddgst;
149
150	struct page_frag_cache	pf_cache;
151
152	void (*state_change)(struct sock *);
153	void (*data_ready)(struct sock *);
154	void (*write_space)(struct sock *);
155};
156
157struct nvme_tcp_ctrl {
158	/* read only in the hot path */
159	struct nvme_tcp_queue	*queues;
160	struct blk_mq_tag_set	tag_set;
161
162	/* other member variables */
163	struct list_head	list;
164	struct blk_mq_tag_set	admin_tag_set;
165	struct sockaddr_storage addr;
166	struct sockaddr_storage src_addr;
167	struct nvme_ctrl	ctrl;
168
169	struct work_struct	err_work;
170	struct delayed_work	connect_work;
171	struct nvme_tcp_request async_req;
172	u32			io_queues[HCTX_MAX_TYPES];
173};
174
175static LIST_HEAD(nvme_tcp_ctrl_list);
176static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
177static struct workqueue_struct *nvme_tcp_wq;
178static const struct blk_mq_ops nvme_tcp_mq_ops;
179static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
180static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
181
182static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
183{
184	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
185}
186
187static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
188{
189	return queue - queue->ctrl->queues;
190}
191
192static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
193{
194	u32 queue_idx = nvme_tcp_queue_id(queue);
195
196	if (queue_idx == 0)
197		return queue->ctrl->admin_tag_set.tags[queue_idx];
198	return queue->ctrl->tag_set.tags[queue_idx - 1];
199}
200
201static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
202{
203	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
204}
205
206static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
207{
208	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
209}
210
211static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
212{
213	return req->pdu;
214}
215
216static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
217{
218	/* use the pdu space in the back for the data pdu */
219	return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
220		sizeof(struct nvme_tcp_data_pdu);
221}
222
223static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
224{
225	if (nvme_is_fabrics(req->req.cmd))
226		return NVME_TCP_ADMIN_CCSZ;
227	return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
228}
229
230static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
231{
232	return req == &req->queue->ctrl->async_req;
233}
234
235static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
236{
237	struct request *rq;
238
239	if (unlikely(nvme_tcp_async_req(req)))
240		return false; /* async events don't have a request */
241
242	rq = blk_mq_rq_from_pdu(req);
243
244	return rq_data_dir(rq) == WRITE && req->data_len &&
245		req->data_len <= nvme_tcp_inline_data_size(req);
246}
247
248static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
249{
250	return req->iter.bvec->bv_page;
251}
252
253static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
254{
255	return req->iter.bvec->bv_offset + req->iter.iov_offset;
256}
257
258static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
259{
260	return min_t(size_t, iov_iter_single_seg_count(&req->iter),
261			req->pdu_len - req->pdu_sent);
262}
263
264static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
265{
266	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
267			req->pdu_len - req->pdu_sent : 0;
268}
269
270static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
271		int len)
272{
273	return nvme_tcp_pdu_data_left(req) <= len;
274}
275
276static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
277		unsigned int dir)
278{
279	struct request *rq = blk_mq_rq_from_pdu(req);
280	struct bio_vec *vec;
281	unsigned int size;
282	int nr_bvec;
283	size_t offset;
284
285	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
286		vec = &rq->special_vec;
287		nr_bvec = 1;
288		size = blk_rq_payload_bytes(rq);
289		offset = 0;
290	} else {
291		struct bio *bio = req->curr_bio;
292		struct bvec_iter bi;
293		struct bio_vec bv;
294
295		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
296		nr_bvec = 0;
297		bio_for_each_bvec(bv, bio, bi) {
298			nr_bvec++;
299		}
300		size = bio->bi_iter.bi_size;
301		offset = bio->bi_iter.bi_bvec_done;
302	}
303
304	iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
305	req->iter.iov_offset = offset;
306}
307
308static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
309		int len)
310{
311	req->data_sent += len;
312	req->pdu_sent += len;
313	iov_iter_advance(&req->iter, len);
314	if (!iov_iter_count(&req->iter) &&
315	    req->data_sent < req->data_len) {
316		req->curr_bio = req->curr_bio->bi_next;
317		nvme_tcp_init_iter(req, ITER_SOURCE);
318	}
319}
320
321static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
322{
323	int ret;
324
325	/* drain the send queue as much as we can... */
326	do {
327		ret = nvme_tcp_try_send(queue);
328	} while (ret > 0);
329}
330
331static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
332{
333	return !list_empty(&queue->send_list) ||
334		!llist_empty(&queue->req_list);
335}
336
337static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
338		bool sync, bool last)
339{
340	struct nvme_tcp_queue *queue = req->queue;
341	bool empty;
342
343	empty = llist_add(&req->lentry, &queue->req_list) &&
344		list_empty(&queue->send_list) && !queue->request;
345
346	/*
347	 * if we're the first on the send_list and we can try to send
348	 * directly, otherwise queue io_work. Also, only do that if we
349	 * are on the same cpu, so we don't introduce contention.
350	 */
351	if (queue->io_cpu == raw_smp_processor_id() &&
352	    sync && empty && mutex_trylock(&queue->send_mutex)) {
353		nvme_tcp_send_all(queue);
354		mutex_unlock(&queue->send_mutex);
355	}
356
357	if (last && nvme_tcp_queue_more(queue))
358		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
359}
360
361static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
362{
363	struct nvme_tcp_request *req;
364	struct llist_node *node;
365
366	for (node = llist_del_all(&queue->req_list); node; node = node->next) {
367		req = llist_entry(node, struct nvme_tcp_request, lentry);
368		list_add(&req->entry, &queue->send_list);
369	}
370}
371
372static inline struct nvme_tcp_request *
373nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
374{
375	struct nvme_tcp_request *req;
376
377	req = list_first_entry_or_null(&queue->send_list,
378			struct nvme_tcp_request, entry);
379	if (!req) {
380		nvme_tcp_process_req_list(queue);
381		req = list_first_entry_or_null(&queue->send_list,
382				struct nvme_tcp_request, entry);
383		if (unlikely(!req))
384			return NULL;
385	}
386
387	list_del(&req->entry);
388	return req;
389}
390
391static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
392		__le32 *dgst)
393{
394	ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
395	crypto_ahash_final(hash);
396}
397
398static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
399		struct page *page, off_t off, size_t len)
400{
401	struct scatterlist sg;
402
403	sg_init_table(&sg, 1);
404	sg_set_page(&sg, page, len, off);
405	ahash_request_set_crypt(hash, &sg, NULL, len);
406	crypto_ahash_update(hash);
407}
408
409static inline void nvme_tcp_hdgst(struct ahash_request *hash,
410		void *pdu, size_t len)
411{
412	struct scatterlist sg;
413
414	sg_init_one(&sg, pdu, len);
415	ahash_request_set_crypt(hash, &sg, pdu + len, len);
416	crypto_ahash_digest(hash);
417}
418
419static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
420		void *pdu, size_t pdu_len)
421{
422	struct nvme_tcp_hdr *hdr = pdu;
423	__le32 recv_digest;
424	__le32 exp_digest;
425
426	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
427		dev_err(queue->ctrl->ctrl.device,
428			"queue %d: header digest flag is cleared\n",
429			nvme_tcp_queue_id(queue));
430		return -EPROTO;
431	}
432
433	recv_digest = *(__le32 *)(pdu + hdr->hlen);
434	nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
435	exp_digest = *(__le32 *)(pdu + hdr->hlen);
436	if (recv_digest != exp_digest) {
437		dev_err(queue->ctrl->ctrl.device,
438			"header digest error: recv %#x expected %#x\n",
439			le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
440		return -EIO;
441	}
442
443	return 0;
444}
445
446static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
447{
448	struct nvme_tcp_hdr *hdr = pdu;
449	u8 digest_len = nvme_tcp_hdgst_len(queue);
450	u32 len;
451
452	len = le32_to_cpu(hdr->plen) - hdr->hlen -
453		((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
454
455	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
456		dev_err(queue->ctrl->ctrl.device,
457			"queue %d: data digest flag is cleared\n",
458		nvme_tcp_queue_id(queue));
459		return -EPROTO;
460	}
461	crypto_ahash_init(queue->rcv_hash);
462
463	return 0;
464}
465
466static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
467		struct request *rq, unsigned int hctx_idx)
468{
469	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
470
471	page_frag_free(req->pdu);
472}
473
474static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
475		struct request *rq, unsigned int hctx_idx,
476		unsigned int numa_node)
477{
478	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
479	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
480	struct nvme_tcp_cmd_pdu *pdu;
481	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
482	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
483	u8 hdgst = nvme_tcp_hdgst_len(queue);
484
485	req->pdu = page_frag_alloc(&queue->pf_cache,
486		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
487		GFP_KERNEL | __GFP_ZERO);
488	if (!req->pdu)
489		return -ENOMEM;
490
491	pdu = req->pdu;
492	req->queue = queue;
493	nvme_req(rq)->ctrl = &ctrl->ctrl;
494	nvme_req(rq)->cmd = &pdu->cmd;
495
496	return 0;
497}
498
499static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
500		unsigned int hctx_idx)
501{
502	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
503	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
504
505	hctx->driver_data = queue;
506	return 0;
507}
508
509static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
510		unsigned int hctx_idx)
511{
512	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
513	struct nvme_tcp_queue *queue = &ctrl->queues[0];
514
515	hctx->driver_data = queue;
516	return 0;
517}
518
519static enum nvme_tcp_recv_state
520nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
521{
522	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
523		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
524		NVME_TCP_RECV_DATA;
525}
526
527static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
528{
529	queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
530				nvme_tcp_hdgst_len(queue);
531	queue->pdu_offset = 0;
532	queue->data_remaining = -1;
533	queue->ddgst_remaining = 0;
534}
535
536static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
537{
538	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
539		return;
540
541	dev_warn(ctrl->device, "starting error recovery\n");
542	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
543}
544
545static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
546		struct nvme_completion *cqe)
547{
548	struct nvme_tcp_request *req;
549	struct request *rq;
550
551	rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
552	if (!rq) {
553		dev_err(queue->ctrl->ctrl.device,
554			"got bad cqe.command_id %#x on queue %d\n",
555			cqe->command_id, nvme_tcp_queue_id(queue));
556		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
557		return -EINVAL;
558	}
559
560	req = blk_mq_rq_to_pdu(rq);
561	if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
562		req->status = cqe->status;
563
564	if (!nvme_try_complete_req(rq, req->status, cqe->result))
565		nvme_complete_rq(rq);
566	queue->nr_cqe++;
567
568	return 0;
569}
570
571static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
572		struct nvme_tcp_data_pdu *pdu)
573{
574	struct request *rq;
575
576	rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
577	if (!rq) {
578		dev_err(queue->ctrl->ctrl.device,
579			"got bad c2hdata.command_id %#x on queue %d\n",
580			pdu->command_id, nvme_tcp_queue_id(queue));
581		return -ENOENT;
582	}
583
584	if (!blk_rq_payload_bytes(rq)) {
585		dev_err(queue->ctrl->ctrl.device,
586			"queue %d tag %#x unexpected data\n",
587			nvme_tcp_queue_id(queue), rq->tag);
588		return -EIO;
589	}
590
591	queue->data_remaining = le32_to_cpu(pdu->data_length);
592
593	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
594	    unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
595		dev_err(queue->ctrl->ctrl.device,
596			"queue %d tag %#x SUCCESS set but not last PDU\n",
597			nvme_tcp_queue_id(queue), rq->tag);
598		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
599		return -EPROTO;
600	}
601
602	return 0;
603}
604
605static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
606		struct nvme_tcp_rsp_pdu *pdu)
607{
608	struct nvme_completion *cqe = &pdu->cqe;
609	int ret = 0;
610
611	/*
612	 * AEN requests are special as they don't time out and can
613	 * survive any kind of queue freeze and often don't respond to
614	 * aborts.  We don't even bother to allocate a struct request
615	 * for them but rather special case them here.
616	 */
617	if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
618				     cqe->command_id)))
619		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
620				&cqe->result);
621	else
622		ret = nvme_tcp_process_nvme_cqe(queue, cqe);
623
624	return ret;
625}
626
627static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
628{
629	struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
630	struct nvme_tcp_queue *queue = req->queue;
631	struct request *rq = blk_mq_rq_from_pdu(req);
632	u32 h2cdata_sent = req->pdu_len;
633	u8 hdgst = nvme_tcp_hdgst_len(queue);
634	u8 ddgst = nvme_tcp_ddgst_len(queue);
635
636	req->state = NVME_TCP_SEND_H2C_PDU;
637	req->offset = 0;
638	req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
639	req->pdu_sent = 0;
640	req->h2cdata_left -= req->pdu_len;
641	req->h2cdata_offset += h2cdata_sent;
642
643	memset(data, 0, sizeof(*data));
644	data->hdr.type = nvme_tcp_h2c_data;
645	if (!req->h2cdata_left)
646		data->hdr.flags = NVME_TCP_F_DATA_LAST;
647	if (queue->hdr_digest)
648		data->hdr.flags |= NVME_TCP_F_HDGST;
649	if (queue->data_digest)
650		data->hdr.flags |= NVME_TCP_F_DDGST;
651	data->hdr.hlen = sizeof(*data);
652	data->hdr.pdo = data->hdr.hlen + hdgst;
653	data->hdr.plen =
654		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
655	data->ttag = req->ttag;
656	data->command_id = nvme_cid(rq);
657	data->data_offset = cpu_to_le32(req->h2cdata_offset);
658	data->data_length = cpu_to_le32(req->pdu_len);
659}
660
661static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
662		struct nvme_tcp_r2t_pdu *pdu)
663{
664	struct nvme_tcp_request *req;
665	struct request *rq;
666	u32 r2t_length = le32_to_cpu(pdu->r2t_length);
667	u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
668
669	rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
670	if (!rq) {
671		dev_err(queue->ctrl->ctrl.device,
672			"got bad r2t.command_id %#x on queue %d\n",
673			pdu->command_id, nvme_tcp_queue_id(queue));
674		return -ENOENT;
675	}
676	req = blk_mq_rq_to_pdu(rq);
677
678	if (unlikely(!r2t_length)) {
679		dev_err(queue->ctrl->ctrl.device,
680			"req %d r2t len is %u, probably a bug...\n",
681			rq->tag, r2t_length);
682		return -EPROTO;
683	}
684
685	if (unlikely(req->data_sent + r2t_length > req->data_len)) {
686		dev_err(queue->ctrl->ctrl.device,
687			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
688			rq->tag, r2t_length, req->data_len, req->data_sent);
689		return -EPROTO;
690	}
691
692	if (unlikely(r2t_offset < req->data_sent)) {
693		dev_err(queue->ctrl->ctrl.device,
694			"req %d unexpected r2t offset %u (expected %zu)\n",
695			rq->tag, r2t_offset, req->data_sent);
696		return -EPROTO;
697	}
698
699	req->pdu_len = 0;
700	req->h2cdata_left = r2t_length;
701	req->h2cdata_offset = r2t_offset;
702	req->ttag = pdu->ttag;
703
704	nvme_tcp_setup_h2c_data_pdu(req);
705	nvme_tcp_queue_request(req, false, true);
706
707	return 0;
708}
709
710static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
711		unsigned int *offset, size_t *len)
712{
713	struct nvme_tcp_hdr *hdr;
714	char *pdu = queue->pdu;
715	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
716	int ret;
717
718	ret = skb_copy_bits(skb, *offset,
719		&pdu[queue->pdu_offset], rcv_len);
720	if (unlikely(ret))
721		return ret;
722
723	queue->pdu_remaining -= rcv_len;
724	queue->pdu_offset += rcv_len;
725	*offset += rcv_len;
726	*len -= rcv_len;
727	if (queue->pdu_remaining)
728		return 0;
729
730	hdr = queue->pdu;
731	if (queue->hdr_digest) {
732		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
733		if (unlikely(ret))
734			return ret;
735	}
736
737
738	if (queue->data_digest) {
739		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
740		if (unlikely(ret))
741			return ret;
742	}
743
744	switch (hdr->type) {
745	case nvme_tcp_c2h_data:
746		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
747	case nvme_tcp_rsp:
748		nvme_tcp_init_recv_ctx(queue);
749		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
750	case nvme_tcp_r2t:
751		nvme_tcp_init_recv_ctx(queue);
752		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
753	default:
754		dev_err(queue->ctrl->ctrl.device,
755			"unsupported pdu type (%d)\n", hdr->type);
756		return -EINVAL;
757	}
758}
759
760static inline void nvme_tcp_end_request(struct request *rq, u16 status)
761{
762	union nvme_result res = {};
763
764	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
765		nvme_complete_rq(rq);
766}
767
768static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
769			      unsigned int *offset, size_t *len)
770{
771	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
772	struct request *rq =
773		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
774	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
775
776	while (true) {
777		int recv_len, ret;
778
779		recv_len = min_t(size_t, *len, queue->data_remaining);
780		if (!recv_len)
781			break;
782
783		if (!iov_iter_count(&req->iter)) {
784			req->curr_bio = req->curr_bio->bi_next;
785
786			/*
787			 * If we don`t have any bios it means that controller
788			 * sent more data than we requested, hence error
789			 */
790			if (!req->curr_bio) {
791				dev_err(queue->ctrl->ctrl.device,
792					"queue %d no space in request %#x",
793					nvme_tcp_queue_id(queue), rq->tag);
794				nvme_tcp_init_recv_ctx(queue);
795				return -EIO;
796			}
797			nvme_tcp_init_iter(req, ITER_DEST);
798		}
799
800		/* we can read only from what is left in this bio */
801		recv_len = min_t(size_t, recv_len,
802				iov_iter_count(&req->iter));
803
804		if (queue->data_digest)
805			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
806				&req->iter, recv_len, queue->rcv_hash);
807		else
808			ret = skb_copy_datagram_iter(skb, *offset,
809					&req->iter, recv_len);
810		if (ret) {
811			dev_err(queue->ctrl->ctrl.device,
812				"queue %d failed to copy request %#x data",
813				nvme_tcp_queue_id(queue), rq->tag);
814			return ret;
815		}
816
817		*len -= recv_len;
818		*offset += recv_len;
819		queue->data_remaining -= recv_len;
820	}
821
822	if (!queue->data_remaining) {
823		if (queue->data_digest) {
824			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
825			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
826		} else {
827			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
828				nvme_tcp_end_request(rq,
829						le16_to_cpu(req->status));
830				queue->nr_cqe++;
831			}
832			nvme_tcp_init_recv_ctx(queue);
833		}
834	}
835
836	return 0;
837}
838
839static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
840		struct sk_buff *skb, unsigned int *offset, size_t *len)
841{
842	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
843	char *ddgst = (char *)&queue->recv_ddgst;
844	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
845	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
846	int ret;
847
848	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
849	if (unlikely(ret))
850		return ret;
851
852	queue->ddgst_remaining -= recv_len;
853	*offset += recv_len;
854	*len -= recv_len;
855	if (queue->ddgst_remaining)
856		return 0;
857
858	if (queue->recv_ddgst != queue->exp_ddgst) {
859		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
860					pdu->command_id);
861		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
862
863		req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
864
865		dev_err(queue->ctrl->ctrl.device,
866			"data digest error: recv %#x expected %#x\n",
867			le32_to_cpu(queue->recv_ddgst),
868			le32_to_cpu(queue->exp_ddgst));
869	}
870
871	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
872		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
873					pdu->command_id);
874		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
875
876		nvme_tcp_end_request(rq, le16_to_cpu(req->status));
877		queue->nr_cqe++;
878	}
879
880	nvme_tcp_init_recv_ctx(queue);
881	return 0;
882}
883
884static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
885			     unsigned int offset, size_t len)
886{
887	struct nvme_tcp_queue *queue = desc->arg.data;
888	size_t consumed = len;
889	int result;
890
891	if (unlikely(!queue->rd_enabled))
892		return -EFAULT;
893
894	while (len) {
895		switch (nvme_tcp_recv_state(queue)) {
896		case NVME_TCP_RECV_PDU:
897			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
898			break;
899		case NVME_TCP_RECV_DATA:
900			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
901			break;
902		case NVME_TCP_RECV_DDGST:
903			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
904			break;
905		default:
906			result = -EFAULT;
907		}
908		if (result) {
909			dev_err(queue->ctrl->ctrl.device,
910				"receive failed:  %d\n", result);
911			queue->rd_enabled = false;
912			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
913			return result;
914		}
915	}
916
917	return consumed;
918}
919
920static void nvme_tcp_data_ready(struct sock *sk)
921{
922	struct nvme_tcp_queue *queue;
923
924	trace_sk_data_ready(sk);
925
926	read_lock_bh(&sk->sk_callback_lock);
927	queue = sk->sk_user_data;
928	if (likely(queue && queue->rd_enabled) &&
929	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
930		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
931	read_unlock_bh(&sk->sk_callback_lock);
932}
933
934static void nvme_tcp_write_space(struct sock *sk)
935{
936	struct nvme_tcp_queue *queue;
937
938	read_lock_bh(&sk->sk_callback_lock);
939	queue = sk->sk_user_data;
940	if (likely(queue && sk_stream_is_writeable(sk))) {
941		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
942		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
943	}
944	read_unlock_bh(&sk->sk_callback_lock);
945}
946
947static void nvme_tcp_state_change(struct sock *sk)
948{
949	struct nvme_tcp_queue *queue;
950
951	read_lock_bh(&sk->sk_callback_lock);
952	queue = sk->sk_user_data;
953	if (!queue)
954		goto done;
955
956	switch (sk->sk_state) {
957	case TCP_CLOSE:
958	case TCP_CLOSE_WAIT:
959	case TCP_LAST_ACK:
960	case TCP_FIN_WAIT1:
961	case TCP_FIN_WAIT2:
962		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
963		break;
964	default:
965		dev_info(queue->ctrl->ctrl.device,
966			"queue %d socket state %d\n",
967			nvme_tcp_queue_id(queue), sk->sk_state);
968	}
969
970	queue->state_change(sk);
971done:
972	read_unlock_bh(&sk->sk_callback_lock);
973}
974
975static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
976{
977	queue->request = NULL;
978}
979
980static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
981{
982	if (nvme_tcp_async_req(req)) {
983		union nvme_result res = {};
984
985		nvme_complete_async_event(&req->queue->ctrl->ctrl,
986				cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
987	} else {
988		nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
989				NVME_SC_HOST_PATH_ERROR);
990	}
991}
992
993static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
994{
995	struct nvme_tcp_queue *queue = req->queue;
996	int req_data_len = req->data_len;
997	u32 h2cdata_left = req->h2cdata_left;
998
999	while (true) {
1000		struct bio_vec bvec;
1001		struct msghdr msg = {
1002			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
1003		};
1004		struct page *page = nvme_tcp_req_cur_page(req);
1005		size_t offset = nvme_tcp_req_cur_offset(req);
1006		size_t len = nvme_tcp_req_cur_length(req);
1007		bool last = nvme_tcp_pdu_last_send(req, len);
1008		int req_data_sent = req->data_sent;
1009		int ret;
1010
1011		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
1012			msg.msg_flags |= MSG_EOR;
1013		else
1014			msg.msg_flags |= MSG_MORE;
1015
1016		if (!sendpage_ok(page))
1017			msg.msg_flags &= ~MSG_SPLICE_PAGES;
1018
1019		bvec_set_page(&bvec, page, len, offset);
1020		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1021		ret = sock_sendmsg(queue->sock, &msg);
1022		if (ret <= 0)
1023			return ret;
1024
1025		if (queue->data_digest)
1026			nvme_tcp_ddgst_update(queue->snd_hash, page,
1027					offset, ret);
1028
1029		/*
1030		 * update the request iterator except for the last payload send
1031		 * in the request where we don't want to modify it as we may
1032		 * compete with the RX path completing the request.
1033		 */
1034		if (req_data_sent + ret < req_data_len)
1035			nvme_tcp_advance_req(req, ret);
1036
1037		/* fully successful last send in current PDU */
1038		if (last && ret == len) {
1039			if (queue->data_digest) {
1040				nvme_tcp_ddgst_final(queue->snd_hash,
1041					&req->ddgst);
1042				req->state = NVME_TCP_SEND_DDGST;
1043				req->offset = 0;
1044			} else {
1045				if (h2cdata_left)
1046					nvme_tcp_setup_h2c_data_pdu(req);
1047				else
1048					nvme_tcp_done_send_req(queue);
1049			}
1050			return 1;
1051		}
1052	}
1053	return -EAGAIN;
1054}
1055
1056static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1057{
1058	struct nvme_tcp_queue *queue = req->queue;
1059	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
1060	struct bio_vec bvec;
1061	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
1062	bool inline_data = nvme_tcp_has_inline_data(req);
1063	u8 hdgst = nvme_tcp_hdgst_len(queue);
1064	int len = sizeof(*pdu) + hdgst - req->offset;
1065	int ret;
1066
1067	if (inline_data || nvme_tcp_queue_more(queue))
1068		msg.msg_flags |= MSG_MORE;
1069	else
1070		msg.msg_flags |= MSG_EOR;
1071
1072	if (queue->hdr_digest && !req->offset)
1073		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1074
1075	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1076	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1077	ret = sock_sendmsg(queue->sock, &msg);
1078	if (unlikely(ret <= 0))
1079		return ret;
1080
1081	len -= ret;
1082	if (!len) {
1083		if (inline_data) {
1084			req->state = NVME_TCP_SEND_DATA;
1085			if (queue->data_digest)
1086				crypto_ahash_init(queue->snd_hash);
1087		} else {
1088			nvme_tcp_done_send_req(queue);
1089		}
1090		return 1;
1091	}
1092	req->offset += ret;
1093
1094	return -EAGAIN;
1095}
1096
1097static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1098{
1099	struct nvme_tcp_queue *queue = req->queue;
1100	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
1101	struct bio_vec bvec;
1102	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
1103	u8 hdgst = nvme_tcp_hdgst_len(queue);
1104	int len = sizeof(*pdu) - req->offset + hdgst;
1105	int ret;
1106
1107	if (queue->hdr_digest && !req->offset)
1108		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1109
1110	if (!req->h2cdata_left)
1111		msg.msg_flags |= MSG_SPLICE_PAGES;
1112
1113	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1114	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1115	ret = sock_sendmsg(queue->sock, &msg);
1116	if (unlikely(ret <= 0))
1117		return ret;
1118
1119	len -= ret;
1120	if (!len) {
1121		req->state = NVME_TCP_SEND_DATA;
1122		if (queue->data_digest)
1123			crypto_ahash_init(queue->snd_hash);
1124		return 1;
1125	}
1126	req->offset += ret;
1127
1128	return -EAGAIN;
1129}
1130
1131static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1132{
1133	struct nvme_tcp_queue *queue = req->queue;
1134	size_t offset = req->offset;
1135	u32 h2cdata_left = req->h2cdata_left;
1136	int ret;
1137	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1138	struct kvec iov = {
1139		.iov_base = (u8 *)&req->ddgst + req->offset,
1140		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1141	};
1142
1143	if (nvme_tcp_queue_more(queue))
1144		msg.msg_flags |= MSG_MORE;
1145	else
1146		msg.msg_flags |= MSG_EOR;
1147
1148	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1149	if (unlikely(ret <= 0))
1150		return ret;
1151
1152	if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1153		if (h2cdata_left)
1154			nvme_tcp_setup_h2c_data_pdu(req);
1155		else
1156			nvme_tcp_done_send_req(queue);
1157		return 1;
1158	}
1159
1160	req->offset += ret;
1161	return -EAGAIN;
1162}
1163
1164static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1165{
1166	struct nvme_tcp_request *req;
1167	unsigned int noreclaim_flag;
1168	int ret = 1;
1169
1170	if (!queue->request) {
1171		queue->request = nvme_tcp_fetch_request(queue);
1172		if (!queue->request)
1173			return 0;
1174	}
1175	req = queue->request;
1176
1177	noreclaim_flag = memalloc_noreclaim_save();
1178	if (req->state == NVME_TCP_SEND_CMD_PDU) {
1179		ret = nvme_tcp_try_send_cmd_pdu(req);
1180		if (ret <= 0)
1181			goto done;
1182		if (!nvme_tcp_has_inline_data(req))
1183			goto out;
1184	}
1185
1186	if (req->state == NVME_TCP_SEND_H2C_PDU) {
1187		ret = nvme_tcp_try_send_data_pdu(req);
1188		if (ret <= 0)
1189			goto done;
1190	}
1191
1192	if (req->state == NVME_TCP_SEND_DATA) {
1193		ret = nvme_tcp_try_send_data(req);
1194		if (ret <= 0)
1195			goto done;
1196	}
1197
1198	if (req->state == NVME_TCP_SEND_DDGST)
1199		ret = nvme_tcp_try_send_ddgst(req);
1200done:
1201	if (ret == -EAGAIN) {
1202		ret = 0;
1203	} else if (ret < 0) {
1204		dev_err(queue->ctrl->ctrl.device,
1205			"failed to send request %d\n", ret);
1206		nvme_tcp_fail_request(queue->request);
1207		nvme_tcp_done_send_req(queue);
1208	}
1209out:
1210	memalloc_noreclaim_restore(noreclaim_flag);
1211	return ret;
1212}
1213
1214static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1215{
1216	struct socket *sock = queue->sock;
1217	struct sock *sk = sock->sk;
1218	read_descriptor_t rd_desc;
1219	int consumed;
1220
1221	rd_desc.arg.data = queue;
1222	rd_desc.count = 1;
1223	lock_sock(sk);
1224	queue->nr_cqe = 0;
1225	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1226	release_sock(sk);
1227	return consumed;
1228}
1229
1230static void nvme_tcp_io_work(struct work_struct *w)
1231{
1232	struct nvme_tcp_queue *queue =
1233		container_of(w, struct nvme_tcp_queue, io_work);
1234	unsigned long deadline = jiffies + msecs_to_jiffies(1);
1235
1236	do {
1237		bool pending = false;
1238		int result;
1239
1240		if (mutex_trylock(&queue->send_mutex)) {
1241			result = nvme_tcp_try_send(queue);
1242			mutex_unlock(&queue->send_mutex);
1243			if (result > 0)
1244				pending = true;
1245			else if (unlikely(result < 0))
1246				break;
1247		}
1248
1249		result = nvme_tcp_try_recv(queue);
1250		if (result > 0)
1251			pending = true;
1252		else if (unlikely(result < 0))
1253			return;
1254
1255		if (!pending || !queue->rd_enabled)
1256			return;
1257
1258	} while (!time_after(jiffies, deadline)); /* quota is exhausted */
1259
1260	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1261}
1262
1263static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1264{
1265	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1266
1267	ahash_request_free(queue->rcv_hash);
1268	ahash_request_free(queue->snd_hash);
1269	crypto_free_ahash(tfm);
1270}
1271
1272static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1273{
1274	struct crypto_ahash *tfm;
1275
1276	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1277	if (IS_ERR(tfm))
1278		return PTR_ERR(tfm);
1279
1280	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1281	if (!queue->snd_hash)
1282		goto free_tfm;
1283	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1284
1285	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1286	if (!queue->rcv_hash)
1287		goto free_snd_hash;
1288	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1289
1290	return 0;
1291free_snd_hash:
1292	ahash_request_free(queue->snd_hash);
1293free_tfm:
1294	crypto_free_ahash(tfm);
1295	return -ENOMEM;
1296}
1297
1298static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1299{
1300	struct nvme_tcp_request *async = &ctrl->async_req;
1301
1302	page_frag_free(async->pdu);
1303}
1304
1305static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1306{
1307	struct nvme_tcp_queue *queue = &ctrl->queues[0];
1308	struct nvme_tcp_request *async = &ctrl->async_req;
1309	u8 hdgst = nvme_tcp_hdgst_len(queue);
1310
1311	async->pdu = page_frag_alloc(&queue->pf_cache,
1312		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1313		GFP_KERNEL | __GFP_ZERO);
1314	if (!async->pdu)
1315		return -ENOMEM;
1316
1317	async->queue = &ctrl->queues[0];
1318	return 0;
1319}
1320
1321static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1322{
1323	struct page *page;
1324	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1325	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1326	unsigned int noreclaim_flag;
1327
1328	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1329		return;
1330
1331	if (queue->hdr_digest || queue->data_digest)
1332		nvme_tcp_free_crypto(queue);
1333
1334	if (queue->pf_cache.va) {
1335		page = virt_to_head_page(queue->pf_cache.va);
1336		__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1337		queue->pf_cache.va = NULL;
1338	}
1339
1340	noreclaim_flag = memalloc_noreclaim_save();
1341	sock_release(queue->sock);
1342	memalloc_noreclaim_restore(noreclaim_flag);
1343
1344	kfree(queue->pdu);
1345	mutex_destroy(&queue->send_mutex);
1346	mutex_destroy(&queue->queue_lock);
1347}
1348
1349static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1350{
1351	struct nvme_tcp_icreq_pdu *icreq;
1352	struct nvme_tcp_icresp_pdu *icresp;
1353	struct msghdr msg = {};
1354	struct kvec iov;
1355	bool ctrl_hdgst, ctrl_ddgst;
1356	u32 maxh2cdata;
1357	int ret;
1358
1359	icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1360	if (!icreq)
1361		return -ENOMEM;
1362
1363	icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1364	if (!icresp) {
1365		ret = -ENOMEM;
1366		goto free_icreq;
1367	}
1368
1369	icreq->hdr.type = nvme_tcp_icreq;
1370	icreq->hdr.hlen = sizeof(*icreq);
1371	icreq->hdr.pdo = 0;
1372	icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1373	icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1374	icreq->maxr2t = 0; /* single inflight r2t supported */
1375	icreq->hpda = 0; /* no alignment constraint */
1376	if (queue->hdr_digest)
1377		icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1378	if (queue->data_digest)
1379		icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1380
1381	iov.iov_base = icreq;
1382	iov.iov_len = sizeof(*icreq);
1383	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1384	if (ret < 0)
1385		goto free_icresp;
1386
1387	memset(&msg, 0, sizeof(msg));
1388	iov.iov_base = icresp;
1389	iov.iov_len = sizeof(*icresp);
1390	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1391			iov.iov_len, msg.msg_flags);
1392	if (ret < 0)
1393		goto free_icresp;
1394
1395	ret = -EINVAL;
1396	if (icresp->hdr.type != nvme_tcp_icresp) {
1397		pr_err("queue %d: bad type returned %d\n",
1398			nvme_tcp_queue_id(queue), icresp->hdr.type);
1399		goto free_icresp;
1400	}
1401
1402	if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1403		pr_err("queue %d: bad pdu length returned %d\n",
1404			nvme_tcp_queue_id(queue), icresp->hdr.plen);
1405		goto free_icresp;
1406	}
1407
1408	if (icresp->pfv != NVME_TCP_PFV_1_0) {
1409		pr_err("queue %d: bad pfv returned %d\n",
1410			nvme_tcp_queue_id(queue), icresp->pfv);
1411		goto free_icresp;
1412	}
1413
1414	ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1415	if ((queue->data_digest && !ctrl_ddgst) ||
1416	    (!queue->data_digest && ctrl_ddgst)) {
1417		pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1418			nvme_tcp_queue_id(queue),
1419			queue->data_digest ? "enabled" : "disabled",
1420			ctrl_ddgst ? "enabled" : "disabled");
1421		goto free_icresp;
1422	}
1423
1424	ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1425	if ((queue->hdr_digest && !ctrl_hdgst) ||
1426	    (!queue->hdr_digest && ctrl_hdgst)) {
1427		pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1428			nvme_tcp_queue_id(queue),
1429			queue->hdr_digest ? "enabled" : "disabled",
1430			ctrl_hdgst ? "enabled" : "disabled");
1431		goto free_icresp;
1432	}
1433
1434	if (icresp->cpda != 0) {
1435		pr_err("queue %d: unsupported cpda returned %d\n",
1436			nvme_tcp_queue_id(queue), icresp->cpda);
1437		goto free_icresp;
1438	}
1439
1440	maxh2cdata = le32_to_cpu(icresp->maxdata);
1441	if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1442		pr_err("queue %d: invalid maxh2cdata returned %u\n",
1443		       nvme_tcp_queue_id(queue), maxh2cdata);
1444		goto free_icresp;
1445	}
1446	queue->maxh2cdata = maxh2cdata;
1447
1448	ret = 0;
1449free_icresp:
1450	kfree(icresp);
1451free_icreq:
1452	kfree(icreq);
1453	return ret;
1454}
1455
1456static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1457{
1458	return nvme_tcp_queue_id(queue) == 0;
1459}
1460
1461static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1462{
1463	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1464	int qid = nvme_tcp_queue_id(queue);
1465
1466	return !nvme_tcp_admin_queue(queue) &&
1467		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1468}
1469
1470static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1471{
1472	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1473	int qid = nvme_tcp_queue_id(queue);
1474
1475	return !nvme_tcp_admin_queue(queue) &&
1476		!nvme_tcp_default_queue(queue) &&
1477		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1478			  ctrl->io_queues[HCTX_TYPE_READ];
1479}
1480
1481static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1482{
1483	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1484	int qid = nvme_tcp_queue_id(queue);
1485
1486	return !nvme_tcp_admin_queue(queue) &&
1487		!nvme_tcp_default_queue(queue) &&
1488		!nvme_tcp_read_queue(queue) &&
1489		qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1490			  ctrl->io_queues[HCTX_TYPE_READ] +
1491			  ctrl->io_queues[HCTX_TYPE_POLL];
1492}
1493
1494static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1495{
1496	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1497	int qid = nvme_tcp_queue_id(queue);
1498	int n = 0;
1499
1500	if (nvme_tcp_default_queue(queue))
1501		n = qid - 1;
1502	else if (nvme_tcp_read_queue(queue))
1503		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1504	else if (nvme_tcp_poll_queue(queue))
1505		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1506				ctrl->io_queues[HCTX_TYPE_READ] - 1;
1507	queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1508}
1509
1510static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
1511{
1512	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1513	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1514	int ret, rcv_pdu_size;
1515
1516	mutex_init(&queue->queue_lock);
1517	queue->ctrl = ctrl;
1518	init_llist_head(&queue->req_list);
1519	INIT_LIST_HEAD(&queue->send_list);
1520	mutex_init(&queue->send_mutex);
1521	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1522
1523	if (qid > 0)
1524		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1525	else
1526		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1527						NVME_TCP_ADMIN_CCSZ;
1528
1529	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1530			IPPROTO_TCP, &queue->sock);
1531	if (ret) {
1532		dev_err(nctrl->device,
1533			"failed to create socket: %d\n", ret);
1534		goto err_destroy_mutex;
1535	}
1536
1537	nvme_tcp_reclassify_socket(queue->sock);
1538
1539	/* Single syn retry */
1540	tcp_sock_set_syncnt(queue->sock->sk, 1);
1541
1542	/* Set TCP no delay */
1543	tcp_sock_set_nodelay(queue->sock->sk);
1544
1545	/*
1546	 * Cleanup whatever is sitting in the TCP transmit queue on socket
1547	 * close. This is done to prevent stale data from being sent should
1548	 * the network connection be restored before TCP times out.
1549	 */
1550	sock_no_linger(queue->sock->sk);
1551
1552	if (so_priority > 0)
1553		sock_set_priority(queue->sock->sk, so_priority);
1554
1555	/* Set socket type of service */
1556	if (nctrl->opts->tos >= 0)
1557		ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1558
1559	/* Set 10 seconds timeout for icresp recvmsg */
1560	queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1561
1562	queue->sock->sk->sk_allocation = GFP_ATOMIC;
1563	queue->sock->sk->sk_use_task_frag = false;
1564	nvme_tcp_set_queue_io_cpu(queue);
1565	queue->request = NULL;
1566	queue->data_remaining = 0;
1567	queue->ddgst_remaining = 0;
1568	queue->pdu_remaining = 0;
1569	queue->pdu_offset = 0;
1570	sk_set_memalloc(queue->sock->sk);
1571
1572	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1573		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1574			sizeof(ctrl->src_addr));
1575		if (ret) {
1576			dev_err(nctrl->device,
1577				"failed to bind queue %d socket %d\n",
1578				qid, ret);
1579			goto err_sock;
1580		}
1581	}
1582
1583	if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1584		char *iface = nctrl->opts->host_iface;
1585		sockptr_t optval = KERNEL_SOCKPTR(iface);
1586
1587		ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1588				      optval, strlen(iface));
1589		if (ret) {
1590			dev_err(nctrl->device,
1591			  "failed to bind to interface %s queue %d err %d\n",
1592			  iface, qid, ret);
1593			goto err_sock;
1594		}
1595	}
1596
1597	queue->hdr_digest = nctrl->opts->hdr_digest;
1598	queue->data_digest = nctrl->opts->data_digest;
1599	if (queue->hdr_digest || queue->data_digest) {
1600		ret = nvme_tcp_alloc_crypto(queue);
1601		if (ret) {
1602			dev_err(nctrl->device,
1603				"failed to allocate queue %d crypto\n", qid);
1604			goto err_sock;
1605		}
1606	}
1607
1608	rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1609			nvme_tcp_hdgst_len(queue);
1610	queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1611	if (!queue->pdu) {
1612		ret = -ENOMEM;
1613		goto err_crypto;
1614	}
1615
1616	dev_dbg(nctrl->device, "connecting queue %d\n",
1617			nvme_tcp_queue_id(queue));
1618
1619	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1620		sizeof(ctrl->addr), 0);
1621	if (ret) {
1622		dev_err(nctrl->device,
1623			"failed to connect socket: %d\n", ret);
1624		goto err_rcv_pdu;
1625	}
1626
1627	ret = nvme_tcp_init_connection(queue);
1628	if (ret)
1629		goto err_init_connect;
1630
1631	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1632
1633	return 0;
1634
1635err_init_connect:
1636	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1637err_rcv_pdu:
1638	kfree(queue->pdu);
1639err_crypto:
1640	if (queue->hdr_digest || queue->data_digest)
1641		nvme_tcp_free_crypto(queue);
1642err_sock:
1643	sock_release(queue->sock);
1644	queue->sock = NULL;
1645err_destroy_mutex:
1646	mutex_destroy(&queue->send_mutex);
1647	mutex_destroy(&queue->queue_lock);
1648	return ret;
1649}
1650
1651static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
1652{
1653	struct socket *sock = queue->sock;
1654
1655	write_lock_bh(&sock->sk->sk_callback_lock);
1656	sock->sk->sk_user_data  = NULL;
1657	sock->sk->sk_data_ready = queue->data_ready;
1658	sock->sk->sk_state_change = queue->state_change;
1659	sock->sk->sk_write_space  = queue->write_space;
1660	write_unlock_bh(&sock->sk->sk_callback_lock);
1661}
1662
1663static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1664{
1665	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1666	nvme_tcp_restore_sock_ops(queue);
1667	cancel_work_sync(&queue->io_work);
1668}
1669
1670static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1671{
1672	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1673	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1674
1675	if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1676		return;
1677
1678	mutex_lock(&queue->queue_lock);
1679	if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1680		__nvme_tcp_stop_queue(queue);
1681	mutex_unlock(&queue->queue_lock);
1682}
1683
1684static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
1685{
1686	write_lock_bh(&queue->sock->sk->sk_callback_lock);
1687	queue->sock->sk->sk_user_data = queue;
1688	queue->state_change = queue->sock->sk->sk_state_change;
1689	queue->data_ready = queue->sock->sk->sk_data_ready;
1690	queue->write_space = queue->sock->sk->sk_write_space;
1691	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1692	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1693	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1694#ifdef CONFIG_NET_RX_BUSY_POLL
1695	queue->sock->sk->sk_ll_usec = 1;
1696#endif
1697	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1698}
1699
1700static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1701{
1702	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1703	struct nvme_tcp_queue *queue = &ctrl->queues[idx];
1704	int ret;
1705
1706	queue->rd_enabled = true;
1707	nvme_tcp_init_recv_ctx(queue);
1708	nvme_tcp_setup_sock_ops(queue);
1709
1710	if (idx)
1711		ret = nvmf_connect_io_queue(nctrl, idx);
1712	else
1713		ret = nvmf_connect_admin_queue(nctrl);
1714
1715	if (!ret) {
1716		set_bit(NVME_TCP_Q_LIVE, &queue->flags);
1717	} else {
1718		if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1719			__nvme_tcp_stop_queue(queue);
1720		dev_err(nctrl->device,
1721			"failed to connect queue: %d ret=%d\n", idx, ret);
1722	}
1723	return ret;
1724}
1725
1726static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1727{
1728	if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1729		cancel_work_sync(&ctrl->async_event_work);
1730		nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1731		to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1732	}
1733
1734	nvme_tcp_free_queue(ctrl, 0);
1735}
1736
1737static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1738{
1739	int i;
1740
1741	for (i = 1; i < ctrl->queue_count; i++)
1742		nvme_tcp_free_queue(ctrl, i);
1743}
1744
1745static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1746{
1747	int i;
1748
1749	for (i = 1; i < ctrl->queue_count; i++)
1750		nvme_tcp_stop_queue(ctrl, i);
1751}
1752
1753static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
1754				    int first, int last)
1755{
1756	int i, ret;
1757
1758	for (i = first; i < last; i++) {
1759		ret = nvme_tcp_start_queue(ctrl, i);
1760		if (ret)
1761			goto out_stop_queues;
1762	}
1763
1764	return 0;
1765
1766out_stop_queues:
1767	for (i--; i >= first; i--)
1768		nvme_tcp_stop_queue(ctrl, i);
1769	return ret;
1770}
1771
1772static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1773{
1774	int ret;
1775
1776	ret = nvme_tcp_alloc_queue(ctrl, 0);
1777	if (ret)
1778		return ret;
1779
1780	ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1781	if (ret)
1782		goto out_free_queue;
1783
1784	return 0;
1785
1786out_free_queue:
1787	nvme_tcp_free_queue(ctrl, 0);
1788	return ret;
1789}
1790
1791static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1792{
1793	int i, ret;
1794
1795	for (i = 1; i < ctrl->queue_count; i++) {
1796		ret = nvme_tcp_alloc_queue(ctrl, i);
1797		if (ret)
1798			goto out_free_queues;
1799	}
1800
1801	return 0;
1802
1803out_free_queues:
1804	for (i--; i >= 1; i--)
1805		nvme_tcp_free_queue(ctrl, i);
1806
1807	return ret;
1808}
1809
1810static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1811{
1812	unsigned int nr_io_queues;
1813	int ret;
1814
1815	nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
1816	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1817	if (ret)
1818		return ret;
1819
1820	if (nr_io_queues == 0) {
1821		dev_err(ctrl->device,
1822			"unable to set any I/O queues\n");
1823		return -ENOMEM;
1824	}
1825
1826	ctrl->queue_count = nr_io_queues + 1;
1827	dev_info(ctrl->device,
1828		"creating %d I/O queues.\n", nr_io_queues);
1829
1830	nvmf_set_io_queues(ctrl->opts, nr_io_queues,
1831			   to_tcp_ctrl(ctrl)->io_queues);
1832	return __nvme_tcp_alloc_io_queues(ctrl);
1833}
1834
1835static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1836{
1837	nvme_tcp_stop_io_queues(ctrl);
1838	if (remove)
1839		nvme_remove_io_tag_set(ctrl);
1840	nvme_tcp_free_io_queues(ctrl);
1841}
1842
1843static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1844{
1845	int ret, nr_queues;
1846
1847	ret = nvme_tcp_alloc_io_queues(ctrl);
1848	if (ret)
1849		return ret;
1850
1851	if (new) {
1852		ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
1853				&nvme_tcp_mq_ops,
1854				ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
1855				sizeof(struct nvme_tcp_request));
1856		if (ret)
1857			goto out_free_io_queues;
1858	}
1859
1860	/*
1861	 * Only start IO queues for which we have allocated the tagset
1862	 * and limitted it to the available queues. On reconnects, the
1863	 * queue number might have changed.
1864	 */
1865	nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
1866	ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
1867	if (ret)
1868		goto out_cleanup_connect_q;
1869
1870	if (!new) {
1871		nvme_start_freeze(ctrl);
1872		nvme_unquiesce_io_queues(ctrl);
1873		if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1874			/*
1875			 * If we timed out waiting for freeze we are likely to
1876			 * be stuck.  Fail the controller initialization just
1877			 * to be safe.
1878			 */
1879			ret = -ENODEV;
1880			nvme_unfreeze(ctrl);
1881			goto out_wait_freeze_timed_out;
1882		}
1883		blk_mq_update_nr_hw_queues(ctrl->tagset,
1884			ctrl->queue_count - 1);
1885		nvme_unfreeze(ctrl);
1886	}
1887
1888	/*
1889	 * If the number of queues has increased (reconnect case)
1890	 * start all new queues now.
1891	 */
1892	ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
1893				       ctrl->tagset->nr_hw_queues + 1);
1894	if (ret)
1895		goto out_wait_freeze_timed_out;
1896
1897	return 0;
1898
1899out_wait_freeze_timed_out:
1900	nvme_quiesce_io_queues(ctrl);
1901	nvme_sync_io_queues(ctrl);
1902	nvme_tcp_stop_io_queues(ctrl);
1903out_cleanup_connect_q:
1904	nvme_cancel_tagset(ctrl);
1905	if (new)
1906		nvme_remove_io_tag_set(ctrl);
1907out_free_io_queues:
1908	nvme_tcp_free_io_queues(ctrl);
1909	return ret;
1910}
1911
1912static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1913{
1914	nvme_tcp_stop_queue(ctrl, 0);
1915	if (remove)
1916		nvme_remove_admin_tag_set(ctrl);
1917	nvme_tcp_free_admin_queue(ctrl);
1918}
1919
1920static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1921{
1922	int error;
1923
1924	error = nvme_tcp_alloc_admin_queue(ctrl);
1925	if (error)
1926		return error;
1927
1928	if (new) {
1929		error = nvme_alloc_admin_tag_set(ctrl,
1930				&to_tcp_ctrl(ctrl)->admin_tag_set,
1931				&nvme_tcp_admin_mq_ops,
1932				sizeof(struct nvme_tcp_request));
1933		if (error)
1934			goto out_free_queue;
1935	}
1936
1937	error = nvme_tcp_start_queue(ctrl, 0);
1938	if (error)
1939		goto out_cleanup_tagset;
1940
1941	error = nvme_enable_ctrl(ctrl);
1942	if (error)
1943		goto out_stop_queue;
1944
1945	nvme_unquiesce_admin_queue(ctrl);
1946
1947	error = nvme_init_ctrl_finish(ctrl, false);
1948	if (error)
1949		goto out_quiesce_queue;
1950
1951	return 0;
1952
1953out_quiesce_queue:
1954	nvme_quiesce_admin_queue(ctrl);
1955	blk_sync_queue(ctrl->admin_q);
1956out_stop_queue:
1957	nvme_tcp_stop_queue(ctrl, 0);
1958	nvme_cancel_admin_tagset(ctrl);
1959out_cleanup_tagset:
1960	if (new)
1961		nvme_remove_admin_tag_set(ctrl);
1962out_free_queue:
1963	nvme_tcp_free_admin_queue(ctrl);
1964	return error;
1965}
1966
1967static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1968		bool remove)
1969{
1970	nvme_quiesce_admin_queue(ctrl);
1971	blk_sync_queue(ctrl->admin_q);
1972	nvme_tcp_stop_queue(ctrl, 0);
1973	nvme_cancel_admin_tagset(ctrl);
1974	if (remove)
1975		nvme_unquiesce_admin_queue(ctrl);
1976	nvme_tcp_destroy_admin_queue(ctrl, remove);
1977}
1978
1979static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1980		bool remove)
1981{
1982	if (ctrl->queue_count <= 1)
1983		return;
1984	nvme_quiesce_admin_queue(ctrl);
1985	nvme_quiesce_io_queues(ctrl);
1986	nvme_sync_io_queues(ctrl);
1987	nvme_tcp_stop_io_queues(ctrl);
1988	nvme_cancel_tagset(ctrl);
1989	if (remove)
1990		nvme_unquiesce_io_queues(ctrl);
1991	nvme_tcp_destroy_io_queues(ctrl, remove);
1992}
1993
1994static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1995{
1996	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
1997
1998	/* If we are resetting/deleting then do nothing */
1999	if (state != NVME_CTRL_CONNECTING) {
2000		WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
2001		return;
2002	}
2003
2004	if (nvmf_should_reconnect(ctrl)) {
2005		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2006			ctrl->opts->reconnect_delay);
2007		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2008				ctrl->opts->reconnect_delay * HZ);
2009	} else {
2010		dev_info(ctrl->device, "Removing controller...\n");
2011		nvme_delete_ctrl(ctrl);
2012	}
2013}
2014
2015static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2016{
2017	struct nvmf_ctrl_options *opts = ctrl->opts;
2018	int ret;
2019
2020	ret = nvme_tcp_configure_admin_queue(ctrl, new);
2021	if (ret)
2022		return ret;
2023
2024	if (ctrl->icdoff) {
2025		ret = -EOPNOTSUPP;
2026		dev_err(ctrl->device, "icdoff is not supported!\n");
2027		goto destroy_admin;
2028	}
2029
2030	if (!nvme_ctrl_sgl_supported(ctrl)) {
2031		ret = -EOPNOTSUPP;
2032		dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2033		goto destroy_admin;
2034	}
2035
2036	if (opts->queue_size > ctrl->sqsize + 1)
2037		dev_warn(ctrl->device,
2038			"queue_size %zu > ctrl sqsize %u, clamping down\n",
2039			opts->queue_size, ctrl->sqsize + 1);
2040
2041	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2042		dev_warn(ctrl->device,
2043			"sqsize %u > ctrl maxcmd %u, clamping down\n",
2044			ctrl->sqsize + 1, ctrl->maxcmd);
2045		ctrl->sqsize = ctrl->maxcmd - 1;
2046	}
2047
2048	if (ctrl->queue_count > 1) {
2049		ret = nvme_tcp_configure_io_queues(ctrl, new);
2050		if (ret)
2051			goto destroy_admin;
2052	}
2053
2054	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2055		/*
2056		 * state change failure is ok if we started ctrl delete,
2057		 * unless we're during creation of a new controller to
2058		 * avoid races with teardown flow.
2059		 */
2060		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2061
2062		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2063			     state != NVME_CTRL_DELETING_NOIO);
2064		WARN_ON_ONCE(new);
2065		ret = -EINVAL;
2066		goto destroy_io;
2067	}
2068
2069	nvme_start_ctrl(ctrl);
2070	return 0;
2071
2072destroy_io:
2073	if (ctrl->queue_count > 1) {
2074		nvme_quiesce_io_queues(ctrl);
2075		nvme_sync_io_queues(ctrl);
2076		nvme_tcp_stop_io_queues(ctrl);
2077		nvme_cancel_tagset(ctrl);
2078		nvme_tcp_destroy_io_queues(ctrl, new);
2079	}
2080destroy_admin:
2081	nvme_quiesce_admin_queue(ctrl);
2082	blk_sync_queue(ctrl->admin_q);
2083	nvme_tcp_stop_queue(ctrl, 0);
2084	nvme_cancel_admin_tagset(ctrl);
2085	nvme_tcp_destroy_admin_queue(ctrl, new);
2086	return ret;
2087}
2088
2089static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2090{
2091	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2092			struct nvme_tcp_ctrl, connect_work);
2093	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2094
2095	++ctrl->nr_reconnects;
2096
2097	if (nvme_tcp_setup_ctrl(ctrl, false))
2098		goto requeue;
2099
2100	dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2101			ctrl->nr_reconnects);
2102
2103	ctrl->nr_reconnects = 0;
2104
2105	return;
2106
2107requeue:
2108	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2109			ctrl->nr_reconnects);
2110	nvme_tcp_reconnect_or_remove(ctrl);
2111}
2112
2113static void nvme_tcp_error_recovery_work(struct work_struct *work)
2114{
2115	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2116				struct nvme_tcp_ctrl, err_work);
2117	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2118
2119	nvme_stop_keep_alive(ctrl);
2120	flush_work(&ctrl->async_event_work);
2121	nvme_tcp_teardown_io_queues(ctrl, false);
2122	/* unquiesce to fail fast pending requests */
2123	nvme_unquiesce_io_queues(ctrl);
2124	nvme_tcp_teardown_admin_queue(ctrl, false);
2125	nvme_unquiesce_admin_queue(ctrl);
2126	nvme_auth_stop(ctrl);
2127
2128	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2129		/* state change failure is ok if we started ctrl delete */
2130		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2131
2132		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2133			     state != NVME_CTRL_DELETING_NOIO);
2134		return;
2135	}
2136
2137	nvme_tcp_reconnect_or_remove(ctrl);
2138}
2139
2140static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2141{
2142	nvme_tcp_teardown_io_queues(ctrl, shutdown);
2143	nvme_quiesce_admin_queue(ctrl);
2144	nvme_disable_ctrl(ctrl, shutdown);
2145	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2146}
2147
2148static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2149{
2150	nvme_tcp_teardown_ctrl(ctrl, true);
2151}
2152
2153static void nvme_reset_ctrl_work(struct work_struct *work)
2154{
2155	struct nvme_ctrl *ctrl =
2156		container_of(work, struct nvme_ctrl, reset_work);
2157
2158	nvme_stop_ctrl(ctrl);
2159	nvme_tcp_teardown_ctrl(ctrl, false);
2160
2161	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2162		/* state change failure is ok if we started ctrl delete */
2163		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2164
2165		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2166			     state != NVME_CTRL_DELETING_NOIO);
2167		return;
2168	}
2169
2170	if (nvme_tcp_setup_ctrl(ctrl, false))
2171		goto out_fail;
2172
2173	return;
2174
2175out_fail:
2176	++ctrl->nr_reconnects;
2177	nvme_tcp_reconnect_or_remove(ctrl);
2178}
2179
2180static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2181{
2182	flush_work(&to_tcp_ctrl(ctrl)->err_work);
2183	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2184}
2185
2186static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2187{
2188	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2189
2190	if (list_empty(&ctrl->list))
2191		goto free_ctrl;
2192
2193	mutex_lock(&nvme_tcp_ctrl_mutex);
2194	list_del(&ctrl->list);
2195	mutex_unlock(&nvme_tcp_ctrl_mutex);
2196
2197	nvmf_free_options(nctrl->opts);
2198free_ctrl:
2199	kfree(ctrl->queues);
2200	kfree(ctrl);
2201}
2202
2203static void nvme_tcp_set_sg_null(struct nvme_command *c)
2204{
2205	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2206
2207	sg->addr = 0;
2208	sg->length = 0;
2209	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2210			NVME_SGL_FMT_TRANSPORT_A;
2211}
2212
2213static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2214		struct nvme_command *c, u32 data_len)
2215{
2216	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2217
2218	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2219	sg->length = cpu_to_le32(data_len);
2220	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2221}
2222
2223static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2224		u32 data_len)
2225{
2226	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2227
2228	sg->addr = 0;
2229	sg->length = cpu_to_le32(data_len);
2230	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2231			NVME_SGL_FMT_TRANSPORT_A;
2232}
2233
2234static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2235{
2236	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2237	struct nvme_tcp_queue *queue = &ctrl->queues[0];
2238	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2239	struct nvme_command *cmd = &pdu->cmd;
2240	u8 hdgst = nvme_tcp_hdgst_len(queue);
2241
2242	memset(pdu, 0, sizeof(*pdu));
2243	pdu->hdr.type = nvme_tcp_cmd;
2244	if (queue->hdr_digest)
2245		pdu->hdr.flags |= NVME_TCP_F_HDGST;
2246	pdu->hdr.hlen = sizeof(*pdu);
2247	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2248
2249	cmd->common.opcode = nvme_admin_async_event;
2250	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2251	cmd->common.flags |= NVME_CMD_SGL_METABUF;
2252	nvme_tcp_set_sg_null(cmd);
2253
2254	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2255	ctrl->async_req.offset = 0;
2256	ctrl->async_req.curr_bio = NULL;
2257	ctrl->async_req.data_len = 0;
2258
2259	nvme_tcp_queue_request(&ctrl->async_req, true, true);
2260}
2261
2262static void nvme_tcp_complete_timed_out(struct request *rq)
2263{
2264	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2265	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2266
2267	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2268	nvmf_complete_timed_out_request(rq);
2269}
2270
2271static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
2272{
2273	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2274	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2275	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2276	u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
2277	int qid = nvme_tcp_queue_id(req->queue);
2278
2279	dev_warn(ctrl->device,
2280		"queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
2281		nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
2282		opc, nvme_opcode_str(qid, opc, fctype));
2283
2284	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
2285		/*
2286		 * If we are resetting, connecting or deleting we should
2287		 * complete immediately because we may block controller
2288		 * teardown or setup sequence
2289		 * - ctrl disable/shutdown fabrics requests
2290		 * - connect requests
2291		 * - initialization admin requests
2292		 * - I/O requests that entered after unquiescing and
2293		 *   the controller stopped responding
2294		 *
2295		 * All other requests should be cancelled by the error
2296		 * recovery work, so it's fine that we fail it here.
2297		 */
2298		nvme_tcp_complete_timed_out(rq);
2299		return BLK_EH_DONE;
2300	}
2301
2302	/*
2303	 * LIVE state should trigger the normal error recovery which will
2304	 * handle completing this request.
2305	 */
2306	nvme_tcp_error_recovery(ctrl);
2307	return BLK_EH_RESET_TIMER;
2308}
2309
2310static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2311			struct request *rq)
2312{
2313	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2314	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2315	struct nvme_command *c = &pdu->cmd;
2316
2317	c->common.flags |= NVME_CMD_SGL_METABUF;
2318
2319	if (!blk_rq_nr_phys_segments(rq))
2320		nvme_tcp_set_sg_null(c);
2321	else if (rq_data_dir(rq) == WRITE &&
2322	    req->data_len <= nvme_tcp_inline_data_size(req))
2323		nvme_tcp_set_sg_inline(queue, c, req->data_len);
2324	else
2325		nvme_tcp_set_sg_host_data(c, req->data_len);
2326
2327	return 0;
2328}
2329
2330static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2331		struct request *rq)
2332{
2333	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2334	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2335	struct nvme_tcp_queue *queue = req->queue;
2336	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2337	blk_status_t ret;
2338
2339	ret = nvme_setup_cmd(ns, rq);
2340	if (ret)
2341		return ret;
2342
2343	req->state = NVME_TCP_SEND_CMD_PDU;
2344	req->status = cpu_to_le16(NVME_SC_SUCCESS);
2345	req->offset = 0;
2346	req->data_sent = 0;
2347	req->pdu_len = 0;
2348	req->pdu_sent = 0;
2349	req->h2cdata_left = 0;
2350	req->data_len = blk_rq_nr_phys_segments(rq) ?
2351				blk_rq_payload_bytes(rq) : 0;
2352	req->curr_bio = rq->bio;
2353	if (req->curr_bio && req->data_len)
2354		nvme_tcp_init_iter(req, rq_data_dir(rq));
2355
2356	if (rq_data_dir(rq) == WRITE &&
2357	    req->data_len <= nvme_tcp_inline_data_size(req))
2358		req->pdu_len = req->data_len;
2359
2360	pdu->hdr.type = nvme_tcp_cmd;
2361	pdu->hdr.flags = 0;
2362	if (queue->hdr_digest)
2363		pdu->hdr.flags |= NVME_TCP_F_HDGST;
2364	if (queue->data_digest && req->pdu_len) {
2365		pdu->hdr.flags |= NVME_TCP_F_DDGST;
2366		ddgst = nvme_tcp_ddgst_len(queue);
2367	}
2368	pdu->hdr.hlen = sizeof(*pdu);
2369	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2370	pdu->hdr.plen =
2371		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2372
2373	ret = nvme_tcp_map_data(queue, rq);
2374	if (unlikely(ret)) {
2375		nvme_cleanup_cmd(rq);
2376		dev_err(queue->ctrl->ctrl.device,
2377			"Failed to map data (%d)\n", ret);
2378		return ret;
2379	}
2380
2381	return 0;
2382}
2383
2384static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2385{
2386	struct nvme_tcp_queue *queue = hctx->driver_data;
2387
2388	if (!llist_empty(&queue->req_list))
2389		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2390}
2391
2392static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2393		const struct blk_mq_queue_data *bd)
2394{
2395	struct nvme_ns *ns = hctx->queue->queuedata;
2396	struct nvme_tcp_queue *queue = hctx->driver_data;
2397	struct request *rq = bd->rq;
2398	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2399	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2400	blk_status_t ret;
2401
2402	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2403		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2404
2405	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2406	if (unlikely(ret))
2407		return ret;
2408
2409	nvme_start_request(rq);
2410
2411	nvme_tcp_queue_request(req, true, bd->last);
2412
2413	return BLK_STS_OK;
2414}
2415
2416static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2417{
2418	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
2419
2420	nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues);
2421}
2422
2423static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2424{
2425	struct nvme_tcp_queue *queue = hctx->driver_data;
2426	struct sock *sk = queue->sock->sk;
2427
2428	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2429		return 0;
2430
2431	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2432	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2433		sk_busy_loop(sk, true);
2434	nvme_tcp_try_recv(queue);
2435	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2436	return queue->nr_cqe;
2437}
2438
2439static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2440{
2441	struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
2442	struct sockaddr_storage src_addr;
2443	int ret, len;
2444
2445	len = nvmf_get_address(ctrl, buf, size);
2446
2447	mutex_lock(&queue->queue_lock);
2448
2449	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2450		goto done;
2451	ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
2452	if (ret > 0) {
2453		if (len > 0)
2454			len--; /* strip trailing newline */
2455		len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
2456				(len) ? "," : "", &src_addr);
2457	}
2458done:
2459	mutex_unlock(&queue->queue_lock);
2460
2461	return len;
2462}
2463
2464static const struct blk_mq_ops nvme_tcp_mq_ops = {
2465	.queue_rq	= nvme_tcp_queue_rq,
2466	.commit_rqs	= nvme_tcp_commit_rqs,
2467	.complete	= nvme_complete_rq,
2468	.init_request	= nvme_tcp_init_request,
2469	.exit_request	= nvme_tcp_exit_request,
2470	.init_hctx	= nvme_tcp_init_hctx,
2471	.timeout	= nvme_tcp_timeout,
2472	.map_queues	= nvme_tcp_map_queues,
2473	.poll		= nvme_tcp_poll,
2474};
2475
2476static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2477	.queue_rq	= nvme_tcp_queue_rq,
2478	.complete	= nvme_complete_rq,
2479	.init_request	= nvme_tcp_init_request,
2480	.exit_request	= nvme_tcp_exit_request,
2481	.init_hctx	= nvme_tcp_init_admin_hctx,
2482	.timeout	= nvme_tcp_timeout,
2483};
2484
2485static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2486	.name			= "tcp",
2487	.module			= THIS_MODULE,
2488	.flags			= NVME_F_FABRICS | NVME_F_BLOCKING,
2489	.reg_read32		= nvmf_reg_read32,
2490	.reg_read64		= nvmf_reg_read64,
2491	.reg_write32		= nvmf_reg_write32,
2492	.free_ctrl		= nvme_tcp_free_ctrl,
2493	.submit_async_event	= nvme_tcp_submit_async_event,
2494	.delete_ctrl		= nvme_tcp_delete_ctrl,
2495	.get_address		= nvme_tcp_get_address,
2496	.stop_ctrl		= nvme_tcp_stop_ctrl,
2497};
2498
2499static bool
2500nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2501{
2502	struct nvme_tcp_ctrl *ctrl;
2503	bool found = false;
2504
2505	mutex_lock(&nvme_tcp_ctrl_mutex);
2506	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2507		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2508		if (found)
2509			break;
2510	}
2511	mutex_unlock(&nvme_tcp_ctrl_mutex);
2512
2513	return found;
2514}
2515
2516static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2517		struct nvmf_ctrl_options *opts)
2518{
2519	struct nvme_tcp_ctrl *ctrl;
2520	int ret;
2521
2522	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2523	if (!ctrl)
2524		return ERR_PTR(-ENOMEM);
2525
2526	INIT_LIST_HEAD(&ctrl->list);
2527	ctrl->ctrl.opts = opts;
2528	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2529				opts->nr_poll_queues + 1;
2530	ctrl->ctrl.sqsize = opts->queue_size - 1;
2531	ctrl->ctrl.kato = opts->kato;
2532
2533	INIT_DELAYED_WORK(&ctrl->connect_work,
2534			nvme_tcp_reconnect_ctrl_work);
2535	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2536	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2537
2538	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2539		opts->trsvcid =
2540			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2541		if (!opts->trsvcid) {
2542			ret = -ENOMEM;
2543			goto out_free_ctrl;
2544		}
2545		opts->mask |= NVMF_OPT_TRSVCID;
2546	}
2547
2548	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2549			opts->traddr, opts->trsvcid, &ctrl->addr);
2550	if (ret) {
2551		pr_err("malformed address passed: %s:%s\n",
2552			opts->traddr, opts->trsvcid);
2553		goto out_free_ctrl;
2554	}
2555
2556	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2557		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2558			opts->host_traddr, NULL, &ctrl->src_addr);
2559		if (ret) {
2560			pr_err("malformed src address passed: %s\n",
2561			       opts->host_traddr);
2562			goto out_free_ctrl;
2563		}
2564	}
2565
2566	if (opts->mask & NVMF_OPT_HOST_IFACE) {
2567		if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2568			pr_err("invalid interface passed: %s\n",
2569			       opts->host_iface);
2570			ret = -ENODEV;
2571			goto out_free_ctrl;
2572		}
2573	}
2574
2575	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2576		ret = -EALREADY;
2577		goto out_free_ctrl;
2578	}
2579
2580	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2581				GFP_KERNEL);
2582	if (!ctrl->queues) {
2583		ret = -ENOMEM;
2584		goto out_free_ctrl;
2585	}
2586
2587	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2588	if (ret)
2589		goto out_kfree_queues;
2590
2591	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2592		WARN_ON_ONCE(1);
2593		ret = -EINTR;
2594		goto out_uninit_ctrl;
2595	}
2596
2597	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2598	if (ret)
2599		goto out_uninit_ctrl;
2600
2601	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2602		nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
2603
2604	mutex_lock(&nvme_tcp_ctrl_mutex);
2605	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2606	mutex_unlock(&nvme_tcp_ctrl_mutex);
2607
2608	return &ctrl->ctrl;
2609
2610out_uninit_ctrl:
2611	nvme_uninit_ctrl(&ctrl->ctrl);
2612	nvme_put_ctrl(&ctrl->ctrl);
2613	if (ret > 0)
2614		ret = -EIO;
2615	return ERR_PTR(ret);
2616out_kfree_queues:
2617	kfree(ctrl->queues);
2618out_free_ctrl:
2619	kfree(ctrl);
2620	return ERR_PTR(ret);
2621}
2622
2623static struct nvmf_transport_ops nvme_tcp_transport = {
2624	.name		= "tcp",
2625	.module		= THIS_MODULE,
2626	.required_opts	= NVMF_OPT_TRADDR,
2627	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2628			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2629			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2630			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2631			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2632	.create_ctrl	= nvme_tcp_create_ctrl,
2633};
2634
2635static int __init nvme_tcp_init_module(void)
2636{
2637	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
2638	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
2639	BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
2640	BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
2641	BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
2642	BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
2643	BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
2644	BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
2645
2646	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2647			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2648	if (!nvme_tcp_wq)
2649		return -ENOMEM;
2650
2651	nvmf_register_transport(&nvme_tcp_transport);
2652	return 0;
2653}
2654
2655static void __exit nvme_tcp_cleanup_module(void)
2656{
2657	struct nvme_tcp_ctrl *ctrl;
2658
2659	nvmf_unregister_transport(&nvme_tcp_transport);
2660
2661	mutex_lock(&nvme_tcp_ctrl_mutex);
2662	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2663		nvme_delete_ctrl(&ctrl->ctrl);
2664	mutex_unlock(&nvme_tcp_ctrl_mutex);
2665	flush_workqueue(nvme_delete_wq);
2666
2667	destroy_workqueue(nvme_tcp_wq);
2668}
2669
2670module_init(nvme_tcp_init_module);
2671module_exit(nvme_tcp_cleanup_module);
2672
2673MODULE_LICENSE("GPL v2");
2674