xref: /kernel/linux/linux-5.10/lib/iov_iter.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2#include <crypto/hash.h>
3#include <linux/export.h>
4#include <linux/bvec.h>
5#include <linux/fault-inject-usercopy.h>
6#include <linux/uio.h>
7#include <linux/pagemap.h>
8#include <linux/slab.h>
9#include <linux/vmalloc.h>
10#include <linux/splice.h>
11#include <linux/compat.h>
12#include <net/checksum.h>
13#include <linux/scatterlist.h>
14#include <linux/instrumented.h>
15
/*
 * Build-time switch: when defined, sanity() in this file is compiled as a
 * real consistency check of pipe-backed iterators; otherwise sanity(i) is
 * the constant "true".
 */
#define PIPE_PARANOIA /* for now */
17
/*
 * iterate_iovec() - walk up to @n bytes of the iovec array of @i.
 *
 * @i:    iterator; only i->iov is read here.
 * @n:    in: number of bytes wanted; out: number of bytes actually
 *        processed (the original value minus whatever was left over).
 * @__v:  struct iovec lvalue that is filled in with the current chunk
 *        (iov_base/iov_len) before each evaluation of @STEP.
 * @__p:  iovec pointer lvalue; starts at i->iov and is stepped forward
 *        one segment at a time.
 * @skip: in: offset into the first segment; out: offset reached inside
 *        the last segment that supplied a chunk.
 * @STEP: expression evaluated once per non-empty chunk.  Its value is
 *        taken as the number of bytes of that chunk that were NOT
 *        processed; any non-zero value ends the walk.
 *
 * Zero-length segments are stepped over without evaluating @STEP.
 */
#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
	size_t left;					\
	size_t wanted = n;				\
	__p = i->iov;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		left = (STEP);				\
		__v.iov_len -= left;			\
		skip += __v.iov_len;			\
		n -= __v.iov_len;			\
	} else {					\
		left = 0;				\
	}						\
	while (unlikely(!left && n)) {			\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		left = (STEP);				\
		__v.iov_len -= left;			\
		skip = __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	n = wanted - n;					\
}
45
/*
 * iterate_kvec() - walk @n bytes of the kvec array of @i.
 *
 * Same shape as iterate_iovec(), with two differences visible below:
 * the value of @STEP is discarded (cast to void), so a step cannot stop
 * the walk early, and @n is restored to its original value on exit.
 *
 * @i:    iterator; only i->kvec is read here.
 * @n:    number of bytes to walk; unchanged on exit.
 * @__v:  struct kvec lvalue filled in with the current chunk before
 *        each evaluation of @STEP.
 * @__p:  kvec pointer lvalue; starts at i->kvec and is stepped forward.
 * @skip: in: offset into the first segment; out: offset reached inside
 *        the last segment that supplied a chunk.
 * @STEP: expression evaluated once per non-empty chunk.
 *
 * Zero-length segments are stepped over without evaluating @STEP.
 */
#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
	size_t wanted = n;				\
	__p = i->kvec;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		(void)(STEP);				\
		skip += __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	while (unlikely(n)) {				\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		(void)(STEP);				\
		skip = __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	n = wanted;					\
}
68
/*
 * iterate_bvec() - walk @n bytes of the bio_vec array of @i.
 *
 * A local bvec_iter is initialised with bi_size = @n, bi_bvec_done = @skip
 * and bi_idx = 0, and for_each_bvec() is run from it over i->bvec.
 *
 * @i:    iterator; only i->bvec is read here.
 * @n:    number of bytes to walk; not modified by this macro.
 * @__v:  struct bio_vec lvalue holding the current chunk for @STEP.
 * @__bi: struct bvec_iter lvalue used as the loop cursor; the callers in
 *        this file read it after the loop.
 * @skip: offset into the first bio_vec; not modified by this macro.
 * @STEP: expression evaluated once per chunk with a non-zero bv_len; its
 *        value is discarded.
 */
#define iterate_bvec(i, n, __v, __bi, skip, STEP) {	\
	struct bvec_iter __start;			\
	__start.bi_size = n;				\
	__start.bi_bvec_done = skip;			\
	__start.bi_idx = 0;				\
	for_each_bvec(__v, i->bvec, __bi, __start) {	\
		if (!__v.bv_len)			\
			continue;			\
		(void)(STEP);				\
	}						\
}
80
/*
 * iterate_all_kinds() - run a per-chunk step over the first @n bytes of @i
 * without advancing the iterator.
 *
 * @i: iterator; this macro itself only reads it (type, iov_offset and the
 *     segment array).
 * @n: number of bytes to walk; nothing is done when it is zero.  On the
 *     iovec path it comes back as the number of bytes processed (see
 *     iterate_iovec()); the other paths leave it unchanged.
 * @v: name of the local chunk variable declared for the step expressions
 *     (struct bio_vec for @B, struct kvec for @K, struct iovec for @I).
 * @I: step used when the iterator is none of BVEC/KVEC/DISCARD.
 * @B: step used for ITER_BVEC.
 * @K: step used for ITER_KVEC.
 *
 * ITER_DISCARD iterators match an empty branch: no step is evaluated.
 */
#define iterate_all_kinds(i, n, v, I, B, K) {			\
	if (likely(n)) {					\
		size_t skip = i->iov_offset;			\
		if (unlikely(i->type & ITER_BVEC)) {		\
			struct bio_vec v;			\
			struct bvec_iter __bi;			\
			iterate_bvec(i, n, v, __bi, skip, (B))	\
		} else if (unlikely(i->type & ITER_KVEC)) {	\
			const struct kvec *kvec;		\
			struct kvec v;				\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
		} else if (unlikely(i->type & ITER_DISCARD)) {	\
		} else {					\
			const struct iovec *iov;		\
			struct iovec v;				\
			iterate_iovec(i, n, v, iov, skip, (I))	\
		}						\
	}							\
}
100
/*
 * iterate_and_advance() - run a per-chunk step over up to @n bytes of @i
 * and move the iterator past the bytes that were walked.
 *
 * @i: iterator to walk and update.
 * @n: in: bytes requested, clamped to i->count first; out: the amount the
 *     iterator was advanced by (on the iovec path this can be less than
 *     the clamped request, see iterate_iovec()).
 * @v: name of the local chunk variable declared for the step expressions.
 * @I: step for plain iovec iterators.
 * @B: step for ITER_BVEC.
 * @K: step for ITER_KVEC.
 *
 * Nothing happens when i->count is zero.  Otherwise, after the walk:
 *  - BVEC: i->bvec is moved to the bio_vec selected by the loop cursor,
 *    i->nr_segs drops by the number of entries stepped over and the new
 *    offset is the cursor's bi_bvec_done;
 *  - KVEC / iovec: if the walk ended exactly at the end of a segment the
 *    segment pointer is moved one further and the offset reset to 0;
 *    i->nr_segs drops by the number of entries stepped over;
 *  - DISCARD: no step is evaluated, only the offset grows by @n.
 * Finally i->count is reduced by @n and i->iov_offset is stored.
 */
#define iterate_and_advance(i, n, v, I, B, K) {			\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (i->count) {						\
		size_t skip = i->iov_offset;			\
		if (unlikely(i->type & ITER_BVEC)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			struct bio_vec v;			\
			struct bvec_iter __bi;			\
			iterate_bvec(i, n, v, __bi, skip, (B))	\
			i->bvec = __bvec_iter_bvec(i->bvec, __bi);	\
			i->nr_segs -= i->bvec - bvec;		\
			skip = __bi.bi_bvec_done;		\
		} else if (unlikely(i->type & ITER_KVEC)) {	\
			const struct kvec *kvec;		\
			struct kvec v;				\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
			if (skip == kvec->iov_len) {		\
				kvec++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (unlikely(i->type & ITER_DISCARD)) {	\
			skip += n;				\
		} else {					\
			const struct iovec *iov;		\
			struct iovec v;				\
			iterate_iovec(i, n, v, iov, skip, (I))	\
			if (skip == iov->iov_len) {		\
				iov++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		}						\
		i->count -= n;					\
		i->iov_offset = skip;				\
	}							\
}
141
142static int copyout(void __user *to, const void *from, size_t n)
143{
144	if (should_fail_usercopy())
145		return n;
146	if (access_ok(to, n)) {
147		instrument_copy_to_user(to, from, n);
148		n = raw_copy_to_user(to, from, n);
149	}
150	return n;
151}
152
153static int copyin(void *to, const void __user *from, size_t n)
154{
155	if (should_fail_usercopy())
156		return n;
157	if (access_ok(from, n)) {
158		instrument_copy_from_user(to, from, n);
159		n = raw_copy_from_user(to, from, n);
160	}
161	return n;
162}
163
/*
 * Copy up to @bytes bytes from @page, starting at @offset, into the user
 * iovec array of @i.
 *
 * @bytes is clamped to i->count first.  Returns the number of bytes that
 * were copied and advances @i (count, nr_segs, iov, iov_offset) by that
 * amount.  The result is short when copyout() leaves part of a chunk
 * uncopied.
 */
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *from;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	/* size of the first chunk: what is left of the current segment */
	copy = min(bytes, iov->iov_len - skip);

	/*
	 * Only on CONFIG_HIGHMEM, and only when fault_in_pages_writeable()
	 * returned 0 for the first chunk: attempt the copy under
	 * kmap_atomic().
	 */
	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
		kaddr = kmap_atomic(page);
		from = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyout(buf, from, copy);
		copy -= left;
		skip += copy;
		from += copy;
		bytes -= copy;

		/* keep going segment by segment until a short copy or done */
		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyout(buf, from, copy);
			copy -= left;
			skip = copy;
			from += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		/*
		 * Something is still pending: carry the position reached
		 * (page offset, user pointer, rest of the current segment)
		 * over to the kmap() path below.
		 */
		offset = from - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	from = kaddr + offset;
	left = copyout(buf, from, copy);
	copy -= left;
	skip += copy;
	from += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyout(buf, from, copy);
		copy -= left;
		skip = copy;
		from += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	/* stopped exactly at the end of a segment: point at the next one */
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	/* "wanted - bytes" is the amount actually copied */
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}
247
/*
 * Copy up to @bytes bytes from the user iovec array of @i into @page,
 * starting at @offset within the page.
 *
 * @bytes is clamped to i->count first.  Returns the number of bytes that
 * were copied and advances @i (count, nr_segs, iov, iov_offset) by that
 * amount.  The result is short when copyin() leaves part of a chunk
 * uncopied.
 */
static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *to;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	/* size of the first chunk: what is left of the current segment */
	copy = min(bytes, iov->iov_len - skip);

	/*
	 * Only on CONFIG_HIGHMEM, and only when fault_in_pages_readable()
	 * returned 0 for the first chunk: attempt the copy under
	 * kmap_atomic().
	 */
	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
		kaddr = kmap_atomic(page);
		to = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyin(to, buf, copy);
		copy -= left;
		skip += copy;
		to += copy;
		bytes -= copy;

		/* keep going segment by segment until a short copy or done */
		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyin(to, buf, copy);
			copy -= left;
			skip = copy;
			to += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		/*
		 * Something is still pending: carry the position reached
		 * (page offset, user pointer, rest of the current segment)
		 * over to the kmap() path below.
		 */
		offset = to - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	to = kaddr + offset;
	left = copyin(to, buf, copy);
	copy -= left;
	skip += copy;
	to += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyin(to, buf, copy);
		copy -= left;
		skip = copy;
		to += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	/* stopped exactly at the end of a segment: point at the next one */
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	/* "wanted - bytes" is the amount actually copied */
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}
331
#ifdef PIPE_PARANOIA
/*
 * Check that a pipe-backed iterator agrees with the state of its pipe.
 *
 * With a non-zero i->iov_offset the pipe must be non-empty, the iterator
 * must sit on the last occupied slot and the offset must equal the end of
 * that buffer (offset + len).  With a zero offset the iterator must sit
 * exactly at pipe->head.
 *
 * Returns true when consistent; otherwise dumps the pipe state, warns and
 * returns false.
 */
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (!i->iov_offset) {
		/* must be right after the last buffer */
		if (i_head == p_head)
			return true;
	} else if (likely(p_occupancy != 0) && likely(i_head == p_head - 1)) {
		/* non-empty pipe, iterator on the last buffer ... */
		const struct pipe_buffer *last = &pipe->bufs[i_head & p_mask];

		/* ... and positioned at the end of that segment */
		if (likely(last->offset + last->len == i->iov_offset))
			return true;
	}

	/* Inconsistent: report the iterator and every slot of the ring. */
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++) {
		const struct pipe_buffer *b = &pipe->bufs[idx];

		printk(KERN_ERR "[%p %p %d %d]\n",
			b->ops, b->page, b->offset, b->len);
	}
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif
374
375static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
376			 struct iov_iter *i)
377{
378	struct pipe_inode_info *pipe = i->pipe;
379	struct pipe_buffer *buf;
380	unsigned int p_tail = pipe->tail;
381	unsigned int p_mask = pipe->ring_size - 1;
382	unsigned int i_head = i->head;
383	size_t off;
384
385	if (unlikely(bytes > i->count))
386		bytes = i->count;
387
388	if (unlikely(!bytes))
389		return 0;
390
391	if (!sanity(i))
392		return 0;
393
394	off = i->iov_offset;
395	buf = &pipe->bufs[i_head & p_mask];
396	if (off) {
397		if (offset == off && buf->page == page) {
398			/* merge with the last one */
399			buf->len += bytes;
400			i->iov_offset += bytes;
401			goto out;
402		}
403		i_head++;
404		buf = &pipe->bufs[i_head & p_mask];
405	}
406	if (pipe_full(i_head, p_tail, pipe->max_usage))
407		return 0;
408
409	buf->ops = &page_cache_pipe_buf_ops;
410	buf->flags = 0;
411	get_page(page);
412	buf->page = page;
413	buf->offset = offset;
414	buf->len = bytes;
415
416	pipe->head = i_head + 1;
417	i->iov_offset = offset + bytes;
418	i->head = i_head;
419out:
420	i->count -= bytes;
421	return bytes;
422}
423
/*
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * bytes.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Only user iovec iterators (iter_is_iovec()) are examined; for every other
 * iterator kind the function returns 0 without touching anything.  The
 * iterator itself is not advanced.
 *
 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 * because it is an invalid address).
 */
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
	size_t skip = i->iov_offset;
	const struct iovec *iov;
	int err;
	struct iovec v;

	if (iter_is_iovec(i)) {
		/*
		 * The step returns straight out of this function on the first
		 * failing chunk; otherwise it yields 0 so the walk continues.
		 */
		iterate_iovec(i, bytes, v, iov, skip, ({
			err = fault_in_pages_readable(v.iov_base, v.iov_len);
			if (unlikely(err))
			return err;
		0;}))
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);
448
449void iov_iter_init(struct iov_iter *i, unsigned int direction,
450			const struct iovec *iov, unsigned long nr_segs,
451			size_t count)
452{
453	WARN_ON(direction & ~(READ | WRITE));
454	direction &= READ | WRITE;
455
456	/* It will get better.  Eventually... */
457	if (uaccess_kernel()) {
458		i->type = ITER_KVEC | direction;
459		i->kvec = (struct kvec *)iov;
460	} else {
461		i->type = ITER_IOVEC | direction;
462		i->iov = iov;
463	}
464	i->nr_segs = nr_segs;
465	i->iov_offset = 0;
466	i->count = count;
467}
468EXPORT_SYMBOL(iov_iter_init);
469
470static void memzero_page(struct page *page, size_t offset, size_t len)
471{
472	char *addr = kmap_atomic(page);
473	memset(addr + offset, 0, len);
474	kunmap_atomic(addr);
475}
476
477static inline bool allocated(struct pipe_buffer *buf)
478{
479	return buf->ops == &default_pipe_buf_ops;
480}
481
482static inline void data_start(const struct iov_iter *i,
483			      unsigned int *iter_headp, size_t *offp)
484{
485	unsigned int p_mask = i->pipe->ring_size - 1;
486	unsigned int iter_head = i->head;
487	size_t off = i->iov_offset;
488
489	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
490		    off == PAGE_SIZE)) {
491		iter_head++;
492		off = 0;
493	}
494	*iter_headp = iter_head;
495	*offp = off;
496}
497
498static size_t push_pipe(struct iov_iter *i, size_t size,
499			int *iter_headp, size_t *offp)
500{
501	struct pipe_inode_info *pipe = i->pipe;
502	unsigned int p_tail = pipe->tail;
503	unsigned int p_mask = pipe->ring_size - 1;
504	unsigned int iter_head;
505	size_t off;
506	ssize_t left;
507
508	if (unlikely(size > i->count))
509		size = i->count;
510	if (unlikely(!size))
511		return 0;
512
513	left = size;
514	data_start(i, &iter_head, &off);
515	*iter_headp = iter_head;
516	*offp = off;
517	if (off) {
518		left -= PAGE_SIZE - off;
519		if (left <= 0) {
520			pipe->bufs[iter_head & p_mask].len += size;
521			return size;
522		}
523		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
524		iter_head++;
525	}
526	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
527		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
528		struct page *page = alloc_page(GFP_USER);
529		if (!page)
530			break;
531
532		buf->ops = &default_pipe_buf_ops;
533		buf->flags = 0;
534		buf->page = page;
535		buf->offset = 0;
536		buf->len = min_t(ssize_t, left, PAGE_SIZE);
537		left -= buf->len;
538		iter_head++;
539		pipe->head = iter_head;
540
541		if (left == 0)
542			return size;
543	}
544	return size - left;
545}
546
547static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
548				struct iov_iter *i)
549{
550	struct pipe_inode_info *pipe = i->pipe;
551	unsigned int p_mask = pipe->ring_size - 1;
552	unsigned int i_head;
553	size_t n, off;
554
555	if (!sanity(i))
556		return 0;
557
558	bytes = n = push_pipe(i, bytes, &i_head, &off);
559	if (unlikely(!n))
560		return 0;
561	do {
562		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
563		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
564		i->head = i_head;
565		i->iov_offset = off + chunk;
566		n -= chunk;
567		addr += chunk;
568		off = 0;
569		i_head++;
570	} while (n);
571	i->count -= bytes;
572	return bytes;
573}
574
575static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
576			      __wsum sum, size_t off)
577{
578	__wsum next = csum_partial_copy_nocheck(from, to, len);
579	return csum_block_add(sum, next, off);
580}
581
582static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
583					 struct csum_state *csstate,
584					 struct iov_iter *i)
585{
586	struct pipe_inode_info *pipe = i->pipe;
587	unsigned int p_mask = pipe->ring_size - 1;
588	__wsum sum = csstate->csum;
589	size_t off = csstate->off;
590	unsigned int i_head;
591	size_t n, r;
592
593	if (!sanity(i))
594		return 0;
595
596	bytes = n = push_pipe(i, bytes, &i_head, &r);
597	if (unlikely(!n))
598		return 0;
599	do {
600		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
601		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
602		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
603		kunmap_atomic(p);
604		i->head = i_head;
605		i->iov_offset = r + chunk;
606		n -= chunk;
607		off += chunk;
608		addr += chunk;
609		r = 0;
610		i_head++;
611	} while (n);
612	i->count -= bytes;
613	csstate->csum = sum;
614	csstate->off = off;
615	return bytes;
616}
617
/*
 * Copy up to @bytes bytes from kernel buffer @addr into iterator @i.
 *
 * Pipe-backed iterators are handed to copy_pipe_to_iter().  For the other
 * kinds iterate_and_advance() clamps @bytes to i->count, runs the matching
 * per-chunk copy and advances @i.  Returns @bytes as left by that macro,
 * i.e. the amount the iterator was advanced by.
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	const char *from = addr;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	/*
	 * Every step first bumps "from" past the chunk and then subtracts the
	 * chunk length again, so it passes the pre-bump address while leaving
	 * "from" ready for the next chunk.
	 */
	iterate_and_advance(i, bytes, v,
		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
		memcpy_to_page(v.bv_page, v.bv_offset,
			       (from += v.bv_len) - v.bv_len, v.bv_len),
		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);
635
636#ifdef CONFIG_ARCH_HAS_COPY_MC
637static int copyout_mc(void __user *to, const void *from, size_t n)
638{
639	if (access_ok(to, n)) {
640		instrument_copy_to_user(to, from, n);
641		n = copy_mc_to_user((__force void *) to, from, n);
642	}
643	return n;
644}
645
646static unsigned long copy_mc_to_page(struct page *page, size_t offset,
647		const char *from, size_t len)
648{
649	unsigned long ret;
650	char *to;
651
652	to = kmap_atomic(page);
653	ret = copy_mc_to_kernel(to + offset, from, len);
654	kunmap_atomic(to);
655
656	return ret;
657}
658
659static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
660				struct iov_iter *i)
661{
662	struct pipe_inode_info *pipe = i->pipe;
663	unsigned int p_mask = pipe->ring_size - 1;
664	unsigned int i_head;
665	size_t n, off, xfer = 0;
666
667	if (!sanity(i))
668		return 0;
669
670	bytes = n = push_pipe(i, bytes, &i_head, &off);
671	if (unlikely(!n))
672		return 0;
673	do {
674		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
675		unsigned long rem;
676
677		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
678					    off, addr, chunk);
679		i->head = i_head;
680		i->iov_offset = off + chunk - rem;
681		xfer += chunk - rem;
682		if (rem)
683			break;
684		n -= chunk;
685		addr += chunk;
686		off = 0;
687		i_head++;
688	} while (n);
689	i->count -= xfer;
690	return xfer;
691}
692
/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter().
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied.
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	const char *from = addr;
	unsigned long rem, curr_addr, s_addr = (unsigned long) addr;

	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	/*
	 * In the bvec and kvec steps a non-zero remainder returns straight
	 * out of this function: "from" has already been bumped past the
	 * chunk, so the bytes copied are (from - addr) minus the remainder.
	 * That return happens inside the macro body, before the macro's
	 * trailing iterator update is reached.
	 */
	iterate_and_advance(i, bytes, v,
		copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
			   v.iov_len),
		({
		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
				      (from += v.bv_len) - v.bv_len, v.bv_len);
		if (rem) {
			curr_addr = (unsigned long) from;
			bytes = curr_addr - s_addr - rem;
			return bytes;
		}
		}),
		({
		rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
					- v.iov_len, v.iov_len);
		if (rem) {
			curr_addr = (unsigned long) from;
			bytes = curr_addr - s_addr - rem;
			return bytes;
		}
		})
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
751#endif /* CONFIG_ARCH_HAS_COPY_MC */
752
/*
 * Copy up to @bytes bytes from iterator @i into kernel buffer @addr.
 *
 * Pipe-backed iterators are not supported: they trigger a warning and 0 is
 * returned.  Otherwise iterate_and_advance() clamps @bytes to i->count,
 * runs the matching per-chunk copy and advances @i.  Returns @bytes as left
 * by that macro, i.e. the amount the iterator was advanced by.
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	/*
	 * Every step bumps "to" past the chunk and subtracts the chunk length
	 * again, so it passes the pre-bump address while leaving "to" ready
	 * for the next chunk.
	 */
	iterate_and_advance(i, bytes, v,
		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);
772
/*
 * All-or-nothing variant of _copy_from_iter().
 *
 * Returns false, leaving @i where it was, if @i is pipe-backed (with a
 * warning), if it holds fewer than @bytes bytes, or if copyin() reports a
 * short copy for any user chunk.  In the last case earlier chunks have
 * already been written to @addr.  On success the iterator is advanced by
 * @bytes and true is returned.
 */
bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return false;
	}
	if (unlikely(i->count < bytes))
		return false;

	if (iter_is_iovec(i))
		might_fault();
	/* walk without advancing; the iovec step bails out on a short copy */
	iterate_all_kinds(i, bytes, v, ({
		if (copyin((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len))
			return false;
		0;}),
		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	/* everything was copied: now consume it from the iterator */
	iov_iter_advance(i, bytes);
	return true;
}
EXPORT_SYMBOL(_copy_from_iter_full);
799
/*
 * Like _copy_from_iter(), but user chunks are copied with
 * __copy_from_user_inatomic_nocache().  bvec and kvec chunks use the same
 * memcpy_from_page()/memcpy() steps as _copy_from_iter().
 *
 * Pipe-backed iterators trigger a warning and return 0.  Returns @bytes as
 * left by iterate_and_advance(), i.e. the amount the iterator was advanced
 * by.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
					 v.iov_base, v.iov_len),
		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);
818
819#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Pipe-backed iterators are not supported: they trigger a warning and 0 is
 * returned.
 *
 * Return: number of bytes the iterator was advanced by.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
					 v.iov_base, v.iov_len),
		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
			v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
853#endif
854
/*
 * All-or-nothing variant of _copy_from_iter_nocache().
 *
 * Returns false, leaving @i where it was, if @i is pipe-backed (with a
 * warning), if it holds fewer than @bytes bytes, or if
 * __copy_from_user_inatomic_nocache() returns non-zero for any user chunk.
 * In the last case earlier chunks have already been written to @addr.  On
 * success the iterator is advanced by @bytes and true is returned.
 */
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return false;
	}
	if (unlikely(i->count < bytes))
		return false;
	/* walk without advancing; the iovec step bails out on a short copy */
	iterate_all_kinds(i, bytes, v, ({
		if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
					     v.iov_base, v.iov_len))
			return false;
		0;}),
		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	/* everything was copied: now consume it from the iterator */
	iov_iter_advance(i, bytes);
	return true;
}
EXPORT_SYMBOL(_copy_from_iter_full_nocache);
878
879static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
880{
881	struct page *head;
882	size_t v = n + offset;
883
884	/*
885	 * The general case needs to access the page order in order
886	 * to compute the page size.
887	 * However, we mostly deal with order-0 pages and thus can
888	 * avoid a possible cache line miss for requests that fit all
889	 * page orders.
890	 */
891	if (n <= v && v <= PAGE_SIZE)
892		return true;
893
894	head = compound_head(page);
895	v += (page - head) << PAGE_SHIFT;
896
897	if (likely(n <= v && v <= (page_size(head))))
898		return true;
899	WARN_ON(1);
900	return false;
901}
902
903size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
904			 struct iov_iter *i)
905{
906	if (unlikely(!page_copy_sane(page, offset, bytes)))
907		return 0;
908	if (i->type & (ITER_BVEC|ITER_KVEC)) {
909		void *kaddr = kmap_atomic(page);
910		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
911		kunmap_atomic(kaddr);
912		return wanted;
913	} else if (unlikely(iov_iter_is_discard(i))) {
914		if (unlikely(i->count < bytes))
915			bytes = i->count;
916		i->count -= bytes;
917		return bytes;
918	} else if (likely(!iov_iter_is_pipe(i)))
919		return copy_page_to_iter_iovec(page, offset, bytes, i);
920	else
921		return copy_page_to_iter_pipe(page, offset, bytes, i);
922}
923EXPORT_SYMBOL(copy_page_to_iter);
924
925size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
926			 struct iov_iter *i)
927{
928	if (unlikely(!page_copy_sane(page, offset, bytes)))
929		return 0;
930	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
931		WARN_ON(1);
932		return 0;
933	}
934	if (i->type & (ITER_BVEC|ITER_KVEC)) {
935		void *kaddr = kmap_atomic(page);
936		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
937		kunmap_atomic(kaddr);
938		return wanted;
939	} else
940		return copy_page_from_iter_iovec(page, offset, bytes, i);
941}
942EXPORT_SYMBOL(copy_page_from_iter);
943
944static size_t pipe_zero(size_t bytes, struct iov_iter *i)
945{
946	struct pipe_inode_info *pipe = i->pipe;
947	unsigned int p_mask = pipe->ring_size - 1;
948	unsigned int i_head;
949	size_t n, off;
950
951	if (!sanity(i))
952		return 0;
953
954	bytes = n = push_pipe(i, bytes, &i_head, &off);
955	if (unlikely(!n))
956		return 0;
957
958	do {
959		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
960		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
961		i->head = i_head;
962		i->iov_offset = off + chunk;
963		n -= chunk;
964		off = 0;
965		i_head++;
966	} while (n);
967	i->count -= bytes;
968	return bytes;
969}
970
/*
 * Write up to @bytes zero bytes into iterator @i and advance it.
 *
 * Pipe-backed iterators are handled by pipe_zero().  Otherwise
 * iterate_and_advance() clamps @bytes to i->count and clears each chunk
 * with clear_user() (user iovec), memzero_page() (bvec) or memset() (kvec).
 * Returns @bytes as left by that macro, i.e. the amount the iterator was
 * advanced by.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, v,
		clear_user(v.iov_base, v.iov_len),
		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
		memset(v.iov_base, 0, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
984
/*
 * Copy up to @bytes bytes from iterator @i into @page at @offset while the
 * page is mapped with kmap_atomic().  The iterator is NOT advanced (the
 * walk uses iterate_all_kinds()).
 *
 * Returns 0 if page_copy_sane() rejects the range, and 0 with a warning for
 * pipe and discard iterators.  Otherwise returns @bytes as left by the
 * macro; on the user iovec path that is the number of bytes actually
 * copied.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	/* the mapping is taken up front, so every exit below must drop it */
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	/*
	 * Every step bumps "p" past the chunk and subtracts the chunk length
	 * again, so it passes the pre-bump address while leaving "p" ready
	 * for the next chunk.
	 */
	iterate_all_kinds(i, bytes, v,
		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1008
1009static inline void pipe_truncate(struct iov_iter *i)
1010{
1011	struct pipe_inode_info *pipe = i->pipe;
1012	unsigned int p_tail = pipe->tail;
1013	unsigned int p_head = pipe->head;
1014	unsigned int p_mask = pipe->ring_size - 1;
1015
1016	if (!pipe_empty(p_head, p_tail)) {
1017		struct pipe_buffer *buf;
1018		unsigned int i_head = i->head;
1019		size_t off = i->iov_offset;
1020
1021		if (off) {
1022			buf = &pipe->bufs[i_head & p_mask];
1023			buf->len = off - buf->offset;
1024			i_head++;
1025		}
1026		while (p_head != i_head) {
1027			p_head--;
1028			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
1029		}
1030
1031		pipe->head = p_head;
1032	}
1033}
1034
1035static void pipe_advance(struct iov_iter *i, size_t size)
1036{
1037	struct pipe_inode_info *pipe = i->pipe;
1038	if (unlikely(i->count < size))
1039		size = i->count;
1040	if (size) {
1041		struct pipe_buffer *buf;
1042		unsigned int p_mask = pipe->ring_size - 1;
1043		unsigned int i_head = i->head;
1044		size_t off = i->iov_offset, left = size;
1045
1046		if (off) /* make it relative to the beginning of buffer */
1047			left += off - pipe->bufs[i_head & p_mask].offset;
1048		while (1) {
1049			buf = &pipe->bufs[i_head & p_mask];
1050			if (left <= buf->len)
1051				break;
1052			left -= buf->len;
1053			i_head++;
1054		}
1055		i->head = i_head;
1056		i->iov_offset = buf->offset + left;
1057	}
1058	i->count -= size;
1059	/* ... and discard everything past that point */
1060	pipe_truncate(i);
1061}
1062
1063void iov_iter_advance(struct iov_iter *i, size_t size)
1064{
1065	if (unlikely(iov_iter_is_pipe(i))) {
1066		pipe_advance(i, size);
1067		return;
1068	}
1069	if (unlikely(iov_iter_is_discard(i))) {
1070		i->count -= size;
1071		return;
1072	}
1073	iterate_and_advance(i, size, v, 0, 0, 0)
1074}
1075EXPORT_SYMBOL(iov_iter_advance);
1076
/*
 * Move iterator @i backwards by @unroll bytes, undoing earlier advances.
 *
 * A zero @unroll is a no-op; a value above MAX_RW_COUNT triggers a warning
 * and is ignored.  Otherwise i->count grows by @unroll and the position is
 * walked back:
 *  - pipe: back through the pipe buffers, not past i->start_head when the
 *    whole amount has been consumed, followed by pipe_truncate();
 *  - discard: only the count changes;
 *  - bvec / kvec / iovec: back through the segment array, restoring
 *    i->nr_segs as earlier segments come back into range.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			/* bytes of this buffer that lie before the position */
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			/* back at the slot the iterator started from: stop */
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			/* continue from the end of the previous buffer */
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	/* the whole amount fits inside the current segment */
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			/* step to the previous segment and take its length */
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			/* step to the previous segment and take its length */
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);
1144
1145/*
1146 * Return the count of just the current iov_iter segment.
1147 */
1148size_t iov_iter_single_seg_count(const struct iov_iter *i)
1149{
1150	if (unlikely(iov_iter_is_pipe(i)))
1151		return i->count;	// it is a silly place, anyway
1152	if (i->nr_segs == 1)
1153		return i->count;
1154	if (unlikely(iov_iter_is_discard(i)))
1155		return i->count;
1156	else if (iov_iter_is_bvec(i))
1157		return min(i->count, i->bvec->bv_len - i->iov_offset);
1158	else
1159		return min(i->count, i->iov->iov_len - i->iov_offset);
1160}
1161EXPORT_SYMBOL(iov_iter_single_seg_count);
1162
1163void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1164			const struct kvec *kvec, unsigned long nr_segs,
1165			size_t count)
1166{
1167	WARN_ON(direction & ~(READ | WRITE));
1168	i->type = ITER_KVEC | (direction & (READ | WRITE));
1169	i->kvec = kvec;
1170	i->nr_segs = nr_segs;
1171	i->iov_offset = 0;
1172	i->count = count;
1173}
1174EXPORT_SYMBOL(iov_iter_kvec);
1175
1176void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1177			const struct bio_vec *bvec, unsigned long nr_segs,
1178			size_t count)
1179{
1180	WARN_ON(direction & ~(READ | WRITE));
1181	i->type = ITER_BVEC | (direction & (READ | WRITE));
1182	i->bvec = bvec;
1183	i->nr_segs = nr_segs;
1184	i->iov_offset = 0;
1185	i->count = count;
1186}
1187EXPORT_SYMBOL(iov_iter_bvec);
1188
1189void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1190			struct pipe_inode_info *pipe,
1191			size_t count)
1192{
1193	BUG_ON(direction != READ);
1194	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1195	i->type = ITER_PIPE | READ;
1196	i->pipe = pipe;
1197	i->head = pipe->head;
1198	i->iov_offset = 0;
1199	i->count = count;
1200	i->start_head = i->head;
1201}
1202EXPORT_SYMBOL(iov_iter_pipe);
1203
1204/**
1205 * iov_iter_discard - Initialise an I/O iterator that discards data
1206 * @i: The iterator to initialise.
1207 * @direction: The direction of the transfer.
1208 * @count: The size of the I/O buffer in bytes.
1209 *
1210 * Set up an I/O iterator that just discards everything that's written to it.
1211 * It's only available as a READ iterator.
1212 */
1213void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1214{
1215	BUG_ON(direction != READ);
1216	i->type = ITER_DISCARD | READ;
1217	i->count = count;
1218	i->iov_offset = 0;
1219}
1220EXPORT_SYMBOL(iov_iter_discard);
1221
1222unsigned long iov_iter_alignment(const struct iov_iter *i)
1223{
1224	unsigned long res = 0;
1225	size_t size = i->count;
1226
1227	if (unlikely(iov_iter_is_pipe(i))) {
1228		unsigned int p_mask = i->pipe->ring_size - 1;
1229
1230		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1231			return size | i->iov_offset;
1232		return size;
1233	}
1234	iterate_all_kinds(i, size, v,
1235		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
1236		res |= v.bv_offset | v.bv_len,
1237		res |= (unsigned long)v.iov_base | v.iov_len
1238	)
1239	return res;
1240}
1241EXPORT_SYMBOL(iov_iter_alignment);
1242
1243unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1244{
1245	unsigned long res = 0;
1246	size_t size = i->count;
1247
1248	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1249		WARN_ON(1);
1250		return ~0U;
1251	}
1252
1253	iterate_all_kinds(i, size, v,
1254		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
1255			(size != v.iov_len ? size : 0), 0),
1256		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
1257			(size != v.bv_len ? size : 0)),
1258		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
1259			(size != v.iov_len ? size : 0))
1260		);
1261	return res;
1262}
1263EXPORT_SYMBOL(iov_iter_gap_alignment);
1264
1265static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1266				size_t maxsize,
1267				struct page **pages,
1268				int iter_head,
1269				size_t *start)
1270{
1271	struct pipe_inode_info *pipe = i->pipe;
1272	unsigned int p_mask = pipe->ring_size - 1;
1273	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1274	if (!n)
1275		return -EFAULT;
1276
1277	maxsize = n;
1278	n += *start;
1279	while (n > 0) {
1280		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1281		iter_head++;
1282		n -= PAGE_SIZE;
1283	}
1284
1285	return maxsize;
1286}
1287
1288static ssize_t pipe_get_pages(struct iov_iter *i,
1289		   struct page **pages, size_t maxsize, unsigned maxpages,
1290		   size_t *start)
1291{
1292	unsigned int iter_head, npages;
1293	size_t capacity;
1294
1295	if (!maxsize)
1296		return 0;
1297
1298	if (!sanity(i))
1299		return -EFAULT;
1300
1301	data_start(i, &iter_head, start);
1302	/* Amount of free space: some of this one + all after this one */
1303	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1304	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1305
1306	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1307}
1308
1309ssize_t iov_iter_get_pages(struct iov_iter *i,
1310		   struct page **pages, size_t maxsize, unsigned maxpages,
1311		   size_t *start)
1312{
1313	if (maxsize > i->count)
1314		maxsize = i->count;
1315
1316	if (unlikely(iov_iter_is_pipe(i)))
1317		return pipe_get_pages(i, pages, maxsize, maxpages, start);
1318	if (unlikely(iov_iter_is_discard(i)))
1319		return -EFAULT;
1320
1321	iterate_all_kinds(i, maxsize, v, ({
1322		unsigned long addr = (unsigned long)v.iov_base;
1323		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
1324		int n;
1325		int res;
1326
1327		if (len > maxpages * PAGE_SIZE)
1328			len = maxpages * PAGE_SIZE;
1329		addr &= ~(PAGE_SIZE - 1);
1330		n = DIV_ROUND_UP(len, PAGE_SIZE);
1331		res = get_user_pages_fast(addr, n,
1332				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1333				pages);
1334		if (unlikely(res <= 0))
1335			return res;
1336		return (res == n ? len : res * PAGE_SIZE) - *start;
1337	0;}),({
1338		/* can't be more than PAGE_SIZE */
1339		*start = v.bv_offset;
1340		get_page(*pages = v.bv_page);
1341		return v.bv_len;
1342	}),({
1343		return -EFAULT;
1344	})
1345	)
1346	return 0;
1347}
1348EXPORT_SYMBOL(iov_iter_get_pages);
1349
1350static struct page **get_pages_array(size_t n)
1351{
1352	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1353}
1354
1355static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1356		   struct page ***pages, size_t maxsize,
1357		   size_t *start)
1358{
1359	struct page **p;
1360	unsigned int iter_head, npages;
1361	ssize_t n;
1362
1363	if (!maxsize)
1364		return 0;
1365
1366	if (!sanity(i))
1367		return -EFAULT;
1368
1369	data_start(i, &iter_head, start);
1370	/* Amount of free space: some of this one + all after this one */
1371	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1372	n = npages * PAGE_SIZE - *start;
1373	if (maxsize > n)
1374		maxsize = n;
1375	else
1376		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1377	p = get_pages_array(npages);
1378	if (!p)
1379		return -ENOMEM;
1380	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1381	if (n > 0)
1382		*pages = p;
1383	else
1384		kvfree(p);
1385	return n;
1386}
1387
1388ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1389		   struct page ***pages, size_t maxsize,
1390		   size_t *start)
1391{
1392	struct page **p;
1393
1394	if (maxsize > i->count)
1395		maxsize = i->count;
1396
1397	if (unlikely(iov_iter_is_pipe(i)))
1398		return pipe_get_pages_alloc(i, pages, maxsize, start);
1399	if (unlikely(iov_iter_is_discard(i)))
1400		return -EFAULT;
1401
1402	iterate_all_kinds(i, maxsize, v, ({
1403		unsigned long addr = (unsigned long)v.iov_base;
1404		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
1405		int n;
1406		int res;
1407
1408		addr &= ~(PAGE_SIZE - 1);
1409		n = DIV_ROUND_UP(len, PAGE_SIZE);
1410		p = get_pages_array(n);
1411		if (!p)
1412			return -ENOMEM;
1413		res = get_user_pages_fast(addr, n,
1414				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1415		if (unlikely(res <= 0)) {
1416			kvfree(p);
1417			*pages = NULL;
1418			return res;
1419		}
1420		*pages = p;
1421		return (res == n ? len : res * PAGE_SIZE) - *start;
1422	0;}),({
1423		/* can't be more than PAGE_SIZE */
1424		*start = v.bv_offset;
1425		*pages = p = get_pages_array(1);
1426		if (!p)
1427			return -ENOMEM;
1428		get_page(*p = v.bv_page);
1429		return v.bv_len;
1430	}),({
1431		return -EFAULT;
1432	})
1433	)
1434	return 0;
1435}
1436EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1437
1438size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1439			       struct iov_iter *i)
1440{
1441	char *to = addr;
1442	__wsum sum, next;
1443	size_t off = 0;
1444	sum = *csum;
1445	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1446		WARN_ON(1);
1447		return 0;
1448	}
1449	iterate_and_advance(i, bytes, v, ({
1450		next = csum_and_copy_from_user(v.iov_base,
1451					       (to += v.iov_len) - v.iov_len,
1452					       v.iov_len);
1453		if (next) {
1454			sum = csum_block_add(sum, next, off);
1455			off += v.iov_len;
1456		}
1457		next ? 0 : v.iov_len;
1458	}), ({
1459		char *p = kmap_atomic(v.bv_page);
1460		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1461				      p + v.bv_offset, v.bv_len,
1462				      sum, off);
1463		kunmap_atomic(p);
1464		off += v.bv_len;
1465	}),({
1466		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1467				      v.iov_base, v.iov_len,
1468				      sum, off);
1469		off += v.iov_len;
1470	})
1471	)
1472	*csum = sum;
1473	return bytes;
1474}
1475EXPORT_SYMBOL(csum_and_copy_from_iter);
1476
1477bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
1478			       struct iov_iter *i)
1479{
1480	char *to = addr;
1481	__wsum sum, next;
1482	size_t off = 0;
1483	sum = *csum;
1484	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1485		WARN_ON(1);
1486		return false;
1487	}
1488	if (unlikely(i->count < bytes))
1489		return false;
1490	iterate_all_kinds(i, bytes, v, ({
1491		next = csum_and_copy_from_user(v.iov_base,
1492					       (to += v.iov_len) - v.iov_len,
1493					       v.iov_len);
1494		if (!next)
1495			return false;
1496		sum = csum_block_add(sum, next, off);
1497		off += v.iov_len;
1498		0;
1499	}), ({
1500		char *p = kmap_atomic(v.bv_page);
1501		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1502				      p + v.bv_offset, v.bv_len,
1503				      sum, off);
1504		kunmap_atomic(p);
1505		off += v.bv_len;
1506	}),({
1507		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1508				      v.iov_base, v.iov_len,
1509				      sum, off);
1510		off += v.iov_len;
1511	})
1512	)
1513	*csum = sum;
1514	iov_iter_advance(i, bytes);
1515	return true;
1516}
1517EXPORT_SYMBOL(csum_and_copy_from_iter_full);
1518
1519size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1520			     struct iov_iter *i)
1521{
1522	struct csum_state *csstate = _csstate;
1523	const char *from = addr;
1524	__wsum sum, next;
1525	size_t off;
1526
1527	if (unlikely(iov_iter_is_pipe(i)))
1528		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1529
1530	sum = csstate->csum;
1531	off = csstate->off;
1532	if (unlikely(iov_iter_is_discard(i))) {
1533		WARN_ON(1);	/* for now */
1534		return 0;
1535	}
1536	iterate_and_advance(i, bytes, v, ({
1537		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
1538					     v.iov_base,
1539					     v.iov_len);
1540		if (next) {
1541			sum = csum_block_add(sum, next, off);
1542			off += v.iov_len;
1543		}
1544		next ? 0 : v.iov_len;
1545	}), ({
1546		char *p = kmap_atomic(v.bv_page);
1547		sum = csum_and_memcpy(p + v.bv_offset,
1548				      (from += v.bv_len) - v.bv_len,
1549				      v.bv_len, sum, off);
1550		kunmap_atomic(p);
1551		off += v.bv_len;
1552	}),({
1553		sum = csum_and_memcpy(v.iov_base,
1554				     (from += v.iov_len) - v.iov_len,
1555				     v.iov_len, sum, off);
1556		off += v.iov_len;
1557	})
1558	)
1559	csstate->csum = sum;
1560	csstate->off = off;
1561	return bytes;
1562}
1563EXPORT_SYMBOL(csum_and_copy_to_iter);
1564
1565size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1566		struct iov_iter *i)
1567{
1568#ifdef CONFIG_CRYPTO_HASH
1569	struct ahash_request *hash = hashp;
1570	struct scatterlist sg;
1571	size_t copied;
1572
1573	copied = copy_to_iter(addr, bytes, i);
1574	sg_init_one(&sg, addr, copied);
1575	ahash_request_set_crypt(hash, &sg, NULL, copied);
1576	crypto_ahash_update(hash);
1577	return copied;
1578#else
1579	return 0;
1580#endif
1581}
1582EXPORT_SYMBOL(hash_and_copy_to_iter);
1583
1584int iov_iter_npages(const struct iov_iter *i, int maxpages)
1585{
1586	size_t size = i->count;
1587	int npages = 0;
1588
1589	if (!size)
1590		return 0;
1591	if (unlikely(iov_iter_is_discard(i)))
1592		return 0;
1593
1594	if (unlikely(iov_iter_is_pipe(i))) {
1595		struct pipe_inode_info *pipe = i->pipe;
1596		unsigned int iter_head;
1597		size_t off;
1598
1599		if (!sanity(i))
1600			return 0;
1601
1602		data_start(i, &iter_head, &off);
1603		/* some of this one + all after this one */
1604		npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
1605		if (npages >= maxpages)
1606			return maxpages;
1607	} else iterate_all_kinds(i, size, v, ({
1608		unsigned long p = (unsigned long)v.iov_base;
1609		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
1610			- p / PAGE_SIZE;
1611		if (npages >= maxpages)
1612			return maxpages;
1613	0;}),({
1614		npages++;
1615		if (npages >= maxpages)
1616			return maxpages;
1617	}),({
1618		unsigned long p = (unsigned long)v.iov_base;
1619		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
1620			- p / PAGE_SIZE;
1621		if (npages >= maxpages)
1622			return maxpages;
1623	})
1624	)
1625	return npages;
1626}
1627EXPORT_SYMBOL(iov_iter_npages);
1628
1629const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1630{
1631	*new = *old;
1632	if (unlikely(iov_iter_is_pipe(new))) {
1633		WARN_ON(1);
1634		return NULL;
1635	}
1636	if (unlikely(iov_iter_is_discard(new)))
1637		return NULL;
1638	if (iov_iter_is_bvec(new))
1639		return new->bvec = kmemdup(new->bvec,
1640				    new->nr_segs * sizeof(struct bio_vec),
1641				    flags);
1642	else
1643		/* iovec and kvec have identical layout */
1644		return new->iov = kmemdup(new->iov,
1645				   new->nr_segs * sizeof(struct iovec),
1646				   flags);
1647}
1648EXPORT_SYMBOL(dup_iter);
1649
1650static int copy_compat_iovec_from_user(struct iovec *iov,
1651		const struct iovec __user *uvec, unsigned long nr_segs)
1652{
1653	const struct compat_iovec __user *uiov =
1654		(const struct compat_iovec __user *)uvec;
1655	int ret = -EFAULT, i;
1656
1657	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1658		return -EFAULT;
1659
1660	for (i = 0; i < nr_segs; i++) {
1661		compat_uptr_t buf;
1662		compat_ssize_t len;
1663
1664		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1665		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1666
1667		/* check for compat_size_t not fitting in compat_ssize_t .. */
1668		if (len < 0) {
1669			ret = -EINVAL;
1670			goto uaccess_end;
1671		}
1672		iov[i].iov_base = compat_ptr(buf);
1673		iov[i].iov_len = len;
1674	}
1675
1676	ret = 0;
1677uaccess_end:
1678	user_access_end();
1679	return ret;
1680}
1681
1682static int copy_iovec_from_user(struct iovec *iov,
1683		const struct iovec __user *uvec, unsigned long nr_segs)
1684{
1685	unsigned long seg;
1686
1687	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1688		return -EFAULT;
1689	for (seg = 0; seg < nr_segs; seg++) {
1690		if ((ssize_t)iov[seg].iov_len < 0)
1691			return -EINVAL;
1692	}
1693
1694	return 0;
1695}
1696
1697struct iovec *iovec_from_user(const struct iovec __user *uvec,
1698		unsigned long nr_segs, unsigned long fast_segs,
1699		struct iovec *fast_iov, bool compat)
1700{
1701	struct iovec *iov = fast_iov;
1702	int ret;
1703
1704	/*
1705	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1706	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
1707	 * traditionally returned zero for zero segments, so...
1708	 */
1709	if (nr_segs == 0)
1710		return iov;
1711	if (nr_segs > UIO_MAXIOV)
1712		return ERR_PTR(-EINVAL);
1713	if (nr_segs > fast_segs) {
1714		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1715		if (!iov)
1716			return ERR_PTR(-ENOMEM);
1717	}
1718
1719	if (compat)
1720		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1721	else
1722		ret = copy_iovec_from_user(iov, uvec, nr_segs);
1723	if (ret) {
1724		if (iov != fast_iov)
1725			kfree(iov);
1726		return ERR_PTR(ret);
1727	}
1728
1729	return iov;
1730}
1731
1732ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1733		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1734		 struct iov_iter *i, bool compat)
1735{
1736	ssize_t total_len = 0;
1737	unsigned long seg;
1738	struct iovec *iov;
1739
1740	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1741	if (IS_ERR(iov)) {
1742		*iovp = NULL;
1743		return PTR_ERR(iov);
1744	}
1745
1746	/*
1747	 * According to the Single Unix Specification we should return EINVAL if
1748	 * an element length is < 0 when cast to ssize_t or if the total length
1749	 * would overflow the ssize_t return value of the system call.
1750	 *
1751	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1752	 * overflow case.
1753	 */
1754	for (seg = 0; seg < nr_segs; seg++) {
1755		ssize_t len = (ssize_t)iov[seg].iov_len;
1756
1757		if (!access_ok(iov[seg].iov_base, len)) {
1758			if (iov != *iovp)
1759				kfree(iov);
1760			*iovp = NULL;
1761			return -EFAULT;
1762		}
1763
1764		if (len > MAX_RW_COUNT - total_len) {
1765			len = MAX_RW_COUNT - total_len;
1766			iov[seg].iov_len = len;
1767		}
1768		total_len += len;
1769	}
1770
1771	iov_iter_init(i, type, iov, nr_segs, total_len);
1772	if (iov == *iovp)
1773		*iovp = NULL;
1774	else
1775		*iovp = iov;
1776	return total_len;
1777}
1778
1779/**
1780 * import_iovec() - Copy an array of &struct iovec from userspace
1781 *     into the kernel, check that it is valid, and initialize a new
1782 *     &struct iov_iter iterator to access it.
1783 *
1784 * @type: One of %READ or %WRITE.
1785 * @uvec: Pointer to the userspace array.
1786 * @nr_segs: Number of elements in userspace array.
1787 * @fast_segs: Number of elements in @iov.
1788 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1789 *     on-stack) kernel array.
1790 * @i: Pointer to iterator that will be initialized on success.
1791 *
1792 * If the array pointed to by *@iov is large enough to hold all @nr_segs,
1793 * then this function places %NULL in *@iov on return. Otherwise, a new
1794 * array will be allocated and the result placed in *@iov. This means that
1795 * the caller may call kfree() on *@iov regardless of whether the small
1796 * on-stack array was used or not (and regardless of whether this function
1797 * returns an error or not).
1798 *
1799 * Return: Negative error code on error, bytes imported on success
1800 */
1801ssize_t import_iovec(int type, const struct iovec __user *uvec,
1802		 unsigned nr_segs, unsigned fast_segs,
1803		 struct iovec **iovp, struct iov_iter *i)
1804{
1805	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1806			      in_compat_syscall());
1807}
1808EXPORT_SYMBOL(import_iovec);
1809
1810int import_single_range(int rw, void __user *buf, size_t len,
1811		 struct iovec *iov, struct iov_iter *i)
1812{
1813	if (len > MAX_RW_COUNT)
1814		len = MAX_RW_COUNT;
1815	if (unlikely(!access_ok(buf, len)))
1816		return -EFAULT;
1817
1818	iov->iov_base = buf;
1819	iov->iov_len = len;
1820	iov_iter_init(i, rw, iov, 1, len);
1821	return 0;
1822}
1823EXPORT_SYMBOL(import_single_range);
1824
1825/**
1826 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1827 *     iov_iter_save_state() was called.
1828 *
1829 * @i: &struct iov_iter to restore
1830 * @state: state to restore from
1831 *
1832 * Used after iov_iter_save_state() to bring restore @i, if operations may
1833 * have advanced it.
1834 *
1835 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
1836 */
1837void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1838{
1839	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
1840			 !iov_iter_is_kvec(i))
1841		return;
1842	i->iov_offset = state->iov_offset;
1843	i->count = state->count;
1844	/*
1845	 * For the *vec iters, nr_segs + iov is constant - if we increment
1846	 * the vec, then we also decrement the nr_segs count. Hence we don't
1847	 * need to track both of these, just one is enough and we can deduct
1848	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
1849	 * size, so we can just increment the iov pointer as they are unionzed.
1850	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
1851	 * not. Be safe and handle it separately.
1852	 */
1853	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
1854	if (iov_iter_is_bvec(i))
1855		i->bvec -= state->nr_segs - i->nr_segs;
1856	else
1857		i->iov -= state->nr_segs - i->nr_segs;
1858	i->nr_segs = state->nr_segs;
1859}
1860