1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
17 #define PIPE_PARANOIA /* for now */
19 /* covers iovec and kvec alike */
20 #define iterate_iovec(i, n, base, len, off, __p, skip, STEP) { \
23 len = min(n, __p->iov_len - skip); \
25 base = __p->iov_base + skip; \
30 if (skip < __p->iov_len) \
39 #define iterate_bvec(i, n, base, len, off, p, skip, STEP) { \
42 unsigned offset = p->bv_offset + skip; \
44 void *kaddr = kmap_local_page(p->bv_page + \
45 offset / PAGE_SIZE); \
46 base = kaddr + offset % PAGE_SIZE; \
47 len = min(min(n, p->bv_len - skip), \
48 (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
50 kunmap_local(kaddr); \
54 if (skip == p->bv_len) { \
65 #define iterate_xarray(i, n, base, len, __off, skip, STEP) { \
68 struct page *head = NULL; \
70 loff_t start = i->xarray_start + skip; \
71 pgoff_t index = start >> PAGE_SHIFT; \
74 XA_STATE(xas, i->xarray, index); \
77 xas_for_each(&xas, head, ULONG_MAX) { \
79 if (xas_retry(&xas, head)) \
81 if (WARN_ON(xa_is_value(head))) \
83 if (WARN_ON(PageHuge(head))) \
85 for (j = (head->index < index) ? index - head->index : 0; \
86 j < thp_nr_pages(head); j++) { \
87 void *kaddr = kmap_local_page(head + j); \
88 offset = (start + __off) % PAGE_SIZE; \
89 base = kaddr + offset; \
90 len = PAGE_SIZE - offset; \
93 kunmap_local(kaddr); \
107 #define __iterate_and_advance(i, n, base, len, off, I, K) { \
108 if (unlikely(i->count < n)) \
111 size_t skip = i->iov_offset; \
112 if (likely(iter_is_iovec(i))) { \
113 const struct iovec *iov = i->iov; \
116 iterate_iovec(i, n, base, len, off, \
118 i->nr_segs -= iov - i->iov; \
120 } else if (iov_iter_is_bvec(i)) { \
121 const struct bio_vec *bvec = i->bvec; \
124 iterate_bvec(i, n, base, len, off, \
126 i->nr_segs -= bvec - i->bvec; \
128 } else if (iov_iter_is_kvec(i)) { \
129 const struct kvec *kvec = i->kvec; \
132 iterate_iovec(i, n, base, len, off, \
134 i->nr_segs -= kvec - i->kvec; \
136 } else if (iov_iter_is_xarray(i)) { \
139 iterate_xarray(i, n, base, len, off, \
143 i->iov_offset = skip; \
146 #define iterate_and_advance(i, n, base, len, off, I, K) \
147 __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
149 static int copyout(void __user *to, const void *from, size_t n)
151 if (should_fail_usercopy())
153 if (access_ok(to, n)) {
154 instrument_copy_to_user(to, from, n);
155 n = raw_copy_to_user(to, from, n);
160 static int copyin(void *to, const void __user *from, size_t n)
162 if (should_fail_usercopy())
164 if (access_ok(from, n)) {
165 instrument_copy_from_user(to, from, n);
166 n = raw_copy_from_user(to, from, n);
171 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
174 size_t skip, copy, left, wanted;
175 const struct iovec *iov;
179 if (unlikely(bytes > i->count))
182 if (unlikely(!bytes))
188 skip = i->iov_offset;
189 buf = iov->iov_base + skip;
190 copy = min(bytes, iov->iov_len - skip);
192 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
193 kaddr = kmap_atomic(page);
194 from = kaddr + offset;
196 /* first chunk, usually the only one */
197 left = copyout(buf, from, copy);
203 while (unlikely(!left && bytes)) {
206 copy = min(bytes, iov->iov_len);
207 left = copyout(buf, from, copy);
213 if (likely(!bytes)) {
214 kunmap_atomic(kaddr);
217 offset = from - kaddr;
219 kunmap_atomic(kaddr);
220 copy = min(bytes, iov->iov_len - skip);
222 /* Too bad - revert to non-atomic kmap */
225 from = kaddr + offset;
226 left = copyout(buf, from, copy);
231 while (unlikely(!left && bytes)) {
234 copy = min(bytes, iov->iov_len);
235 left = copyout(buf, from, copy);
244 if (skip == iov->iov_len) {
248 i->count -= wanted - bytes;
249 i->nr_segs -= iov - i->iov;
251 i->iov_offset = skip;
252 return wanted - bytes;
255 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
258 size_t skip, copy, left, wanted;
259 const struct iovec *iov;
263 if (unlikely(bytes > i->count))
266 if (unlikely(!bytes))
272 skip = i->iov_offset;
273 buf = iov->iov_base + skip;
274 copy = min(bytes, iov->iov_len - skip);
276 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
277 kaddr = kmap_atomic(page);
280 /* first chunk, usually the only one */
281 left = copyin(to, buf, copy);
287 while (unlikely(!left && bytes)) {
290 copy = min(bytes, iov->iov_len);
291 left = copyin(to, buf, copy);
297 if (likely(!bytes)) {
298 kunmap_atomic(kaddr);
303 kunmap_atomic(kaddr);
304 copy = min(bytes, iov->iov_len - skip);
306 /* Too bad - revert to non-atomic kmap */
310 left = copyin(to, buf, copy);
315 while (unlikely(!left && bytes)) {
318 copy = min(bytes, iov->iov_len);
319 left = copyin(to, buf, copy);
328 if (skip == iov->iov_len) {
332 i->count -= wanted - bytes;
333 i->nr_segs -= iov - i->iov;
335 i->iov_offset = skip;
336 return wanted - bytes;
340 static bool sanity(const struct iov_iter *i)
342 struct pipe_inode_info *pipe = i->pipe;
343 unsigned int p_head = pipe->head;
344 unsigned int p_tail = pipe->tail;
345 unsigned int p_mask = pipe->ring_size - 1;
346 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
347 unsigned int i_head = i->head;
351 struct pipe_buffer *p;
352 if (unlikely(p_occupancy == 0))
353 goto Bad; // pipe must be non-empty
354 if (unlikely(i_head != p_head - 1))
355 goto Bad; // must be at the last buffer...
357 p = &pipe->bufs[i_head & p_mask];
358 if (unlikely(p->offset + p->len != i->iov_offset))
359 goto Bad; // ... at the end of segment
361 if (i_head != p_head)
362 goto Bad; // must be right after the last buffer
366 printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
367 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
368 p_head, p_tail, pipe->ring_size);
369 for (idx = 0; idx < pipe->ring_size; idx++)
370 printk(KERN_ERR "[%p %p %d %d]\n",
372 pipe->bufs[idx].page,
373 pipe->bufs[idx].offset,
374 pipe->bufs[idx].len);
379 #define sanity(i) true
382 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
385 struct pipe_inode_info *pipe = i->pipe;
386 struct pipe_buffer *buf;
387 unsigned int p_tail = pipe->tail;
388 unsigned int p_mask = pipe->ring_size - 1;
389 unsigned int i_head = i->head;
392 if (unlikely(bytes > i->count))
395 if (unlikely(!bytes))
402 buf = &pipe->bufs[i_head & p_mask];
404 if (offset == off && buf->page == page) {
405 /* merge with the last one */
407 i->iov_offset += bytes;
411 buf = &pipe->bufs[i_head & p_mask];
413 if (pipe_full(i_head, p_tail, pipe->max_usage))
416 buf->ops = &page_cache_pipe_buf_ops;
419 buf->offset = offset;
422 pipe->head = i_head + 1;
423 i->iov_offset = offset + bytes;
431 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
432 * @bytes. For each iovec, fault in each page that constitutes the iovec.
434 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
435 * because it is an invalid address).
437 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
439 if (iter_is_iovec(i)) {
440 const struct iovec *p;
443 if (bytes > i->count)
445 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
446 size_t len = min(bytes, p->iov_len - skip);
451 err = fault_in_pages_readable(p->iov_base + skip, len);
459 EXPORT_SYMBOL(iov_iter_fault_in_readable);
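/*
 * Illustrative usage sketch (not part of this file): the usual pairing of
 * iov_iter_fault_in_readable() with an atomic copy in a buffered-write style
 * loop.  The user pages are faulted in while no locks are held, then the copy
 * runs with page faults disabled; a short copy means the caller should shrink
 * the chunk and retry.  my_write_begin()/my_write_end() are hypothetical
 * stand-ins for a filesystem's own helpers.
 *
 *	if (unlikely(iov_iter_fault_in_readable(i, bytes)))
 *		return -EFAULT;
 *	my_write_begin(page);
 *	copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 *	my_write_end(page, copied);
 *	// copied < bytes here means: shrink the chunk and try again
 */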
461 void iov_iter_init(struct iov_iter *i, unsigned int direction,
462 const struct iovec *iov, unsigned long nr_segs,
465 WARN_ON(direction & ~(READ | WRITE));
466 WARN_ON_ONCE(uaccess_kernel());
467 *i = (struct iov_iter) {
468 .iter_type = ITER_IOVEC,
469 .data_source = direction,
476 EXPORT_SYMBOL(iov_iter_init);
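/*
 * Illustrative usage sketch (not part of this file): wrapping a single user
 * buffer in an ITER_IOVEC iterator for a read-style operation, where the
 * iterator is the destination of the copy.  ubuf and len are hypothetical;
 * real callers normally go through import_single_range()/import_iovec(),
 * which also perform the access_ok() checks.
 *
 *	struct iovec iov = { .iov_base = ubuf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_init(&iter, READ, &iov, 1, len);
 *	// copy_to_iter()/copy_page_to_iter() can now fill the user buffer
 */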
478 static inline bool allocated(struct pipe_buffer *buf)
480 return buf->ops == &default_pipe_buf_ops;
483 static inline void data_start(const struct iov_iter *i,
484 unsigned int *iter_headp, size_t *offp)
486 unsigned int p_mask = i->pipe->ring_size - 1;
487 unsigned int iter_head = i->head;
488 size_t off = i->iov_offset;
490 if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
495 *iter_headp = iter_head;
499 static size_t push_pipe(struct iov_iter *i, size_t size,
500 int *iter_headp, size_t *offp)
502 struct pipe_inode_info *pipe = i->pipe;
503 unsigned int p_tail = pipe->tail;
504 unsigned int p_mask = pipe->ring_size - 1;
505 unsigned int iter_head;
509 if (unlikely(size > i->count))
515 data_start(i, &iter_head, &off);
516 *iter_headp = iter_head;
519 left -= PAGE_SIZE - off;
521 pipe->bufs[iter_head & p_mask].len += size;
524 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
527 while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
528 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
529 struct page *page = alloc_page(GFP_USER);
533 buf->ops = &default_pipe_buf_ops;
536 buf->len = min_t(ssize_t, left, PAGE_SIZE);
539 pipe->head = iter_head;
547 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
550 struct pipe_inode_info *pipe = i->pipe;
551 unsigned int p_mask = pipe->ring_size - 1;
558 bytes = n = push_pipe(i, bytes, &i_head, &off);
562 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
563 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
565 i->iov_offset = off + chunk;
575 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
576 __wsum sum, size_t off)
578 __wsum next = csum_partial_copy_nocheck(from, to, len);
579 return csum_block_add(sum, next, off);
582 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
583 struct csum_state *csstate,
586 struct pipe_inode_info *pipe = i->pipe;
587 unsigned int p_mask = pipe->ring_size - 1;
588 __wsum sum = csstate->csum;
589 size_t off = csstate->off;
596 bytes = n = push_pipe(i, bytes, &i_head, &r);
600 size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
601 char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
602 sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
605 i->iov_offset = r + chunk;
618 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
620 if (unlikely(iov_iter_is_pipe(i)))
621 return copy_pipe_to_iter(addr, bytes, i);
622 if (iter_is_iovec(i))
624 iterate_and_advance(i, bytes, base, len, off,
625 copyout(base, addr + off, len),
626 memcpy(base, addr + off, len)
631 EXPORT_SYMBOL(_copy_to_iter);
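/*
 * Illustrative usage sketch (not part of this file): a minimal ->read_iter()
 * style consumer of copy_to_iter().  struct my_dev and its fields are
 * hypothetical.  A short return with bytes still left in the iterator means
 * the user buffer faulted.
 *
 *	static ssize_t my_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		struct my_dev *dev = iocb->ki_filp->private_data;
 *		size_t n = min(iov_iter_count(to), (size_t)dev->buf_len);
 *
 *		n = copy_to_iter(dev->buf, n, to);
 *		if (!n && iov_iter_count(to))
 *			return -EFAULT;
 *		return n;
 *	}
 */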
633 #ifdef CONFIG_ARCH_HAS_COPY_MC
634 static int copyout_mc(void __user *to, const void *from, size_t n)
636 if (access_ok(to, n)) {
637 instrument_copy_to_user(to, from, n);
638 n = copy_mc_to_user((__force void *) to, from, n);
643 static unsigned long copy_mc_to_page(struct page *page, size_t offset,
644 const char *from, size_t len)
649 to = kmap_atomic(page);
650 ret = copy_mc_to_kernel(to + offset, from, len);
656 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
659 struct pipe_inode_info *pipe = i->pipe;
660 unsigned int p_mask = pipe->ring_size - 1;
662 size_t n, off, xfer = 0;
667 bytes = n = push_pipe(i, bytes, &i_head, &off);
671 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
674 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
677 i->iov_offset = off + chunk - rem;
691 * _copy_mc_to_iter - copy to iter with source memory error exception handling
692 * @addr: source kernel address
693 * @bytes: total transfer length
694 * @iter: destination iterator
696 * The pmem driver deploys this for the dax operation
697 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
698 * block-layer). Upon a machine check (#MC), read(2) aborts and returns
699 * -EIO or the number of bytes successfully copied.
701 * The main differences between this and typical _copy_to_iter() are:
703 * * Typical tail/residue handling after a fault retries the copy
704 * byte-by-byte until the fault happens again. Re-triggering machine
705 * checks is potentially fatal so the implementation uses source
706 * alignment and poison alignment assumptions to avoid re-triggering
707 * hardware exceptions.
709 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
710 * Compare to copy_to_iter() where only ITER_IOVEC attempts might return
713 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
715 if (unlikely(iov_iter_is_pipe(i)))
716 return copy_mc_pipe_to_iter(addr, bytes, i);
717 if (iter_is_iovec(i))
719 __iterate_and_advance(i, bytes, base, len, off,
720 copyout_mc(base, addr + off, len),
721 copy_mc_to_kernel(base, addr + off, len)
726 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
727 #endif /* CONFIG_ARCH_HAS_COPY_MC */
729 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
731 if (unlikely(iov_iter_is_pipe(i))) {
735 if (iter_is_iovec(i))
737 iterate_and_advance(i, bytes, base, len, off,
738 copyin(addr + off, base, len),
739 memcpy(addr + off, base, len)
744 EXPORT_SYMBOL(_copy_from_iter);
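/*
 * Illustrative usage sketch (not part of this file): the write-side
 * counterpart, pulling data out of a source iterator into a kernel buffer.
 * my_write_iter() and struct my_dev are hypothetical.  copy_from_iter()
 * advances the iterator by the number of bytes actually copied.
 *
 *	static ssize_t my_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		struct my_dev *dev = iocb->ki_filp->private_data;
 *		size_t n = min(iov_iter_count(from), (size_t)dev->buf_len);
 *
 *		n = copy_from_iter(dev->buf, n, from);
 *		if (!n && iov_iter_count(from))
 *			return -EFAULT;
 *		return n;
 *	}
 */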
746 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
748 if (unlikely(iov_iter_is_pipe(i))) {
752 iterate_and_advance(i, bytes, base, len, off,
753 __copy_from_user_inatomic_nocache(addr + off, base, len),
754 memcpy(addr + off, base, len)
759 EXPORT_SYMBOL(_copy_from_iter_nocache);
761 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
763 * _copy_from_iter_flushcache - write destination through cpu cache
764 * @addr: destination kernel address
765 * @bytes: total transfer length
766 * @iter: source iterator
768 * The pmem driver arranges for filesystem-dax to use this facility via
769 * dax_copy_from_iter() for ensuring that writes to persistent memory
770 * are flushed through the CPU cache. It is differentiated from
771 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
772 * all iterator types. _copy_from_iter_nocache() only attempts to
773 * bypass the cache for the ITER_IOVEC case, and on some archs may use
774 * instructions that strand dirty-data in the cache.
776 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
778 if (unlikely(iov_iter_is_pipe(i))) {
782 iterate_and_advance(i, bytes, base, len, off,
783 __copy_from_user_flushcache(addr + off, base, len),
784 memcpy_flushcache(addr + off, base, len)
789 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
792 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
795 size_t v = n + offset;
798 * The general case needs to access the page order in order
799 * to compute the page size.
800 * However, we mostly deal with order-0 pages and thus can
801 * avoid a possible cache line miss for requests that fit all
804 if (n <= v && v <= PAGE_SIZE)
807 head = compound_head(page);
808 v += (page - head) << PAGE_SHIFT;
810 if (likely(n <= v && v <= (page_size(head))))
816 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
819 if (likely(iter_is_iovec(i)))
820 return copy_page_to_iter_iovec(page, offset, bytes, i);
821 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
822 void *kaddr = kmap_atomic(page);
823 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
824 kunmap_atomic(kaddr);
827 if (iov_iter_is_pipe(i))
828 return copy_page_to_iter_pipe(page, offset, bytes, i);
829 if (unlikely(iov_iter_is_discard(i))) {
830 if (unlikely(i->count < bytes))
839 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
843 if (unlikely(!page_copy_sane(page, offset, bytes)))
845 page += offset / PAGE_SIZE; // first subpage
848 size_t n = __copy_page_to_iter(page, offset,
849 min(bytes, (size_t)PAGE_SIZE - offset), i);
855 if (offset == PAGE_SIZE) {
862 EXPORT_SYMBOL(copy_page_to_iter);
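/*
 * Illustrative usage sketch (not part of this file): a buffered-read style
 * loop feeding page cache pages into the destination iterator, in the spirit
 * of filemap_read().  find_my_page() is a hypothetical lookup helper and pos
 * is a hypothetical loff_t cursor.
 *
 *	while (iov_iter_count(to)) {
 *		struct page *page = find_my_page(mapping, pos >> PAGE_SHIFT);
 *		size_t off = pos & ~PAGE_MASK;
 *		size_t n = min_t(size_t, PAGE_SIZE - off, iov_iter_count(to));
 *
 *		n = copy_page_to_iter(page, off, n, to);
 *		pos += n;
 *		if (!n)
 *			break;		// user buffer faulted
 *	}
 */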
864 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
867 if (unlikely(!page_copy_sane(page, offset, bytes)))
869 if (likely(iter_is_iovec(i)))
870 return copy_page_from_iter_iovec(page, offset, bytes, i);
871 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
872 void *kaddr = kmap_atomic(page);
873 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
874 kunmap_atomic(kaddr);
880 EXPORT_SYMBOL(copy_page_from_iter);
882 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
884 struct pipe_inode_info *pipe = i->pipe;
885 unsigned int p_mask = pipe->ring_size - 1;
892 bytes = n = push_pipe(i, bytes, &i_head, &off);
897 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
898 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
900 i->iov_offset = off + chunk;
909 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
911 if (unlikely(iov_iter_is_pipe(i)))
912 return pipe_zero(bytes, i);
913 iterate_and_advance(i, bytes, base, len, count,
914 clear_user(base, len),
920 EXPORT_SYMBOL(iov_iter_zero);
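/*
 * Illustrative usage sketch (not part of this file): zero-filling the part
 * of a read that lands in a hole, so the iterator still advances as if real
 * data had been copied.  hole_bytes is a hypothetical size_t computed by the
 * caller.
 *
 *	size_t n = iov_iter_zero(min(hole_bytes, iov_iter_count(to)), to);
 *
 *	if (n < hole_bytes && iov_iter_count(to))
 *		return -EFAULT;		// clear_user() faulted
 */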
922 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
925 char *kaddr = kmap_atomic(page), *p = kaddr + offset;
926 if (unlikely(!page_copy_sane(page, offset, bytes))) {
927 kunmap_atomic(kaddr);
930 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
931 kunmap_atomic(kaddr);
935 iterate_and_advance(i, bytes, base, len, off,
936 copyin(p + off, base, len),
937 memcpy(p + off, base, len)
939 kunmap_atomic(kaddr);
942 EXPORT_SYMBOL(copy_page_from_iter_atomic);
944 static inline void pipe_truncate(struct iov_iter *i)
946 struct pipe_inode_info *pipe = i->pipe;
947 unsigned int p_tail = pipe->tail;
948 unsigned int p_head = pipe->head;
949 unsigned int p_mask = pipe->ring_size - 1;
951 if (!pipe_empty(p_head, p_tail)) {
952 struct pipe_buffer *buf;
953 unsigned int i_head = i->head;
954 size_t off = i->iov_offset;
957 buf = &pipe->bufs[i_head & p_mask];
958 buf->len = off - buf->offset;
961 while (p_head != i_head) {
963 pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
970 static void pipe_advance(struct iov_iter *i, size_t size)
972 struct pipe_inode_info *pipe = i->pipe;
974 struct pipe_buffer *buf;
975 unsigned int p_mask = pipe->ring_size - 1;
976 unsigned int i_head = i->head;
977 size_t off = i->iov_offset, left = size;
979 if (off) /* make it relative to the beginning of buffer */
980 left += off - pipe->bufs[i_head & p_mask].offset;
982 buf = &pipe->bufs[i_head & p_mask];
983 if (left <= buf->len)
989 i->iov_offset = buf->offset + left;
992 /* ... and discard everything past that point */
996 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
1000 bi.bi_size = i->count;
1001 bi.bi_bvec_done = i->iov_offset;
1003 bvec_iter_advance(i->bvec, &bi, size);
1005 i->bvec += bi.bi_idx;
1006 i->nr_segs -= bi.bi_idx;
1007 i->count = bi.bi_size;
1008 i->iov_offset = bi.bi_bvec_done;
1011 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1013 const struct iovec *iov, *end;
1019 size += i->iov_offset; // from beginning of current segment
1020 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1021 if (likely(size < iov->iov_len))
1023 size -= iov->iov_len;
1025 i->iov_offset = size;
1026 i->nr_segs -= iov - i->iov;
1030 void iov_iter_advance(struct iov_iter *i, size_t size)
1032 if (unlikely(i->count < size))
1034 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1035 /* iovec and kvec have identical layouts */
1036 iov_iter_iovec_advance(i, size);
1037 } else if (iov_iter_is_bvec(i)) {
1038 iov_iter_bvec_advance(i, size);
1039 } else if (iov_iter_is_pipe(i)) {
1040 pipe_advance(i, size);
1041 } else if (unlikely(iov_iter_is_xarray(i))) {
1042 i->iov_offset += size;
1044 } else if (iov_iter_is_discard(i)) {
1048 EXPORT_SYMBOL(iov_iter_advance);
1050 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1054 if (WARN_ON(unroll > MAX_RW_COUNT))
1057 if (unlikely(iov_iter_is_pipe(i))) {
1058 struct pipe_inode_info *pipe = i->pipe;
1059 unsigned int p_mask = pipe->ring_size - 1;
1060 unsigned int i_head = i->head;
1061 size_t off = i->iov_offset;
1063 struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1064 size_t n = off - b->offset;
1070 if (!unroll && i_head == i->start_head) {
1075 b = &pipe->bufs[i_head & p_mask];
1076 off = b->offset + b->len;
1078 i->iov_offset = off;
1083 if (unlikely(iov_iter_is_discard(i)))
1085 if (unroll <= i->iov_offset) {
1086 i->iov_offset -= unroll;
1089 unroll -= i->iov_offset;
1090 if (iov_iter_is_xarray(i)) {
1091 BUG(); /* We should never go beyond the start of the specified
1092 * range since we might then be straying into pages that
1095 } else if (iov_iter_is_bvec(i)) {
1096 const struct bio_vec *bvec = i->bvec;
1098 size_t n = (--bvec)->bv_len;
1102 i->iov_offset = n - unroll;
1107 } else { /* same logics for iovec and kvec */
1108 const struct iovec *iov = i->iov;
1110 size_t n = (--iov)->iov_len;
1114 i->iov_offset = n - unroll;
1121 EXPORT_SYMBOL(iov_iter_revert);
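/*
 * Illustrative usage sketch (not part of this file): the usual
 * advance/revert pairing.  A caller that consumed part of the iterator and
 * then hit an error can wind the iterator back so the operation looks
 * untouched to its own caller.  my_send_chunk() is hypothetical and is
 * assumed to advance the iterator itself.
 *
 *	size_t done = 0;
 *
 *	while (iov_iter_count(iter)) {
 *		ssize_t n = my_send_chunk(iter);
 *		if (n < 0) {
 *			iov_iter_revert(iter, done);	// undo what we consumed
 *			return n;
 *		}
 *		done += n;
 *	}
 */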
1124 * Return the count of just the current iov_iter segment.
1126 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1128 if (i->nr_segs > 1) {
1129 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1130 return min(i->count, i->iov->iov_len - i->iov_offset);
1131 if (iov_iter_is_bvec(i))
1132 return min(i->count, i->bvec->bv_len - i->iov_offset);
1136 EXPORT_SYMBOL(iov_iter_single_seg_count);
1138 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1139 const struct kvec *kvec, unsigned long nr_segs,
1142 WARN_ON(direction & ~(READ | WRITE));
1143 *i = (struct iov_iter){
1144 .iter_type = ITER_KVEC,
1145 .data_source = direction,
1152 EXPORT_SYMBOL(iov_iter_kvec);
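/*
 * Illustrative usage sketch (not part of this file): reading a file into a
 * kernel buffer by describing that buffer with a kvec, roughly what
 * kernel_read() does internally.  buf, len, file and pos are hypothetical
 * locals.
 *
 *	struct kvec kv = { .iov_base = buf, .iov_len = len };
 *	struct iov_iter iter;
 *	struct kiocb kiocb;
 *	ssize_t ret;
 *
 *	iov_iter_kvec(&iter, READ, &kv, 1, len);
 *	init_sync_kiocb(&kiocb, file);
 *	kiocb.ki_pos = pos;
 *	ret = call_read_iter(file, &kiocb, &iter);
 */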
1154 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1155 const struct bio_vec *bvec, unsigned long nr_segs,
1158 WARN_ON(direction & ~(READ | WRITE));
1159 *i = (struct iov_iter){
1160 .iter_type = ITER_BVEC,
1161 .data_source = direction,
1168 EXPORT_SYMBOL(iov_iter_bvec);
1170 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1171 struct pipe_inode_info *pipe,
1174 BUG_ON(direction != READ);
1175 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1176 *i = (struct iov_iter){
1177 .iter_type = ITER_PIPE,
1178 .data_source = false,
1181 .start_head = pipe->head,
1186 EXPORT_SYMBOL(iov_iter_pipe);
1189 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1190 * @i: The iterator to initialise.
1191 * @direction: The direction of the transfer.
1192 * @xarray: The xarray to access.
1193 * @start: The start file position.
1194 * @count: The size of the I/O buffer in bytes.
1196 * Set up an I/O iterator to either draw data out of the pages attached to an
1197 * inode or to inject data into those pages. The pages *must* be prevented
1198 * from evaporation, either by taking a ref on them or locking them by the
1201 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1202 struct xarray *xarray, loff_t start, size_t count)
1204 BUG_ON(direction & ~1);
1205 *i = (struct iov_iter) {
1206 .iter_type = ITER_XARRAY,
1207 .data_source = direction,
1209 .xarray_start = start,
1214 EXPORT_SYMBOL(iov_iter_xarray);
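/*
 * Illustrative usage sketch (not part of this file): letting a network
 * filesystem read directly into the pages already attached to an inode's
 * mapping, the way netfs-style helpers use ITER_XARRAY.  The pages covering
 * [pos, pos + len) are assumed to be locked or referenced by the caller, as
 * required above; my_fetch_from_server() is a hypothetical transport helper.
 *
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, len);
 *	ret = my_fetch_from_server(&iter);
 */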
1217 * iov_iter_discard - Initialise an I/O iterator that discards data
1218 * @i: The iterator to initialise.
1219 * @direction: The direction of the transfer.
1220 * @count: The size of the I/O buffer in bytes.
1222 * Set up an I/O iterator that just discards everything that's written to it.
1223 * It's only available as a READ iterator.
1225 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1227 BUG_ON(direction != READ);
1228 *i = (struct iov_iter){
1229 .iter_type = ITER_DISCARD,
1230 .data_source = false,
1235 EXPORT_SYMBOL(iov_iter_discard);
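/*
 * Illustrative usage sketch (not part of this file): using a discard
 * iterator to drain and throw away len bytes from a source that only speaks
 * iterators, e.g. skipping an uninteresting chunk of a stream.
 * my_recv_into() is a hypothetical receive helper that fills whatever
 * iterator it is given.
 *
 *	struct iov_iter bin;
 *	ssize_t ret;
 *
 *	iov_iter_discard(&bin, READ, len);
 *	ret = my_recv_into(sock, &bin);		// data is counted, then dropped
 */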
1237 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1239 unsigned long res = 0;
1240 size_t size = i->count;
1241 size_t skip = i->iov_offset;
1244 for (k = 0; k < i->nr_segs; k++, skip = 0) {
1245 size_t len = i->iov[k].iov_len - skip;
1247 res |= (unsigned long)i->iov[k].iov_base + skip;
1259 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1262 size_t size = i->count;
1263 unsigned skip = i->iov_offset;
1266 for (k = 0; k < i->nr_segs; k++, skip = 0) {
1267 size_t len = i->bvec[k].bv_len - skip;
1268 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1279 unsigned long iov_iter_alignment(const struct iov_iter *i)
1281 /* iovec and kvec have identical layouts */
1282 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1283 return iov_iter_alignment_iovec(i);
1285 if (iov_iter_is_bvec(i))
1286 return iov_iter_alignment_bvec(i);
1288 if (iov_iter_is_pipe(i)) {
1289 unsigned int p_mask = i->pipe->ring_size - 1;
1290 size_t size = i->count;
1292 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1293 return size | i->iov_offset;
1297 if (iov_iter_is_xarray(i))
1298 return (i->xarray_start + i->iov_offset) | i->count;
1302 EXPORT_SYMBOL(iov_iter_alignment);
1304 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1306 unsigned long res = 0;
1307 unsigned long v = 0;
1308 size_t size = i->count;
1311 if (WARN_ON(!iter_is_iovec(i)))
1314 for (k = 0; k < i->nr_segs; k++) {
1315 if (i->iov[k].iov_len) {
1316 unsigned long base = (unsigned long)i->iov[k].iov_base;
1317 if (v) // if not the first one
1318 res |= base | v; // this start | previous end
1319 v = base + i->iov[k].iov_len;
1320 if (size <= i->iov[k].iov_len)
1322 size -= i->iov[k].iov_len;
1327 EXPORT_SYMBOL(iov_iter_gap_alignment);
1329 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1331 struct page **pages,
1335 struct pipe_inode_info *pipe = i->pipe;
1336 unsigned int p_mask = pipe->ring_size - 1;
1337 ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1344 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1352 static ssize_t pipe_get_pages(struct iov_iter *i,
1353 struct page **pages, size_t maxsize, unsigned maxpages,
1356 unsigned int iter_head, npages;
1362 data_start(i, &iter_head, start);
1363 /* Amount of free space: some of this one + all after this one */
1364 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1365 capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1367 return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1370 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1371 pgoff_t index, unsigned int nr_pages)
1373 XA_STATE(xas, xa, index);
1375 unsigned int ret = 0;
1378 for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1379 if (xas_retry(&xas, page))
1382 /* Has the page moved or been split? */
1383 if (unlikely(page != xas_reload(&xas))) {
1388 pages[ret] = find_subpage(page, xas.xa_index);
1389 get_page(pages[ret]);
1390 if (++ret == nr_pages)
1397 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1398 struct page **pages, size_t maxsize,
1399 unsigned maxpages, size_t *_start_offset)
1401 unsigned nr, offset;
1402 pgoff_t index, count;
1403 size_t size = maxsize, actual;
1406 if (!size || !maxpages)
1409 pos = i->xarray_start + i->iov_offset;
1410 index = pos >> PAGE_SHIFT;
1411 offset = pos & ~PAGE_MASK;
1412 *_start_offset = offset;
1415 if (size > PAGE_SIZE - offset) {
1416 size -= PAGE_SIZE - offset;
1417 count += size >> PAGE_SHIFT;
1423 if (count > maxpages)
1426 nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1430 actual = PAGE_SIZE * nr;
1432 if (nr == count && size > 0) {
1433 unsigned last_offset = (nr > 1) ? 0 : offset;
1434 actual -= PAGE_SIZE - (last_offset + size);
1439 /* must be done on non-empty ITER_IOVEC one */
1440 static unsigned long first_iovec_segment(const struct iov_iter *i,
1441 size_t *size, size_t *start,
1442 size_t maxsize, unsigned maxpages)
1447 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1448 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1449 size_t len = i->iov[k].iov_len - skip;
1455 len += (*start = addr % PAGE_SIZE);
1456 if (len > maxpages * PAGE_SIZE)
1457 len = maxpages * PAGE_SIZE;
1459 return addr & PAGE_MASK;
1461 BUG(); // if it had been empty, we wouldn't get called
1464 /* must be done on non-empty ITER_BVEC one */
1465 static struct page *first_bvec_segment(const struct iov_iter *i,
1466 size_t *size, size_t *start,
1467 size_t maxsize, unsigned maxpages)
1470 size_t skip = i->iov_offset, len;
1472 len = i->bvec->bv_len - skip;
1475 skip += i->bvec->bv_offset;
1476 page = i->bvec->bv_page + skip / PAGE_SIZE;
1477 len += (*start = skip % PAGE_SIZE);
1478 if (len > maxpages * PAGE_SIZE)
1479 len = maxpages * PAGE_SIZE;
1484 ssize_t iov_iter_get_pages(struct iov_iter *i,
1485 struct page **pages, size_t maxsize, unsigned maxpages,
1491 if (maxsize > i->count)
1496 if (likely(iter_is_iovec(i))) {
1499 addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1500 n = DIV_ROUND_UP(len, PAGE_SIZE);
1501 res = get_user_pages_fast(addr, n,
1502 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
1504 if (unlikely(res < 0))
1506 return (res == n ? len : res * PAGE_SIZE) - *start;
1508 if (iov_iter_is_bvec(i)) {
1511 page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1512 n = DIV_ROUND_UP(len, PAGE_SIZE);
1514 get_page(*pages++ = page++);
1515 return len - *start;
1517 if (iov_iter_is_pipe(i))
1518 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1519 if (iov_iter_is_xarray(i))
1520 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1523 EXPORT_SYMBOL(iov_iter_get_pages);
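/*
 * Illustrative usage sketch (not part of this file): grabbing references to
 * the pages backing the next chunk of an iterator, as a direct-I/O path
 * might before building a bio.  MY_MAX_PAGES and the pages[] array are
 * hypothetical.
 *
 *	struct page *pages[MY_MAX_PAGES];
 *	size_t start;
 *	ssize_t n;
 *	int k;
 *
 *	n = iov_iter_get_pages(iter, pages, LONG_MAX, MY_MAX_PAGES, &start);
 *	if (n < 0)
 *		return n;
 *	// data begins at offset @start within pages[0]
 *	for (k = 0; k < DIV_ROUND_UP(start + n, PAGE_SIZE); k++)
 *		put_page(pages[k]);
 *	iov_iter_advance(iter, n);	// only once the data has been consumed
 */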
1525 static struct page **get_pages_array(size_t n)
1527 return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1530 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1531 struct page ***pages, size_t maxsize,
1535 unsigned int iter_head, npages;
1541 data_start(i, &iter_head, start);
1542 /* Amount of free space: some of this one + all after this one */
1543 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1544 n = npages * PAGE_SIZE - *start;
1548 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1549 p = get_pages_array(npages);
1552 n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1560 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1561 struct page ***pages, size_t maxsize,
1562 size_t *_start_offset)
1565 unsigned nr, offset;
1566 pgoff_t index, count;
1567 size_t size = maxsize, actual;
1573 pos = i->xarray_start + i->iov_offset;
1574 index = pos >> PAGE_SHIFT;
1575 offset = pos & ~PAGE_MASK;
1576 *_start_offset = offset;
1579 if (size > PAGE_SIZE - offset) {
1580 size -= PAGE_SIZE - offset;
1581 count += size >> PAGE_SHIFT;
1587 p = get_pages_array(count);
1592 nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1596 actual = PAGE_SIZE * nr;
1598 if (nr == count && size > 0) {
1599 unsigned last_offset = (nr > 1) ? 0 : offset;
1600 actual -= PAGE_SIZE - (last_offset + size);
1605 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1606 struct page ***pages, size_t maxsize,
1613 if (maxsize > i->count)
1618 if (likely(iter_is_iovec(i))) {
1621 addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1622 n = DIV_ROUND_UP(len, PAGE_SIZE);
1623 p = get_pages_array(n);
1626 res = get_user_pages_fast(addr, n,
1627 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
1628 if (unlikely(res < 0)) {
1633 return (res == n ? len : res * PAGE_SIZE) - *start;
1635 if (iov_iter_is_bvec(i)) {
1638 page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1639 n = DIV_ROUND_UP(len, PAGE_SIZE);
1640 *pages = p = get_pages_array(n);
1644 get_page(*p++ = page++);
1645 return len - *start;
1647 if (iov_iter_is_pipe(i))
1648 return pipe_get_pages_alloc(i, pages, maxsize, start);
1649 if (iov_iter_is_xarray(i))
1650 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1653 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1655 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1660 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1664 iterate_and_advance(i, bytes, base, len, off, ({
1665 next = csum_and_copy_from_user(base, addr + off, len);
1667 sum = csum_block_add(sum, next, off);
1670 sum = csum_and_memcpy(addr + off, base, len, sum, off);
1676 EXPORT_SYMBOL(csum_and_copy_from_iter);
1678 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1681 struct csum_state *csstate = _csstate;
1684 if (unlikely(iov_iter_is_pipe(i)))
1685 return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1687 sum = csum_shift(csstate->csum, csstate->off);
1688 if (unlikely(iov_iter_is_discard(i))) {
1689 WARN_ON(1); /* for now */
1692 iterate_and_advance(i, bytes, base, len, off, ({
1693 next = csum_and_copy_to_user(addr + off, base, len);
1695 sum = csum_block_add(sum, next, off);
1698 sum = csum_and_memcpy(base, addr + off, len, sum, off);
1701 csstate->csum = csum_shift(sum, csstate->off);
1702 csstate->off += bytes;
1705 EXPORT_SYMBOL(csum_and_copy_to_iter);
1707 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1710 #ifdef CONFIG_CRYPTO_HASH
1711 struct ahash_request *hash = hashp;
1712 struct scatterlist sg;
1715 copied = copy_to_iter(addr, bytes, i);
1716 sg_init_one(&sg, addr, copied);
1717 ahash_request_set_crypt(hash, &sg, NULL, copied);
1718 crypto_ahash_update(hash);
1724 EXPORT_SYMBOL(hash_and_copy_to_iter);
1726 static int iov_npages(const struct iov_iter *i, int maxpages)
1728 size_t skip = i->iov_offset, size = i->count;
1729 const struct iovec *p;
1732 for (p = i->iov; size; skip = 0, p++) {
1733 unsigned offs = offset_in_page(p->iov_base + skip);
1734 size_t len = min(p->iov_len - skip, size);
1738 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1739 if (unlikely(npages > maxpages))
1746 static int bvec_npages(const struct iov_iter *i, int maxpages)
1748 size_t skip = i->iov_offset, size = i->count;
1749 const struct bio_vec *p;
1752 for (p = i->bvec; size; skip = 0, p++) {
1753 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1754 size_t len = min(p->bv_len - skip, size);
1757 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1758 if (unlikely(npages > maxpages))
1764 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1766 if (unlikely(!i->count))
1768 /* iovec and kvec have identical layouts */
1769 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1770 return iov_npages(i, maxpages);
1771 if (iov_iter_is_bvec(i))
1772 return bvec_npages(i, maxpages);
1773 if (iov_iter_is_pipe(i)) {
1774 unsigned int iter_head;
1781 data_start(i, &iter_head, &off);
1782 /* some of this one + all after this one */
1783 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1784 return min(npages, maxpages);
1786 if (iov_iter_is_xarray(i)) {
1787 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1788 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1789 return min(npages, maxpages);
1793 EXPORT_SYMBOL(iov_iter_npages);
1795 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1798 if (unlikely(iov_iter_is_pipe(new))) {
1802 if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1804 if (iov_iter_is_bvec(new))
1805 return new->bvec = kmemdup(new->bvec,
1806 new->nr_segs * sizeof(struct bio_vec),
1809 /* iovec and kvec have identical layout */
1810 return new->iov = kmemdup(new->iov,
1811 new->nr_segs * sizeof(struct iovec),
1814 EXPORT_SYMBOL(dup_iter);
1816 static int copy_compat_iovec_from_user(struct iovec *iov,
1817 const struct iovec __user *uvec, unsigned long nr_segs)
1819 const struct compat_iovec __user *uiov =
1820 (const struct compat_iovec __user *)uvec;
1821 int ret = -EFAULT, i;
1823 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1826 for (i = 0; i < nr_segs; i++) {
1830 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1831 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1833 /* check for compat_size_t not fitting in compat_ssize_t .. */
1838 iov[i].iov_base = compat_ptr(buf);
1839 iov[i].iov_len = len;
1848 static int copy_iovec_from_user(struct iovec *iov,
1849 const struct iovec __user *uvec, unsigned long nr_segs)
1853 if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1855 for (seg = 0; seg < nr_segs; seg++) {
1856 if ((ssize_t)iov[seg].iov_len < 0)
1863 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1864 unsigned long nr_segs, unsigned long fast_segs,
1865 struct iovec *fast_iov, bool compat)
1867 struct iovec *iov = fast_iov;
1871 * SuS says "The readv() function *may* fail if the iovcnt argument was
1872 * less than or equal to 0, or greater than {IOV_MAX}. Linux has
1873 * traditionally returned zero for zero segments, so...
1877 if (nr_segs > UIO_MAXIOV)
1878 return ERR_PTR(-EINVAL);
1879 if (nr_segs > fast_segs) {
1880 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1882 return ERR_PTR(-ENOMEM);
1886 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1888 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1890 if (iov != fast_iov)
1892 return ERR_PTR(ret);
1898 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1899 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1900 struct iov_iter *i, bool compat)
1902 ssize_t total_len = 0;
1906 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1909 return PTR_ERR(iov);
1913 * According to the Single Unix Specification we should return EINVAL if
1914 * an element length is < 0 when cast to ssize_t or if the total length
1915 * would overflow the ssize_t return value of the system call.
1917 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1920 for (seg = 0; seg < nr_segs; seg++) {
1921 ssize_t len = (ssize_t)iov[seg].iov_len;
1923 if (!access_ok(iov[seg].iov_base, len)) {
1930 if (len > MAX_RW_COUNT - total_len) {
1931 len = MAX_RW_COUNT - total_len;
1932 iov[seg].iov_len = len;
1937 iov_iter_init(i, type, iov, nr_segs, total_len);
1946 * import_iovec() - Copy an array of &struct iovec from userspace
1947 * into the kernel, check that it is valid, and initialize a new
1948 * &struct iov_iter iterator to access it.
1950 * @type: One of %READ or %WRITE.
1951 * @uvec: Pointer to the userspace array.
1952 * @nr_segs: Number of elements in userspace array.
1953 * @fast_segs: Number of elements in @iov.
1954 * @iovp: (input and output parameter) Pointer to pointer to (usually small
1955 * on-stack) kernel array.
1956 * @i: Pointer to iterator that will be initialized on success.
1958 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1959 * then this function places %NULL in *@iovp on return. Otherwise, a new
1960 * array will be allocated and the result placed in *@iovp. This means that
1961 * the caller may call kfree() on *@iovp regardless of whether the small
1962 * on-stack array was used or not (and regardless of whether this function
1963 * returns an error or not).
1965 * Return: Negative error code on error, bytes imported on success
1967 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1968 unsigned nr_segs, unsigned fast_segs,
1969 struct iovec **iovp, struct iov_iter *i)
1971 return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1972 in_compat_syscall());
1974 EXPORT_SYMBOL(import_iovec);
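/*
 * Illustrative usage sketch (not part of this file): the canonical
 * import_iovec() calling pattern used by readv/writev-style syscalls.  The
 * on-stack iovstack[] keeps small requests allocation-free; kfree(iov) is
 * always safe afterwards, as documented above.  do_my_readv() is
 * hypothetical.
 *
 *	struct iovec iovstack[UIO_FASTIOV];
 *	struct iovec *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_my_readv(file, &iter);
 *	kfree(iov);
 *	return ret;
 */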
1976 int import_single_range(int rw, void __user *buf, size_t len,
1977 struct iovec *iov, struct iov_iter *i)
1979 if (len > MAX_RW_COUNT)
1981 if (unlikely(!access_ok(buf, len)))
1984 iov->iov_base = buf;
1986 iov_iter_init(i, rw, iov, 1, len);
1989 EXPORT_SYMBOL(import_single_range);
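/*
 * Illustrative usage sketch (not part of this file): import_single_range()
 * is the single-buffer analogue of import_iovec(), used by plain
 * read()/write() style paths.  ubuf and len are hypothetical user-supplied
 * arguments.
 *
 *	struct iovec iov;
 *	struct iov_iter iter;
 *	int ret;
 *
 *	ret = import_single_range(READ, ubuf, len, &iov, &iter);
 *	if (ret)
 *		return ret;
 *	// iter now describes at most MAX_RW_COUNT bytes of the user buffer
 */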