iov_iter: make iterator callbacks use base and len instead of iovec
lib/iov_iter.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
16
17 #define PIPE_PARANOIA /* for now */
18
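/*
 * The iterate_* helpers below walk an iterator and hand each contiguous
 * chunk to the STEP expression as a (base, len) pair: base is the address
 * of the chunk (void __user * for iovec, void * for kvec/bvec/xarray) and
 * len its size.  STEP evaluates to the number of bytes it could *not*
 * process; a non-zero result ends the walk early.  off accumulates the
 * bytes actually handled and is written back into n, while skip tracks the
 * position inside the current segment.
 */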
19 /* covers iovec and kvec alike */
20 #define iterate_iovec(i, n, base, len, off, __p, skip, STEP) {  \
21         size_t off = 0;                                         \
22         do {                                                    \
23                 len = min(n, __p->iov_len - skip);              \
24                 if (likely(len)) {                              \
25                         base = __p->iov_base + skip;            \
26                         len -= (STEP);                          \
27                         off += len;                             \
28                         skip += len;                            \
29                         n -= len;                               \
30                         if (skip < __p->iov_len)                \
31                                 break;                          \
32                 }                                               \
33                 __p++;                                          \
34                 skip = 0;                                       \
35         } while (n);                                            \
36         n = off;                                                \
37 }
38
39 #define iterate_bvec(i, n, base, len, off, p, skip, STEP) {     \
40         size_t off = 0;                                         \
41         while (n) {                                             \
42                 unsigned offset = p->bv_offset + skip;          \
43                 unsigned left;                                  \
44                 void *kaddr = kmap_local_page(p->bv_page +      \
45                                         offset / PAGE_SIZE);    \
46                 base = kaddr + offset % PAGE_SIZE;              \
47                 len = min(min(n, p->bv_len - skip),             \
48                      (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
49                 left = (STEP);                                  \
50                 kunmap_local(kaddr);                            \
51                 len -= left;                                    \
52                 off += len;                                     \
53                 skip += len;                                    \
54                 if (skip == p->bv_len) {                        \
55                         skip = 0;                               \
56                         p++;                                    \
57                 }                                               \
58                 n -= len;                                       \
59                 if (left)                                       \
60                         break;                                  \
61         }                                                       \
62         n = off;                                                \
63 }
64
65 #define iterate_xarray(i, n, base, len, __off, skip, STEP) {    \
66         __label__ __out;                                        \
67         size_t __off = 0;                                       \
68         struct page *head = NULL;                               \
69         size_t offset;                                          \
70         loff_t start = i->xarray_start + skip;                  \
71         pgoff_t index = start >> PAGE_SHIFT;                    \
72         int j;                                                  \
73                                                                 \
74         XA_STATE(xas, i->xarray, index);                        \
75                                                                 \
76         rcu_read_lock();                                        \
77         xas_for_each(&xas, head, ULONG_MAX) {                   \
78                 unsigned left;                                  \
79                 if (xas_retry(&xas, head))                      \
80                         continue;                               \
81                 if (WARN_ON(xa_is_value(head)))                 \
82                         break;                                  \
83                 if (WARN_ON(PageHuge(head)))                    \
84                         break;                                  \
85                 for (j = (head->index < index) ? index - head->index : 0; \
86                      j < thp_nr_pages(head); j++) {             \
87                         void *kaddr = kmap_local_page(head + j);        \
88                         offset = (start + __off) % PAGE_SIZE;   \
89                         base = kaddr + offset;                  \
90                         len = PAGE_SIZE - offset;               \
91                         len = min(n, len);                      \
92                         left = (STEP);                          \
93                         kunmap_local(kaddr);                    \
94                         len -= left;                            \
95                         __off += len;                           \
96                         n -= len;                               \
97                         if (left || n == 0)                     \
98                                 goto __out;                     \
99                 }                                               \
100         }                                                       \
101 __out:                                                          \
102         rcu_read_unlock();                                      \
103         skip += __off;                                          \
104         n = __off;                                              \
105 }
106
107 #define __iterate_and_advance(i, n, base, len, off, I, K) {     \
108         if (unlikely(i->count < n))                             \
109                 n = i->count;                                   \
110         if (likely(n)) {                                        \
111                 size_t skip = i->iov_offset;                    \
112                 if (likely(iter_is_iovec(i))) {                 \
113                         const struct iovec *iov = i->iov;       \
114                         void __user *base;                      \
115                         size_t len;                             \
116                         iterate_iovec(i, n, base, len, off,     \
117                                                 iov, skip, (I)) \
118                         i->nr_segs -= iov - i->iov;             \
119                         i->iov = iov;                           \
120                 } else if (iov_iter_is_bvec(i)) {               \
121                         const struct bio_vec *bvec = i->bvec;   \
122                         void *base;                             \
123                         size_t len;                             \
124                         iterate_bvec(i, n, base, len, off,      \
125                                         bvec, skip, (K))        \
126                         i->nr_segs -= bvec - i->bvec;           \
127                         i->bvec = bvec;                         \
128                 } else if (iov_iter_is_kvec(i)) {               \
129                         const struct kvec *kvec = i->kvec;      \
130                         void *base;                             \
131                         size_t len;                             \
132                         iterate_iovec(i, n, base, len, off,     \
133                                         kvec, skip, (K))        \
134                         i->nr_segs -= kvec - i->kvec;           \
135                         i->kvec = kvec;                         \
136                 } else if (iov_iter_is_xarray(i)) {             \
137                         void *base;                             \
138                         size_t len;                             \
139                         iterate_xarray(i, n, base, len, off,    \
140                                                 skip, (K))      \
141                 }                                               \
142                 i->count -= n;                                  \
143                 i->iov_offset = skip;                           \
144         }                                                       \
145 }
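/*
 * iterate_and_advance() is the common case: the kernel-side step (K) cannot
 * fail, so it is evaluated for its side effects only and forced to report
 * "0 bytes left over".  Callers that really can see a short kernel-side
 * copy (e.g. the machine-check aware _copy_mc_to_iter()) use
 * __iterate_and_advance() directly.
 */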
146 #define iterate_and_advance(i, n, base, len, off, I, K) \
147         __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
148
149 static int copyout(void __user *to, const void *from, size_t n)
150 {
151         if (should_fail_usercopy())
152                 return n;
153         if (access_ok(to, n)) {
154                 instrument_copy_to_user(to, from, n);
155                 n = raw_copy_to_user(to, from, n);
156         }
157         return n;
158 }
159
160 static int copyin(void *to, const void __user *from, size_t n)
161 {
162         if (should_fail_usercopy())
163                 return n;
164         if (access_ok(from, n)) {
165                 instrument_copy_from_user(to, from, n);
166                 n = raw_copy_from_user(to, from, n);
167         }
168         return n;
169 }
170
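/*
 * Copy data from a page into the user iovecs of an ITER_IOVEC iterator,
 * advancing the iterator as we go.  With CONFIG_HIGHMEM the fast path
 * pre-faults the first destination chunk and copies via a kmap_atomic()
 * mapping; if that cannot be done, or the copy comes up short, we fall
 * back to plain kmap().  Returns the number of bytes actually copied.
 */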
171 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
172                          struct iov_iter *i)
173 {
174         size_t skip, copy, left, wanted;
175         const struct iovec *iov;
176         char __user *buf;
177         void *kaddr, *from;
178
179         if (unlikely(bytes > i->count))
180                 bytes = i->count;
181
182         if (unlikely(!bytes))
183                 return 0;
184
185         might_fault();
186         wanted = bytes;
187         iov = i->iov;
188         skip = i->iov_offset;
189         buf = iov->iov_base + skip;
190         copy = min(bytes, iov->iov_len - skip);
191
192         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
193                 kaddr = kmap_atomic(page);
194                 from = kaddr + offset;
195
196                 /* first chunk, usually the only one */
197                 left = copyout(buf, from, copy);
198                 copy -= left;
199                 skip += copy;
200                 from += copy;
201                 bytes -= copy;
202
203                 while (unlikely(!left && bytes)) {
204                         iov++;
205                         buf = iov->iov_base;
206                         copy = min(bytes, iov->iov_len);
207                         left = copyout(buf, from, copy);
208                         copy -= left;
209                         skip = copy;
210                         from += copy;
211                         bytes -= copy;
212                 }
213                 if (likely(!bytes)) {
214                         kunmap_atomic(kaddr);
215                         goto done;
216                 }
217                 offset = from - kaddr;
218                 buf += copy;
219                 kunmap_atomic(kaddr);
220                 copy = min(bytes, iov->iov_len - skip);
221         }
222         /* Too bad - revert to non-atomic kmap */
223
224         kaddr = kmap(page);
225         from = kaddr + offset;
226         left = copyout(buf, from, copy);
227         copy -= left;
228         skip += copy;
229         from += copy;
230         bytes -= copy;
231         while (unlikely(!left && bytes)) {
232                 iov++;
233                 buf = iov->iov_base;
234                 copy = min(bytes, iov->iov_len);
235                 left = copyout(buf, from, copy);
236                 copy -= left;
237                 skip = copy;
238                 from += copy;
239                 bytes -= copy;
240         }
241         kunmap(page);
242
243 done:
244         if (skip == iov->iov_len) {
245                 iov++;
246                 skip = 0;
247         }
248         i->count -= wanted - bytes;
249         i->nr_segs -= iov - i->iov;
250         i->iov = iov;
251         i->iov_offset = skip;
252         return wanted - bytes;
253 }
254
255 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
256                          struct iov_iter *i)
257 {
258         size_t skip, copy, left, wanted;
259         const struct iovec *iov;
260         char __user *buf;
261         void *kaddr, *to;
262
263         if (unlikely(bytes > i->count))
264                 bytes = i->count;
265
266         if (unlikely(!bytes))
267                 return 0;
268
269         might_fault();
270         wanted = bytes;
271         iov = i->iov;
272         skip = i->iov_offset;
273         buf = iov->iov_base + skip;
274         copy = min(bytes, iov->iov_len - skip);
275
276         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
277                 kaddr = kmap_atomic(page);
278                 to = kaddr + offset;
279
280                 /* first chunk, usually the only one */
281                 left = copyin(to, buf, copy);
282                 copy -= left;
283                 skip += copy;
284                 to += copy;
285                 bytes -= copy;
286
287                 while (unlikely(!left && bytes)) {
288                         iov++;
289                         buf = iov->iov_base;
290                         copy = min(bytes, iov->iov_len);
291                         left = copyin(to, buf, copy);
292                         copy -= left;
293                         skip = copy;
294                         to += copy;
295                         bytes -= copy;
296                 }
297                 if (likely(!bytes)) {
298                         kunmap_atomic(kaddr);
299                         goto done;
300                 }
301                 offset = to - kaddr;
302                 buf += copy;
303                 kunmap_atomic(kaddr);
304                 copy = min(bytes, iov->iov_len - skip);
305         }
306         /* Too bad - revert to non-atomic kmap */
307
308         kaddr = kmap(page);
309         to = kaddr + offset;
310         left = copyin(to, buf, copy);
311         copy -= left;
312         skip += copy;
313         to += copy;
314         bytes -= copy;
315         while (unlikely(!left && bytes)) {
316                 iov++;
317                 buf = iov->iov_base;
318                 copy = min(bytes, iov->iov_len);
319                 left = copyin(to, buf, copy);
320                 copy -= left;
321                 skip = copy;
322                 to += copy;
323                 bytes -= copy;
324         }
325         kunmap(page);
326
327 done:
328         if (skip == iov->iov_len) {
329                 iov++;
330                 skip = 0;
331         }
332         i->count -= wanted - bytes;
333         i->nr_segs -= iov - i->iov;
334         i->iov = iov;
335         i->iov_offset = skip;
336         return wanted - bytes;
337 }
338
339 #ifdef PIPE_PARANOIA
340 static bool sanity(const struct iov_iter *i)
341 {
342         struct pipe_inode_info *pipe = i->pipe;
343         unsigned int p_head = pipe->head;
344         unsigned int p_tail = pipe->tail;
345         unsigned int p_mask = pipe->ring_size - 1;
346         unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
347         unsigned int i_head = i->head;
348         unsigned int idx;
349
350         if (i->iov_offset) {
351                 struct pipe_buffer *p;
352                 if (unlikely(p_occupancy == 0))
353                         goto Bad;       // pipe must be non-empty
354                 if (unlikely(i_head != p_head - 1))
355                         goto Bad;       // must be at the last buffer...
356
357                 p = &pipe->bufs[i_head & p_mask];
358                 if (unlikely(p->offset + p->len != i->iov_offset))
359                         goto Bad;       // ... at the end of segment
360         } else {
361                 if (i_head != p_head)
362                         goto Bad;       // must be right after the last buffer
363         }
364         return true;
365 Bad:
366         printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
367         printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
368                         p_head, p_tail, pipe->ring_size);
369         for (idx = 0; idx < pipe->ring_size; idx++)
370                 printk(KERN_ERR "[%p %p %d %d]\n",
371                         pipe->bufs[idx].ops,
372                         pipe->bufs[idx].page,
373                         pipe->bufs[idx].offset,
374                         pipe->bufs[idx].len);
375         WARN_ON(1);
376         return false;
377 }
378 #else
379 #define sanity(i) true
380 #endif
381
382 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
383                          struct iov_iter *i)
384 {
385         struct pipe_inode_info *pipe = i->pipe;
386         struct pipe_buffer *buf;
387         unsigned int p_tail = pipe->tail;
388         unsigned int p_mask = pipe->ring_size - 1;
389         unsigned int i_head = i->head;
390         size_t off;
391
392         if (unlikely(bytes > i->count))
393                 bytes = i->count;
394
395         if (unlikely(!bytes))
396                 return 0;
397
398         if (!sanity(i))
399                 return 0;
400
401         off = i->iov_offset;
402         buf = &pipe->bufs[i_head & p_mask];
403         if (off) {
404                 if (offset == off && buf->page == page) {
405                         /* merge with the last one */
406                         buf->len += bytes;
407                         i->iov_offset += bytes;
408                         goto out;
409                 }
410                 i_head++;
411                 buf = &pipe->bufs[i_head & p_mask];
412         }
413         if (pipe_full(i_head, p_tail, pipe->max_usage))
414                 return 0;
415
416         buf->ops = &page_cache_pipe_buf_ops;
417         get_page(page);
418         buf->page = page;
419         buf->offset = offset;
420         buf->len = bytes;
421
422         pipe->head = i_head + 1;
423         i->iov_offset = offset + bytes;
424         i->head = i_head;
425 out:
426         i->count -= bytes;
427         return bytes;
428 }
429
430 /*
431  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
432  * bytes.  For each iovec, fault in each page that constitutes the iovec.
433  *
434  * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
435  * because it is an invalid address).
436  */
437 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
438 {
439         if (iter_is_iovec(i)) {
440                 const struct iovec *p;
441                 size_t skip;
442
443                 if (bytes > i->count)
444                         bytes = i->count;
445                 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
446                         size_t len = min(bytes, p->iov_len - skip);
447                         int err;
448
449                         if (unlikely(!len))
450                                 continue;
451                         err = fault_in_pages_readable(p->iov_base + skip, len);
452                         if (unlikely(err))
453                                 return err;
454                         bytes -= len;
455                 }
456         }
457         return 0;
458 }
459 EXPORT_SYMBOL(iov_iter_fault_in_readable);
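
/*
 * Illustrative sketch, not part of the original file: the usual caller
 * pattern (e.g. a buffered-write path) is to pre-fault the user buffer and
 * then copy with page faults implicitly disabled by kmap_atomic().  A real
 * caller would typically retry a short copy; this sketch just bails out.
 * Function and parameter names here are placeholders.
 */
static ssize_t example_write_chunk(struct page *page, unsigned offset,
                                   size_t bytes, struct iov_iter *from)
{
        size_t copied;

        if (iov_iter_fault_in_readable(from, bytes))
                return -EFAULT;

        copied = copy_page_from_iter_atomic(page, offset, bytes, from);
        return copied ? copied : -EFAULT;
}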
460
461 void iov_iter_init(struct iov_iter *i, unsigned int direction,
462                         const struct iovec *iov, unsigned long nr_segs,
463                         size_t count)
464 {
465         WARN_ON(direction & ~(READ | WRITE));
466         WARN_ON_ONCE(uaccess_kernel());
467         *i = (struct iov_iter) {
468                 .iter_type = ITER_IOVEC,
469                 .data_source = direction,
470                 .iov = iov,
471                 .nr_segs = nr_segs,
472                 .iov_offset = 0,
473                 .count = count
474         };
475 }
476 EXPORT_SYMBOL(iov_iter_init);
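
/*
 * Illustrative sketch, not part of the original file: wrapping a single
 * user buffer in an ITER_IOVEC destination and filling it from a kernel
 * buffer.  Names are placeholders; real read(2) paths usually build the
 * iterator with import_single_range()/import_iovec() instead.
 */
static ssize_t example_fill_user_buf(void __user *ubuf, size_t len,
                                     const void *kbuf)
{
        struct iovec iov = { .iov_base = ubuf, .iov_len = len };
        struct iov_iter iter;

        iov_iter_init(&iter, READ, &iov, 1, len);
        return copy_to_iter(kbuf, len, &iter);  /* bytes actually copied */
}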
477
478 static inline bool allocated(struct pipe_buffer *buf)
479 {
480         return buf->ops == &default_pipe_buf_ops;
481 }
482
483 static inline void data_start(const struct iov_iter *i,
484                               unsigned int *iter_headp, size_t *offp)
485 {
486         unsigned int p_mask = i->pipe->ring_size - 1;
487         unsigned int iter_head = i->head;
488         size_t off = i->iov_offset;
489
490         if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
491                     off == PAGE_SIZE)) {
492                 iter_head++;
493                 off = 0;
494         }
495         *iter_headp = iter_head;
496         *offp = off;
497 }
498
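/*
 * Make room in the pipe for up to "size" bytes: reuse the tail of the last
 * partially filled buffer if there is one, then allocate fresh pages until
 * either the request is covered or the ring is full.  On return *iter_headp
 * and *offp tell the caller where to start writing; the return value is the
 * amount of space actually secured.
 */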
499 static size_t push_pipe(struct iov_iter *i, size_t size,
500                         int *iter_headp, size_t *offp)
501 {
502         struct pipe_inode_info *pipe = i->pipe;
503         unsigned int p_tail = pipe->tail;
504         unsigned int p_mask = pipe->ring_size - 1;
505         unsigned int iter_head;
506         size_t off;
507         ssize_t left;
508
509         if (unlikely(size > i->count))
510                 size = i->count;
511         if (unlikely(!size))
512                 return 0;
513
514         left = size;
515         data_start(i, &iter_head, &off);
516         *iter_headp = iter_head;
517         *offp = off;
518         if (off) {
519                 left -= PAGE_SIZE - off;
520                 if (left <= 0) {
521                         pipe->bufs[iter_head & p_mask].len += size;
522                         return size;
523                 }
524                 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
525                 iter_head++;
526         }
527         while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
528                 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
529                 struct page *page = alloc_page(GFP_USER);
530                 if (!page)
531                         break;
532
533                 buf->ops = &default_pipe_buf_ops;
534                 buf->page = page;
535                 buf->offset = 0;
536                 buf->len = min_t(ssize_t, left, PAGE_SIZE);
537                 left -= buf->len;
538                 iter_head++;
539                 pipe->head = iter_head;
540
541                 if (left == 0)
542                         return size;
543         }
544         return size - left;
545 }
546
547 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
548                                 struct iov_iter *i)
549 {
550         struct pipe_inode_info *pipe = i->pipe;
551         unsigned int p_mask = pipe->ring_size - 1;
552         unsigned int i_head;
553         size_t n, off;
554
555         if (!sanity(i))
556                 return 0;
557
558         bytes = n = push_pipe(i, bytes, &i_head, &off);
559         if (unlikely(!n))
560                 return 0;
561         do {
562                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
563                 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
564                 i->head = i_head;
565                 i->iov_offset = off + chunk;
566                 n -= chunk;
567                 addr += chunk;
568                 off = 0;
569                 i_head++;
570         } while (n);
571         i->count -= bytes;
572         return bytes;
573 }
574
575 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
576                               __wsum sum, size_t off)
577 {
578         __wsum next = csum_partial_copy_nocheck(from, to, len);
579         return csum_block_add(sum, next, off);
580 }
581
582 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
583                                          struct csum_state *csstate,
584                                          struct iov_iter *i)
585 {
586         struct pipe_inode_info *pipe = i->pipe;
587         unsigned int p_mask = pipe->ring_size - 1;
588         __wsum sum = csstate->csum;
589         size_t off = csstate->off;
590         unsigned int i_head;
591         size_t n, r;
592
593         if (!sanity(i))
594                 return 0;
595
596         bytes = n = push_pipe(i, bytes, &i_head, &r);
597         if (unlikely(!n))
598                 return 0;
599         do {
600                 size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
601                 char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
602                 sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
603                 kunmap_atomic(p);
604                 i->head = i_head;
605                 i->iov_offset = r + chunk;
606                 n -= chunk;
607                 off += chunk;
608                 addr += chunk;
609                 r = 0;
610                 i_head++;
611         } while (n);
612         i->count -= bytes;
613         csstate->csum = sum;
614         csstate->off = off;
615         return bytes;
616 }
617
618 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
619 {
620         if (unlikely(iov_iter_is_pipe(i)))
621                 return copy_pipe_to_iter(addr, bytes, i);
622         if (iter_is_iovec(i))
623                 might_fault();
624         iterate_and_advance(i, bytes, base, len, off,
625                 copyout(base, addr + off, len),
626                 memcpy(base, addr + off, len)
627         )
628
629         return bytes;
630 }
631 EXPORT_SYMBOL(_copy_to_iter);
632
633 #ifdef CONFIG_ARCH_HAS_COPY_MC
634 static int copyout_mc(void __user *to, const void *from, size_t n)
635 {
636         if (access_ok(to, n)) {
637                 instrument_copy_to_user(to, from, n);
638                 n = copy_mc_to_user((__force void *) to, from, n);
639         }
640         return n;
641 }
642
643 static unsigned long copy_mc_to_page(struct page *page, size_t offset,
644                 const char *from, size_t len)
645 {
646         unsigned long ret;
647         char *to;
648
649         to = kmap_atomic(page);
650         ret = copy_mc_to_kernel(to + offset, from, len);
651         kunmap_atomic(to);
652
653         return ret;
654 }
655
656 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
657                                 struct iov_iter *i)
658 {
659         struct pipe_inode_info *pipe = i->pipe;
660         unsigned int p_mask = pipe->ring_size - 1;
661         unsigned int i_head;
662         size_t n, off, xfer = 0;
663
664         if (!sanity(i))
665                 return 0;
666
667         bytes = n = push_pipe(i, bytes, &i_head, &off);
668         if (unlikely(!n))
669                 return 0;
670         do {
671                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
672                 unsigned long rem;
673
674                 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
675                                             off, addr, chunk);
676                 i->head = i_head;
677                 i->iov_offset = off + chunk - rem;
678                 xfer += chunk - rem;
679                 if (rem)
680                         break;
681                 n -= chunk;
682                 addr += chunk;
683                 off = 0;
684                 i_head++;
685         } while (n);
686         i->count -= xfer;
687         return xfer;
688 }
689
690 /**
691  * _copy_mc_to_iter - copy to iter with source memory error exception handling
692  * @addr: source kernel address
693  * @bytes: total transfer length
694  * @i: destination iterator
695  *
696  * The pmem driver deploys this for the dax operation
697  * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
698  * block layer). Upon #MC, read(2) aborts and returns EIO or the bytes
699  * successfully copied.
700  *
701  * The main differences between this and typical _copy_to_iter() are:
702  *
703  * * Typical tail/residue handling after a fault retries the copy
704  *   byte-by-byte until the fault happens again. Re-triggering machine
705  *   checks is potentially fatal so the implementation uses source
706  *   alignment and poison alignment assumptions to avoid re-triggering
707  *   hardware exceptions.
708  *
709  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
710  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
711  *   a short copy.
712  */
713 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
714 {
715         if (unlikely(iov_iter_is_pipe(i)))
716                 return copy_mc_pipe_to_iter(addr, bytes, i);
717         if (iter_is_iovec(i))
718                 might_fault();
719         __iterate_and_advance(i, bytes, base, len, off,
720                 copyout_mc(base, addr + off, len),
721                 copy_mc_to_kernel(base, addr + off, len)
722         )
723
724         return bytes;
725 }
726 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
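
/*
 * Illustrative sketch, not part of the original file: a dax-read style
 * caller (names are placeholders) treating a short copy as "poison was
 * hit" and turning it into -EIO unless some data already went out.
 */
static ssize_t example_mc_read(void *kaddr, size_t len, struct iov_iter *i)
{
        size_t copied = _copy_mc_to_iter(kaddr, len, i);

        if (copied < len)       /* #MC, or a fault for ITER_IOVEC */
                return copied ? copied : -EIO;
        return copied;
}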
727 #endif /* CONFIG_ARCH_HAS_COPY_MC */
728
729 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
730 {
731         if (unlikely(iov_iter_is_pipe(i))) {
732                 WARN_ON(1);
733                 return 0;
734         }
735         if (iter_is_iovec(i))
736                 might_fault();
737         iterate_and_advance(i, bytes, base, len, off,
738                 copyin(addr + off, base, len),
739                 memcpy(addr + off, base, len)
740         )
741
742         return bytes;
743 }
744 EXPORT_SYMBOL(_copy_from_iter);
745
746 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
747 {
748         if (unlikely(iov_iter_is_pipe(i))) {
749                 WARN_ON(1);
750                 return 0;
751         }
752         iterate_and_advance(i, bytes, base, len, off,
753                 __copy_from_user_inatomic_nocache(addr + off, base, len),
754                 memcpy(addr + off, base, len)
755         )
756
757         return bytes;
758 }
759 EXPORT_SYMBOL(_copy_from_iter_nocache);
760
761 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
762 /**
763  * _copy_from_iter_flushcache - write destination through cpu cache
764  * @addr: destination kernel address
765  * @bytes: total transfer length
766  * @i: source iterator
767  *
768  * The pmem driver arranges for filesystem-dax to use this facility via
769  * dax_copy_from_iter() for ensuring that writes to persistent memory
770  * are flushed through the CPU cache. It is differentiated from
771  * _copy_from_iter_nocache() in that it guarantees all data is flushed for
772  * all iterator types. _copy_from_iter_nocache() only attempts to bypass
773  * the cache for the ITER_IOVEC case, and on some archs may use
774  * instructions that strand dirty data in the cache.
775  */
776 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
777 {
778         if (unlikely(iov_iter_is_pipe(i))) {
779                 WARN_ON(1);
780                 return 0;
781         }
782         iterate_and_advance(i, bytes, base, len, off,
783                 __copy_from_user_flushcache(addr + off, base, len),
784                 memcpy_flushcache(addr + off, base, len)
785         )
786
787         return bytes;
788 }
789 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
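
/*
 * Illustrative sketch, not part of the original file: copying a kernel
 * buffer into a pmem-style destination through an ITER_KVEC source so the
 * data is guaranteed to be pushed out of the CPU cache.  Names are
 * placeholders.
 */
static size_t example_flushcache_copy(void *pmem_dst, const void *src,
                                      size_t len)
{
        struct kvec kv = { .iov_base = (void *)src, .iov_len = len };
        struct iov_iter iter;

        iov_iter_kvec(&iter, WRITE, &kv, 1, len);
        return _copy_from_iter_flushcache(pmem_dst, len, &iter);
}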
790 #endif
791
792 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
793 {
794         struct page *head;
795         size_t v = n + offset;
796
797         /*
798          * The general case needs to access the page order in order
799          * to compute the page size.
800          * However, we mostly deal with order-0 pages and thus can
801          * avoid a possible cache line miss for requests that fit all
802          * page orders.
803          */
804         if (n <= v && v <= PAGE_SIZE)
805                 return true;
806
807         head = compound_head(page);
808         v += (page - head) << PAGE_SHIFT;
809
810         if (likely(n <= v && v <= (page_size(head))))
811                 return true;
812         WARN_ON(1);
813         return false;
814 }
815
816 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
817                          struct iov_iter *i)
818 {
819         if (likely(iter_is_iovec(i)))
820                 return copy_page_to_iter_iovec(page, offset, bytes, i);
821         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
822                 void *kaddr = kmap_atomic(page);
823                 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
824                 kunmap_atomic(kaddr);
825                 return wanted;
826         }
827         if (iov_iter_is_pipe(i))
828                 return copy_page_to_iter_pipe(page, offset, bytes, i);
829         if (unlikely(iov_iter_is_discard(i))) {
830                 if (unlikely(i->count < bytes))
831                         bytes = i->count;
832                 i->count -= bytes;
833                 return bytes;
834         }
835         WARN_ON(1);
836         return 0;
837 }
838
839 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
840                          struct iov_iter *i)
841 {
842         size_t res = 0;
843         if (unlikely(!page_copy_sane(page, offset, bytes)))
844                 return 0;
845         page += offset / PAGE_SIZE; // first subpage
846         offset %= PAGE_SIZE;
847         while (1) {
848                 size_t n = __copy_page_to_iter(page, offset,
849                                 min(bytes, (size_t)PAGE_SIZE - offset), i);
850                 res += n;
851                 bytes -= n;
852                 if (!bytes || !n)
853                         break;
854                 offset += n;
855                 if (offset == PAGE_SIZE) {
856                         page++;
857                         offset = 0;
858                 }
859         }
860         return res;
861 }
862 EXPORT_SYMBOL(copy_page_to_iter);
863
864 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
865                          struct iov_iter *i)
866 {
867         if (unlikely(!page_copy_sane(page, offset, bytes)))
868                 return 0;
869         if (likely(iter_is_iovec(i)))
870                 return copy_page_from_iter_iovec(page, offset, bytes, i);
871         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
872                 void *kaddr = kmap_atomic(page);
873                 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
874                 kunmap_atomic(kaddr);
875                 return wanted;
876         }
877         WARN_ON(1);
878         return 0;
879 }
880 EXPORT_SYMBOL(copy_page_from_iter);
881
882 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
883 {
884         struct pipe_inode_info *pipe = i->pipe;
885         unsigned int p_mask = pipe->ring_size - 1;
886         unsigned int i_head;
887         size_t n, off;
888
889         if (!sanity(i))
890                 return 0;
891
892         bytes = n = push_pipe(i, bytes, &i_head, &off);
893         if (unlikely(!n))
894                 return 0;
895
896         do {
897                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
898                 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
899                 i->head = i_head;
900                 i->iov_offset = off + chunk;
901                 n -= chunk;
902                 off = 0;
903                 i_head++;
904         } while (n);
905         i->count -= bytes;
906         return bytes;
907 }
908
909 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
910 {
911         if (unlikely(iov_iter_is_pipe(i)))
912                 return pipe_zero(bytes, i);
913         iterate_and_advance(i, bytes, base, len, count,
914                 clear_user(base, len),
915                 memset(base, 0, len)
916         )
917
918         return bytes;
919 }
920 EXPORT_SYMBOL(iov_iter_zero);
921
922 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
923                                   struct iov_iter *i)
924 {
925         char *kaddr = kmap_atomic(page), *p = kaddr + offset;
926         if (unlikely(!page_copy_sane(page, offset, bytes))) {
927                 kunmap_atomic(kaddr);
928                 return 0;
929         }
930         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
931                 kunmap_atomic(kaddr);
932                 WARN_ON(1);
933                 return 0;
934         }
935         iterate_and_advance(i, bytes, base, len, off,
936                 copyin(p + off, base, len),
937                 memcpy(p + off, base, len)
938         )
939         kunmap_atomic(kaddr);
940         return bytes;
941 }
942 EXPORT_SYMBOL(copy_page_from_iter_atomic);
943
944 static inline void pipe_truncate(struct iov_iter *i)
945 {
946         struct pipe_inode_info *pipe = i->pipe;
947         unsigned int p_tail = pipe->tail;
948         unsigned int p_head = pipe->head;
949         unsigned int p_mask = pipe->ring_size - 1;
950
951         if (!pipe_empty(p_head, p_tail)) {
952                 struct pipe_buffer *buf;
953                 unsigned int i_head = i->head;
954                 size_t off = i->iov_offset;
955
956                 if (off) {
957                         buf = &pipe->bufs[i_head & p_mask];
958                         buf->len = off - buf->offset;
959                         i_head++;
960                 }
961                 while (p_head != i_head) {
962                         p_head--;
963                         pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
964                 }
965
966                 pipe->head = p_head;
967         }
968 }
969
970 static void pipe_advance(struct iov_iter *i, size_t size)
971 {
972         struct pipe_inode_info *pipe = i->pipe;
973         if (size) {
974                 struct pipe_buffer *buf;
975                 unsigned int p_mask = pipe->ring_size - 1;
976                 unsigned int i_head = i->head;
977                 size_t off = i->iov_offset, left = size;
978
979                 if (off) /* make it relative to the beginning of buffer */
980                         left += off - pipe->bufs[i_head & p_mask].offset;
981                 while (1) {
982                         buf = &pipe->bufs[i_head & p_mask];
983                         if (left <= buf->len)
984                                 break;
985                         left -= buf->len;
986                         i_head++;
987                 }
988                 i->head = i_head;
989                 i->iov_offset = buf->offset + left;
990         }
991         i->count -= size;
992         /* ... and discard everything past that point */
993         pipe_truncate(i);
994 }
995
996 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
997 {
998         struct bvec_iter bi;
999
1000         bi.bi_size = i->count;
1001         bi.bi_bvec_done = i->iov_offset;
1002         bi.bi_idx = 0;
1003         bvec_iter_advance(i->bvec, &bi, size);
1004
1005         i->bvec += bi.bi_idx;
1006         i->nr_segs -= bi.bi_idx;
1007         i->count = bi.bi_size;
1008         i->iov_offset = bi.bi_bvec_done;
1009 }
1010
1011 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1012 {
1013         const struct iovec *iov, *end;
1014
1015         if (!i->count)
1016                 return;
1017         i->count -= size;
1018
1019         size += i->iov_offset; // from beginning of current segment
1020         for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1021                 if (likely(size < iov->iov_len))
1022                         break;
1023                 size -= iov->iov_len;
1024         }
1025         i->iov_offset = size;
1026         i->nr_segs -= iov - i->iov;
1027         i->iov = iov;
1028 }
1029
1030 void iov_iter_advance(struct iov_iter *i, size_t size)
1031 {
1032         if (unlikely(i->count < size))
1033                 size = i->count;
1034         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1035                 /* iovec and kvec have identical layouts */
1036                 iov_iter_iovec_advance(i, size);
1037         } else if (iov_iter_is_bvec(i)) {
1038                 iov_iter_bvec_advance(i, size);
1039         } else if (iov_iter_is_pipe(i)) {
1040                 pipe_advance(i, size);
1041         } else if (unlikely(iov_iter_is_xarray(i))) {
1042                 i->iov_offset += size;
1043                 i->count -= size;
1044         } else if (iov_iter_is_discard(i)) {
1045                 i->count -= size;
1046         }
1047 }
1048 EXPORT_SYMBOL(iov_iter_advance);
1049
1050 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1051 {
1052         if (!unroll)
1053                 return;
1054         if (WARN_ON(unroll > MAX_RW_COUNT))
1055                 return;
1056         i->count += unroll;
1057         if (unlikely(iov_iter_is_pipe(i))) {
1058                 struct pipe_inode_info *pipe = i->pipe;
1059                 unsigned int p_mask = pipe->ring_size - 1;
1060                 unsigned int i_head = i->head;
1061                 size_t off = i->iov_offset;
1062                 while (1) {
1063                         struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1064                         size_t n = off - b->offset;
1065                         if (unroll < n) {
1066                                 off -= unroll;
1067                                 break;
1068                         }
1069                         unroll -= n;
1070                         if (!unroll && i_head == i->start_head) {
1071                                 off = 0;
1072                                 break;
1073                         }
1074                         i_head--;
1075                         b = &pipe->bufs[i_head & p_mask];
1076                         off = b->offset + b->len;
1077                 }
1078                 i->iov_offset = off;
1079                 i->head = i_head;
1080                 pipe_truncate(i);
1081                 return;
1082         }
1083         if (unlikely(iov_iter_is_discard(i)))
1084                 return;
1085         if (unroll <= i->iov_offset) {
1086                 i->iov_offset -= unroll;
1087                 return;
1088         }
1089         unroll -= i->iov_offset;
1090         if (iov_iter_is_xarray(i)) {
1091                 BUG(); /* We should never go beyond the start of the specified
1092                         * range since we might then be straying into pages that
1093                         * aren't pinned.
1094                         */
1095         } else if (iov_iter_is_bvec(i)) {
1096                 const struct bio_vec *bvec = i->bvec;
1097                 while (1) {
1098                         size_t n = (--bvec)->bv_len;
1099                         i->nr_segs++;
1100                         if (unroll <= n) {
1101                                 i->bvec = bvec;
1102                                 i->iov_offset = n - unroll;
1103                                 return;
1104                         }
1105                         unroll -= n;
1106                 }
1107         } else { /* same logic for iovec and kvec */
1108                 const struct iovec *iov = i->iov;
1109                 while (1) {
1110                         size_t n = (--iov)->iov_len;
1111                         i->nr_segs++;
1112                         if (unroll <= n) {
1113                                 i->iov = iov;
1114                                 i->iov_offset = n - unroll;
1115                                 return;
1116                         }
1117                         unroll -= n;
1118                 }
1119         }
1120 }
1121 EXPORT_SYMBOL(iov_iter_revert);
1122
1123 /*
1124  * Return the count of just the current iov_iter segment.
1125  */
1126 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1127 {
1128         if (i->nr_segs > 1) {
1129                 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1130                         return min(i->count, i->iov->iov_len - i->iov_offset);
1131                 if (iov_iter_is_bvec(i))
1132                         return min(i->count, i->bvec->bv_len - i->iov_offset);
1133         }
1134         return i->count;
1135 }
1136 EXPORT_SYMBOL(iov_iter_single_seg_count);
1137
1138 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1139                         const struct kvec *kvec, unsigned long nr_segs,
1140                         size_t count)
1141 {
1142         WARN_ON(direction & ~(READ | WRITE));
1143         *i = (struct iov_iter){
1144                 .iter_type = ITER_KVEC,
1145                 .data_source = direction,
1146                 .kvec = kvec,
1147                 .nr_segs = nr_segs,
1148                 .iov_offset = 0,
1149                 .count = count
1150         };
1151 }
1152 EXPORT_SYMBOL(iov_iter_kvec);
1153
1154 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1155                         const struct bio_vec *bvec, unsigned long nr_segs,
1156                         size_t count)
1157 {
1158         WARN_ON(direction & ~(READ | WRITE));
1159         *i = (struct iov_iter){
1160                 .iter_type = ITER_BVEC,
1161                 .data_source = direction,
1162                 .bvec = bvec,
1163                 .nr_segs = nr_segs,
1164                 .iov_offset = 0,
1165                 .count = count
1166         };
1167 }
1168 EXPORT_SYMBOL(iov_iter_bvec);
1169
1170 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1171                         struct pipe_inode_info *pipe,
1172                         size_t count)
1173 {
1174         BUG_ON(direction != READ);
1175         WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1176         *i = (struct iov_iter){
1177                 .iter_type = ITER_PIPE,
1178                 .data_source = false,
1179                 .pipe = pipe,
1180                 .head = pipe->head,
1181                 .start_head = pipe->head,
1182                 .iov_offset = 0,
1183                 .count = count
1184         };
1185 }
1186 EXPORT_SYMBOL(iov_iter_pipe);
1187
1188 /**
1189  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1190  * @i: The iterator to initialise.
1191  * @direction: The direction of the transfer.
1192  * @xarray: The xarray to access.
1193  * @start: The start file position.
1194  * @count: The size of the I/O buffer in bytes.
1195  *
1196  * Set up an I/O iterator to either draw data out of the pages attached to an
1197  * inode or to inject data into those pages.  The caller *must* prevent the
1198  * pages from being evicted or freed while the iterator is in use, either
1199  * by taking a ref on them or by locking them.
1200  */
1201 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1202                      struct xarray *xarray, loff_t start, size_t count)
1203 {
1204         BUG_ON(direction & ~1);
1205         *i = (struct iov_iter) {
1206                 .iter_type = ITER_XARRAY,
1207                 .data_source = direction,
1208                 .xarray = xarray,
1209                 .xarray_start = start,
1210                 .count = count,
1211                 .iov_offset = 0
1212         };
1213 }
1214 EXPORT_SYMBOL(iov_iter_xarray);
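
/*
 * Illustrative sketch, not part of the original file: describing a byte
 * range of a file's pagecache (mapping->i_pages is the backing xarray) as
 * an iterator.  "mapping", "pos" and "len" are placeholders, and the caller
 * must hold references on or locks over the pages, as noted above.
 */
static void example_iter_over_pagecache(struct address_space *mapping,
                                        loff_t pos, size_t len,
                                        struct iov_iter *iter)
{
        iov_iter_xarray(iter, READ, &mapping->i_pages, pos, len);
}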
1215
1216 /**
1217  * iov_iter_discard - Initialise an I/O iterator that discards data
1218  * @i: The iterator to initialise.
1219  * @direction: The direction of the transfer.
1220  * @count: The size of the I/O buffer in bytes.
1221  *
1222  * Set up an I/O iterator that just discards everything that's written to it.
1223  * It's only available as a READ iterator.
1224  */
1225 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1226 {
1227         BUG_ON(direction != READ);
1228         *i = (struct iov_iter){
1229                 .iter_type = ITER_DISCARD,
1230                 .data_source = false,
1231                 .count = count,
1232                 .iov_offset = 0
1233         };
1234 }
1235 EXPORT_SYMBOL(iov_iter_discard);
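
/*
 * Illustrative sketch, not part of the original file: draining and
 * discarding a stretch of data, e.g. to skip over part of a message.
 * copy_to_iter() on a discard iterator simply consumes the count.
 */
static void example_skip_bytes(const void *unwanted, size_t len)
{
        struct iov_iter sink;

        iov_iter_discard(&sink, READ, len);
        copy_to_iter(unwanted, len, &sink);
}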
1236
1237 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1238 {
1239         unsigned long res = 0;
1240         size_t size = i->count;
1241         size_t skip = i->iov_offset;
1242         unsigned k;
1243
1244         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1245                 size_t len = i->iov[k].iov_len - skip;
1246                 if (len) {
1247                         res |= (unsigned long)i->iov[k].iov_base + skip;
1248                         if (len > size)
1249                                 len = size;
1250                         res |= len;
1251                         size -= len;
1252                         if (!size)
1253                                 break;
1254                 }
1255         }
1256         return res;
1257 }
1258
1259 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1260 {
1261         unsigned res = 0;
1262         size_t size = i->count;
1263         unsigned skip = i->iov_offset;
1264         unsigned k;
1265
1266         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1267                 size_t len = i->bvec[k].bv_len - skip;
1268                 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1269                 if (len > size)
1270                         len = size;
1271                 res |= len;
1272                 size -= len;
1273                 if (!size)
1274                         break;
1275         }
1276         return res;
1277 }
1278
1279 unsigned long iov_iter_alignment(const struct iov_iter *i)
1280 {
1281         /* iovec and kvec have identical layouts */
1282         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1283                 return iov_iter_alignment_iovec(i);
1284
1285         if (iov_iter_is_bvec(i))
1286                 return iov_iter_alignment_bvec(i);
1287
1288         if (iov_iter_is_pipe(i)) {
1289                 unsigned int p_mask = i->pipe->ring_size - 1;
1290                 size_t size = i->count;
1291
1292                 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1293                         return size | i->iov_offset;
1294                 return size;
1295         }
1296
1297         if (iov_iter_is_xarray(i))
1298                 return (i->xarray_start + i->iov_offset) | i->count;
1299
1300         return 0;
1301 }
1302 EXPORT_SYMBOL(iov_iter_alignment);
1303
1304 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1305 {
1306         unsigned long res = 0;
1307         unsigned long v = 0;
1308         size_t size = i->count;
1309         unsigned k;
1310
1311         if (WARN_ON(!iter_is_iovec(i)))
1312                 return ~0U;
1313
1314         for (k = 0; k < i->nr_segs; k++) {
1315                 if (i->iov[k].iov_len) {
1316                         unsigned long base = (unsigned long)i->iov[k].iov_base;
1317                         if (v) // if not the first one
1318                                 res |= base | v; // this start | previous end
1319                         v = base + i->iov[k].iov_len;
1320                         if (size <= i->iov[k].iov_len)
1321                                 break;
1322                         size -= i->iov[k].iov_len;
1323                 }
1324         }
1325         return res;
1326 }
1327 EXPORT_SYMBOL(iov_iter_gap_alignment);
1328
1329 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1330                                 size_t maxsize,
1331                                 struct page **pages,
1332                                 int iter_head,
1333                                 size_t *start)
1334 {
1335         struct pipe_inode_info *pipe = i->pipe;
1336         unsigned int p_mask = pipe->ring_size - 1;
1337         ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1338         if (!n)
1339                 return -EFAULT;
1340
1341         maxsize = n;
1342         n += *start;
1343         while (n > 0) {
1344                 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1345                 iter_head++;
1346                 n -= PAGE_SIZE;
1347         }
1348
1349         return maxsize;
1350 }
1351
1352 static ssize_t pipe_get_pages(struct iov_iter *i,
1353                    struct page **pages, size_t maxsize, unsigned maxpages,
1354                    size_t *start)
1355 {
1356         unsigned int iter_head, npages;
1357         size_t capacity;
1358
1359         if (!sanity(i))
1360                 return -EFAULT;
1361
1362         data_start(i, &iter_head, start);
1363         /* Amount of free space: some of this one + all after this one */
1364         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1365         capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1366
1367         return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1368 }
1369
1370 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1371                                           pgoff_t index, unsigned int nr_pages)
1372 {
1373         XA_STATE(xas, xa, index);
1374         struct page *page;
1375         unsigned int ret = 0;
1376
1377         rcu_read_lock();
1378         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1379                 if (xas_retry(&xas, page))
1380                         continue;
1381
1382                 /* Has the page moved or been split? */
1383                 if (unlikely(page != xas_reload(&xas))) {
1384                         xas_reset(&xas);
1385                         continue;
1386                 }
1387
1388                 pages[ret] = find_subpage(page, xas.xa_index);
1389                 get_page(pages[ret]);
1390                 if (++ret == nr_pages)
1391                         break;
1392         }
1393         rcu_read_unlock();
1394         return ret;
1395 }
1396
1397 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1398                                      struct page **pages, size_t maxsize,
1399                                      unsigned maxpages, size_t *_start_offset)
1400 {
1401         unsigned nr, offset;
1402         pgoff_t index, count;
1403         size_t size = maxsize, actual;
1404         loff_t pos;
1405
1406         if (!size || !maxpages)
1407                 return 0;
1408
1409         pos = i->xarray_start + i->iov_offset;
1410         index = pos >> PAGE_SHIFT;
1411         offset = pos & ~PAGE_MASK;
1412         *_start_offset = offset;
1413
1414         count = 1;
1415         if (size > PAGE_SIZE - offset) {
1416                 size -= PAGE_SIZE - offset;
1417                 count += size >> PAGE_SHIFT;
1418                 size &= ~PAGE_MASK;
1419                 if (size)
1420                         count++;
1421         }
1422
1423         if (count > maxpages)
1424                 count = maxpages;
1425
1426         nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1427         if (nr == 0)
1428                 return 0;
1429
1430         actual = PAGE_SIZE * nr;
1431         actual -= offset;
1432         if (nr == count && size > 0) {
1433                 unsigned last_offset = (nr > 1) ? 0 : offset;
1434                 actual -= PAGE_SIZE - (last_offset + size);
1435         }
1436         return actual;
1437 }
1438
1439 /* must only be called on a non-empty ITER_IOVEC iterator */
1440 static unsigned long first_iovec_segment(const struct iov_iter *i,
1441                                          size_t *size, size_t *start,
1442                                          size_t maxsize, unsigned maxpages)
1443 {
1444         size_t skip;
1445         long k;
1446
1447         for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1448                 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1449                 size_t len = i->iov[k].iov_len - skip;
1450
1451                 if (unlikely(!len))
1452                         continue;
1453                 if (len > maxsize)
1454                         len = maxsize;
1455                 len += (*start = addr % PAGE_SIZE);
1456                 if (len > maxpages * PAGE_SIZE)
1457                         len = maxpages * PAGE_SIZE;
1458                 *size = len;
1459                 return addr & PAGE_MASK;
1460         }
1461         BUG(); // if it had been empty, we wouldn't get called
1462 }
1463
1464 /* must only be called on a non-empty ITER_BVEC iterator */
1465 static struct page *first_bvec_segment(const struct iov_iter *i,
1466                                        size_t *size, size_t *start,
1467                                        size_t maxsize, unsigned maxpages)
1468 {
1469         struct page *page;
1470         size_t skip = i->iov_offset, len;
1471
1472         len = i->bvec->bv_len - skip;
1473         if (len > maxsize)
1474                 len = maxsize;
1475         skip += i->bvec->bv_offset;
1476         page = i->bvec->bv_page + skip / PAGE_SIZE;
1477         len += (*start = skip % PAGE_SIZE);
1478         if (len > maxpages * PAGE_SIZE)
1479                 len = maxpages * PAGE_SIZE;
1480         *size = len;
1481         return page;
1482 }
1483
1484 ssize_t iov_iter_get_pages(struct iov_iter *i,
1485                    struct page **pages, size_t maxsize, unsigned maxpages,
1486                    size_t *start)
1487 {
1488         size_t len;
1489         int n, res;
1490
1491         if (maxsize > i->count)
1492                 maxsize = i->count;
1493         if (!maxsize)
1494                 return 0;
1495
1496         if (likely(iter_is_iovec(i))) {
1497                 unsigned long addr;
1498
1499                 addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1500                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1501                 res = get_user_pages_fast(addr, n,
1502                                 iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1503                                 pages);
1504                 if (unlikely(res < 0))
1505                         return res;
1506                 return (res == n ? len : res * PAGE_SIZE) - *start;
1507         }
1508         if (iov_iter_is_bvec(i)) {
1509                 struct page *page;
1510
1511                 page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1512                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1513                 while (n--)
1514                         get_page(*pages++ = page++);
1515                 return len - *start;
1516         }
1517         if (iov_iter_is_pipe(i))
1518                 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1519         if (iov_iter_is_xarray(i))
1520                 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1521         return -EFAULT;
1522 }
1523 EXPORT_SYMBOL(iov_iter_get_pages);
1524
1525 static struct page **get_pages_array(size_t n)
1526 {
1527         return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1528 }
1529
1530 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1531                    struct page ***pages, size_t maxsize,
1532                    size_t *start)
1533 {
1534         struct page **p;
1535         unsigned int iter_head, npages;
1536         ssize_t n;
1537
1538         if (!sanity(i))
1539                 return -EFAULT;
1540
1541         data_start(i, &iter_head, start);
1542         /* Amount of free space: some of this one + all after this one */
1543         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1544         n = npages * PAGE_SIZE - *start;
1545         if (maxsize > n)
1546                 maxsize = n;
1547         else
1548                 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1549         p = get_pages_array(npages);
1550         if (!p)
1551                 return -ENOMEM;
1552         n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1553         if (n > 0)
1554                 *pages = p;
1555         else
1556                 kvfree(p);
1557         return n;
1558 }
1559
1560 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1561                                            struct page ***pages, size_t maxsize,
1562                                            size_t *_start_offset)
1563 {
1564         struct page **p;
1565         unsigned nr, offset;
1566         pgoff_t index, count;
1567         size_t size = maxsize, actual;
1568         loff_t pos;
1569
1570         if (!size)
1571                 return 0;
1572
1573         pos = i->xarray_start + i->iov_offset;
1574         index = pos >> PAGE_SHIFT;
1575         offset = pos & ~PAGE_MASK;
1576         *_start_offset = offset;
1577
1578         count = 1;
1579         if (size > PAGE_SIZE - offset) {
1580                 size -= PAGE_SIZE - offset;
1581                 count += size >> PAGE_SHIFT;
1582                 size &= ~PAGE_MASK;
1583                 if (size)
1584                         count++;
1585         }
1586
1587         p = get_pages_array(count);
1588         if (!p)
1589                 return -ENOMEM;
1590         *pages = p;
1591
1592         nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1593         if (nr == 0)
1594                 return 0;
1595
1596         actual = PAGE_SIZE * nr;
1597         actual -= offset;
1598         if (nr == count && size > 0) {
1599                 unsigned last_offset = (nr > 1) ? 0 : offset;
1600                 actual -= PAGE_SIZE - (last_offset + size);
1601         }
1602         return actual;
1603 }
1604
1605 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1606                    struct page ***pages, size_t maxsize,
1607                    size_t *start)
1608 {
1609         struct page **p;
1610         size_t len;
1611         int n, res;
1612
1613         if (maxsize > i->count)
1614                 maxsize = i->count;
1615         if (!maxsize)
1616                 return 0;
1617
1618         if (likely(iter_is_iovec(i))) {
1619                 unsigned long addr;
1620
1621                 addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1622                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1623                 p = get_pages_array(n);
1624                 if (!p)
1625                         return -ENOMEM;
1626                 res = get_user_pages_fast(addr, n,
1627                                 iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1628                 if (unlikely(res < 0)) {
1629                         kvfree(p);
1630                         return res;
1631                 }
1632                 *pages = p;
1633                 return (res == n ? len : res * PAGE_SIZE) - *start;
1634         }
1635         if (iov_iter_is_bvec(i)) {
1636                 struct page *page;
1637
1638                 page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1639                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1640                 *pages = p = get_pages_array(n);
1641                 if (!p)
1642                         return -ENOMEM;
1643                 while (n--)
1644                         get_page(*p++ = page++);
1645                 return len - *start;
1646         }
1647         if (iov_iter_is_pipe(i))
1648                 return pipe_get_pages_alloc(i, pages, maxsize, start);
1649         if (iov_iter_is_xarray(i))
1650                 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1651         return -EFAULT;
1652 }
1653 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1654
1655 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1656                                struct iov_iter *i)
1657 {
1658         __wsum sum, next;
1659         sum = *csum;
1660         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1661                 WARN_ON(1);
1662                 return 0;
1663         }
1664         iterate_and_advance(i, bytes, base, len, off, ({
1665                 next = csum_and_copy_from_user(base, addr + off, len);
1666                 if (next)
1667                         sum = csum_block_add(sum, next, off);
1668                 next ? 0 : len;
1669         }), ({
1670                 sum = csum_and_memcpy(addr + off, base, len, sum, off);
1671         })
1672         )
1673         *csum = sum;
1674         return bytes;
1675 }
1676 EXPORT_SYMBOL(csum_and_copy_from_iter);
1677
1678 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1679                              struct iov_iter *i)
1680 {
1681         struct csum_state *csstate = _csstate;
1682         __wsum sum, next;
1683
1684         if (unlikely(iov_iter_is_pipe(i)))
1685                 return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1686
1687         sum = csum_shift(csstate->csum, csstate->off);
1688         if (unlikely(iov_iter_is_discard(i))) {
1689                 WARN_ON(1);     /* for now */
1690                 return 0;
1691         }
1692         iterate_and_advance(i, bytes, base, len, off, ({
1693                 next = csum_and_copy_to_user(addr + off, base, len);
1694                 if (next)
1695                         sum = csum_block_add(sum, next, off);
1696                 next ? 0 : len;
1697         }), ({
1698                 sum = csum_and_memcpy(base, addr + off, len, sum, off);
1699         })
1700         )
1701         csstate->csum = csum_shift(sum, csstate->off);
1702         csstate->off += bytes;
1703         return bytes;
1704 }
1705 EXPORT_SYMBOL(csum_and_copy_to_iter);
1706
1707 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1708                 struct iov_iter *i)
1709 {
1710 #ifdef CONFIG_CRYPTO_HASH
1711         struct ahash_request *hash = hashp;
1712         struct scatterlist sg;
1713         size_t copied;
1714
1715         copied = copy_to_iter(addr, bytes, i);
1716         sg_init_one(&sg, addr, copied);
1717         ahash_request_set_crypt(hash, &sg, NULL, copied);
1718         crypto_ahash_update(hash);
1719         return copied;
1720 #else
1721         return 0;
1722 #endif
1723 }
1724 EXPORT_SYMBOL(hash_and_copy_to_iter);
1725
1726 static int iov_npages(const struct iov_iter *i, int maxpages)
1727 {
1728         size_t skip = i->iov_offset, size = i->count;
1729         const struct iovec *p;
1730         int npages = 0;
1731
1732         for (p = i->iov; size; skip = 0, p++) {
1733                 unsigned offs = offset_in_page(p->iov_base + skip);
1734                 size_t len = min(p->iov_len - skip, size);
1735
1736                 if (len) {
1737                         size -= len;
1738                         npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1739                         if (unlikely(npages > maxpages))
1740                                 return maxpages;
1741                 }
1742         }
1743         return npages;
1744 }
1745
1746 static int bvec_npages(const struct iov_iter *i, int maxpages)
1747 {
1748         size_t skip = i->iov_offset, size = i->count;
1749         const struct bio_vec *p;
1750         int npages = 0;
1751
1752         for (p = i->bvec; size; skip = 0, p++) {
1753                 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1754                 size_t len = min(p->bv_len - skip, size);
1755
1756                 size -= len;
1757                 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1758                 if (unlikely(npages > maxpages))
1759                         return maxpages;
1760         }
1761         return npages;
1762 }
1763
1764 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1765 {
1766         if (unlikely(!i->count))
1767                 return 0;
1768         /* iovec and kvec have identical layouts */
1769         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1770                 return iov_npages(i, maxpages);
1771         if (iov_iter_is_bvec(i))
1772                 return bvec_npages(i, maxpages);
1773         if (iov_iter_is_pipe(i)) {
1774                 unsigned int iter_head;
1775                 int npages;
1776                 size_t off;
1777
1778                 if (!sanity(i))
1779                         return 0;
1780
1781                 data_start(i, &iter_head, &off);
1782                 /* some of this one + all after this one */
1783                 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1784                 return min(npages, maxpages);
1785         }
1786         if (iov_iter_is_xarray(i)) {
1787                 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1788                 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1789                 return min(npages, maxpages);
1790         }
1791         return 0;
1792 }
1793 EXPORT_SYMBOL(iov_iter_npages);
1794
1795 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1796 {
1797         *new = *old;
1798         if (unlikely(iov_iter_is_pipe(new))) {
1799                 WARN_ON(1);
1800                 return NULL;
1801         }
1802         if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1803                 return NULL;
1804         if (iov_iter_is_bvec(new))
1805                 return new->bvec = kmemdup(new->bvec,
1806                                     new->nr_segs * sizeof(struct bio_vec),
1807                                     flags);
1808         else
1809                 /* iovec and kvec have identical layout */
1810                 return new->iov = kmemdup(new->iov,
1811                                    new->nr_segs * sizeof(struct iovec),
1812                                    flags);
1813 }
1814 EXPORT_SYMBOL(dup_iter);
1815
1816 static int copy_compat_iovec_from_user(struct iovec *iov,
1817                 const struct iovec __user *uvec, unsigned long nr_segs)
1818 {
1819         const struct compat_iovec __user *uiov =
1820                 (const struct compat_iovec __user *)uvec;
1821         int ret = -EFAULT, i;
1822
1823         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1824                 return -EFAULT;
1825
1826         for (i = 0; i < nr_segs; i++) {
1827                 compat_uptr_t buf;
1828                 compat_ssize_t len;
1829
1830                 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1831                 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1832
1833         /* check for compat_size_t not fitting in compat_ssize_t */
1834                 if (len < 0) {
1835                         ret = -EINVAL;
1836                         goto uaccess_end;
1837                 }
1838                 iov[i].iov_base = compat_ptr(buf);
1839                 iov[i].iov_len = len;
1840         }
1841
1842         ret = 0;
1843 uaccess_end:
1844         user_access_end();
1845         return ret;
1846 }
1847
1848 static int copy_iovec_from_user(struct iovec *iov,
1849                 const struct iovec __user *uvec, unsigned long nr_segs)
1850 {
1851         unsigned long seg;
1852
1853         if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1854                 return -EFAULT;
1855         for (seg = 0; seg < nr_segs; seg++) {
1856                 if ((ssize_t)iov[seg].iov_len < 0)
1857                         return -EINVAL;
1858         }
1859
1860         return 0;
1861 }
1862
1863 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1864                 unsigned long nr_segs, unsigned long fast_segs,
1865                 struct iovec *fast_iov, bool compat)
1866 {
1867         struct iovec *iov = fast_iov;
1868         int ret;
1869
1870         /*
1871          * SuS says "The readv() function *may* fail if the iovcnt argument was
1872          * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1873          * traditionally returned zero for zero segments, so...
1874          */
1875         if (nr_segs == 0)
1876                 return iov;
1877         if (nr_segs > UIO_MAXIOV)
1878                 return ERR_PTR(-EINVAL);
1879         if (nr_segs > fast_segs) {
1880                 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1881                 if (!iov)
1882                         return ERR_PTR(-ENOMEM);
1883         }
1884
1885         if (compat)
1886                 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1887         else
1888                 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1889         if (ret) {
1890                 if (iov != fast_iov)
1891                         kfree(iov);
1892                 return ERR_PTR(ret);
1893         }
1894
1895         return iov;
1896 }
1897
1898 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1899                  unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1900                  struct iov_iter *i, bool compat)
1901 {
1902         ssize_t total_len = 0;
1903         unsigned long seg;
1904         struct iovec *iov;
1905
1906         iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1907         if (IS_ERR(iov)) {
1908                 *iovp = NULL;
1909                 return PTR_ERR(iov);
1910         }
1911
1912         /*
1913          * According to the Single Unix Specification we should return EINVAL if
1914          * an element length is < 0 when cast to ssize_t or if the total length
1915          * would overflow the ssize_t return value of the system call.
1916          *
1917          * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1918          * overflow case.
1919          */
1920         for (seg = 0; seg < nr_segs; seg++) {
1921                 ssize_t len = (ssize_t)iov[seg].iov_len;
1922
1923                 if (!access_ok(iov[seg].iov_base, len)) {
1924                         if (iov != *iovp)
1925                                 kfree(iov);
1926                         *iovp = NULL;
1927                         return -EFAULT;
1928                 }
1929
1930                 if (len > MAX_RW_COUNT - total_len) {
1931                         len = MAX_RW_COUNT - total_len;
1932                         iov[seg].iov_len = len;
1933                 }
1934                 total_len += len;
1935         }
1936
1937         iov_iter_init(i, type, iov, nr_segs, total_len);
1938         if (iov == *iovp)
1939                 *iovp = NULL;
1940         else
1941                 *iovp = iov;
1942         return total_len;
1943 }
1944
1945 /**
1946  * import_iovec() - Copy an array of &struct iovec from userspace
1947  *     into the kernel, check that it is valid, and initialize a new
1948  *     &struct iov_iter iterator to access it.
1949  *
1950  * @type: One of %READ or %WRITE.
1951  * @uvec: Pointer to the userspace array.
1952  * @nr_segs: Number of elements in userspace array.
1953  * @fast_segs: Number of elements in *@iovp.
1954  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1955  *     on-stack) kernel array.
1956  * @i: Pointer to iterator that will be initialized on success.
1957  *
1958  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1959  * then this function places %NULL in *@iovp on return. Otherwise, a new
1960  * array will be allocated and the result placed in *@iovp. This means that
1961  * the caller may call kfree() on *@iovp regardless of whether the small
1962  * on-stack array was used or not (and regardless of whether this function
1963  * returns an error or not).
1964  *
1965  * Return: Negative error code on error, bytes imported on success
1966  */
1967 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1968                  unsigned nr_segs, unsigned fast_segs,
1969                  struct iovec **iovp, struct iov_iter *i)
1970 {
1971         return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1972                               in_compat_syscall());
1973 }
1974 EXPORT_SYMBOL(import_iovec);
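/*
 * Illustrative sketch, not part of the original file: a minimal caller of
 * import_iovec(), following the contract documented in the kerneldoc above.
 * The function name example_import_and_use() and the use of %READ are
 * assumptions made for the example.  The key point is that kfree() on the
 * returned pointer is safe no matter whether the on-stack fast array or a
 * heap allocation ended up being used.
 */
static ssize_t example_import_and_use(const struct iovec __user *uvec,
                                      unsigned long nr_segs)
{
        struct iovec iovstack[UIO_FASTIOV];     /* the "usually small on-stack" array */
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                return ret;     /* *iovp was set to NULL, nothing to free */

        /* ... hand &iter to the actual I/O path while iov stays live ... */

        kfree(iov);     /* NULL if iovstack sufficed, the heap array otherwise */
        return ret;
}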
1975
1976 int import_single_range(int rw, void __user *buf, size_t len,
1977                  struct iovec *iov, struct iov_iter *i)
1978 {
1979         if (len > MAX_RW_COUNT)
1980                 len = MAX_RW_COUNT;
1981         if (unlikely(!access_ok(buf, len)))
1982                 return -EFAULT;
1983
1984         iov->iov_base = buf;
1985         iov->iov_len = len;
1986         iov_iter_init(i, rw, iov, 1, len);
1987         return 0;
1988 }
1989 EXPORT_SYMBOL(import_single_range);
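/*
 * Illustrative sketch, not part of the original file: import_single_range()
 * is the single-buffer counterpart of import_iovec().  The iovec lives in
 * the caller's frame, so there is nothing to allocate and nothing to free;
 * the resulting iterator covers at most MAX_RW_COUNT bytes.  The function
 * name example_import_single() is an assumption made for the example.
 */
static ssize_t example_import_single(void __user *buf, size_t len)
{
        struct iovec iov;
        struct iov_iter iter;
        int ret;

        ret = import_single_range(WRITE, buf, len, &iov, &iter);
        if (unlikely(ret))
                return ret;

        /* ... hand &iter to the actual I/O path while iov stays live ... */

        return iov_iter_count(&iter);
}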