iov_iter: make the amount already copied available to iterator callbacks
lib/iov_iter.c (from the linux-2.6-microblaze.git tree)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
16
17 #define PIPE_PARANOIA /* for now */
18
19 /* covers iovec and kvec alike */
20 #define iterate_iovec(i, n, __v, __off, __p, skip, STEP) {      \
21         size_t __off = 0;                                       \
22         do {                                                    \
23                 __v.iov_len = min(n, __p->iov_len - skip);      \
24                 if (likely(__v.iov_len)) {                      \
25                         __v.iov_base = __p->iov_base + skip;    \
26                         __v.iov_len -= (STEP);                  \
27                         __off += __v.iov_len;                   \
28                         skip += __v.iov_len;                    \
29                         n -= __v.iov_len;                       \
30                         if (skip < __p->iov_len)                \
31                                 break;                          \
32                 }                                               \
33                 __p++;                                          \
34                 skip = 0;                                       \
35         } while (n);                                            \
36         n = __off;                                              \
37 }
38
39 #define iterate_bvec(i, n, __v, __off, p, skip, STEP) {         \
40         size_t __off = 0;                                       \
41         while (n) {                                             \
42                 unsigned offset = p->bv_offset + skip;          \
43                 unsigned left;                                  \
44                 void *kaddr = kmap_local_page(p->bv_page +      \
45                                         offset / PAGE_SIZE);    \
46                 __v.iov_base = kaddr + offset % PAGE_SIZE;      \
47                 __v.iov_len = min(min(n, p->bv_len - skip),     \
48                      (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
49                 left = (STEP);                                  \
50                 kunmap_local(kaddr);                            \
51                 __v.iov_len -= left;                            \
52                 __off += __v.iov_len;                           \
53                 skip += __v.iov_len;                            \
54                 if (skip == p->bv_len) {                        \
55                         skip = 0;                               \
56                         p++;                                    \
57                 }                                               \
58                 n -= __v.iov_len;                               \
59                 if (left)                                       \
60                         break;                                  \
61         }                                                       \
62         n = __off;                                              \
63 }
64
65 #define iterate_xarray(i, n, __v, __off, skip, STEP) {          \
66         __label__ __out;                                        \
67         size_t __off = 0;                                       \
68         struct page *head = NULL;                               \
69         size_t seg, offset;                                     \
70         loff_t start = i->xarray_start + skip;                  \
71         pgoff_t index = start >> PAGE_SHIFT;                    \
72         int j;                                                  \
73                                                                 \
74         XA_STATE(xas, i->xarray, index);                        \
75                                                                 \
76         rcu_read_lock();                                                \
77         xas_for_each(&xas, head, ULONG_MAX) {                           \
78                 unsigned left;                                          \
79                 if (xas_retry(&xas, head))                              \
80                         continue;                                       \
81                 if (WARN_ON(xa_is_value(head)))                         \
82                         break;                                          \
83                 if (WARN_ON(PageHuge(head)))                            \
84                         break;                                          \
85                 for (j = (head->index < index) ? index - head->index : 0; \
86                      j < thp_nr_pages(head); j++) {                     \
87                         void *kaddr = kmap_local_page(head + j);        \
88                         offset = (start + __off) % PAGE_SIZE;           \
89                         __v.iov_base = kaddr + offset;                  \
90                         seg = PAGE_SIZE - offset;                       \
91                         __v.iov_len = min(n, seg);                      \
92                         left = (STEP);                                  \
93                         kunmap_local(kaddr);                            \
94                         __v.iov_len -= left;                            \
95                         __off += __v.iov_len;                           \
96                         n -= __v.iov_len;                               \
97                         if (left || n == 0)                             \
98                                 goto __out;                             \
99                 }                                                       \
100         }                                                       \
101 __out:                                                          \
102         rcu_read_unlock();                                      \
103         skip += __off;                                          \
104         n = __off;                                              \
105 }
106
107 #define __iterate_and_advance(i, n, v, off, I, K) {             \
108         if (unlikely(i->count < n))                             \
109                 n = i->count;                                   \
110         if (likely(n)) {                                        \
111                 size_t skip = i->iov_offset;                    \
112                 if (likely(iter_is_iovec(i))) {                 \
113                         const struct iovec *iov = i->iov;       \
114                         struct iovec v;                         \
115                         iterate_iovec(i, n, v, off, iov, skip, (I))     \
116                         i->nr_segs -= iov - i->iov;             \
117                         i->iov = iov;                           \
118                 } else if (iov_iter_is_bvec(i)) {               \
119                         const struct bio_vec *bvec = i->bvec;   \
120                         struct kvec v;                          \
121                         iterate_bvec(i, n, v, off, bvec, skip, (K))     \
122                         i->nr_segs -= bvec - i->bvec;           \
123                         i->bvec = bvec;                         \
124                 } else if (iov_iter_is_kvec(i)) {               \
125                         const struct kvec *kvec = i->kvec;      \
126                         struct kvec v;                          \
127                         iterate_iovec(i, n, v, off, kvec, skip, (K))    \
128                         i->nr_segs -= kvec - i->kvec;           \
129                         i->kvec = kvec;                         \
130                 } else if (iov_iter_is_xarray(i)) {             \
131                         struct kvec v;                          \
132                         iterate_xarray(i, n, v, off, skip, (K)) \
133                 }                                               \
134                 i->count -= n;                                  \
135                 i->iov_offset = skip;                           \
136         }                                                       \
137 }
138 #define iterate_and_advance(i, n, v, off, I, K) \
139         __iterate_and_advance(i, n, v, off, I, ((void)(K),0))
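/*
 * Illustrative sketch (not part of the original file): as the patch title
 * says, the macros above now hand each STEP expression the current segment
 * as 'v' and the amount already copied in this call as 'off', and expect it
 * to return how many bytes it failed to handle.  A hypothetical helper that
 * gathers an iterator into a flat kernel buffer could therefore look like
 * the following; the real in-tree users are the _copy_*_iter() routines
 * further down, 'gather_to_kbuf' is an invented name, and real callers
 * reject ITER_PIPE iterators before getting here.
 */
static __maybe_unused size_t gather_to_kbuf(void *buf, size_t bytes,
                                            struct iov_iter *i)
{
        iterate_and_advance(i, bytes, v, off,
                /* user-backed segment: 'off' bytes already landed in buf */
                copy_from_user(buf + off, v.iov_base, v.iov_len),
                /* kernel-backed segment (kvec, bvec or xarray) */
                memcpy(buf + off, v.iov_base, v.iov_len)
        )
        return bytes;   /* the macro trimmed 'bytes' to what was copied */
}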
140
141 static int copyout(void __user *to, const void *from, size_t n)
142 {
143         if (should_fail_usercopy())
144                 return n;
145         if (access_ok(to, n)) {
146                 instrument_copy_to_user(to, from, n);
147                 n = raw_copy_to_user(to, from, n);
148         }
149         return n;
150 }
151
152 static int copyin(void *to, const void __user *from, size_t n)
153 {
154         if (should_fail_usercopy())
155                 return n;
156         if (access_ok(from, n)) {
157                 instrument_copy_from_user(to, from, n);
158                 n = raw_copy_from_user(to, from, n);
159         }
160         return n;
161 }
162
163 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
164                          struct iov_iter *i)
165 {
166         size_t skip, copy, left, wanted;
167         const struct iovec *iov;
168         char __user *buf;
169         void *kaddr, *from;
170
171         if (unlikely(bytes > i->count))
172                 bytes = i->count;
173
174         if (unlikely(!bytes))
175                 return 0;
176
177         might_fault();
178         wanted = bytes;
179         iov = i->iov;
180         skip = i->iov_offset;
181         buf = iov->iov_base + skip;
182         copy = min(bytes, iov->iov_len - skip);
183
184         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
185                 kaddr = kmap_atomic(page);
186                 from = kaddr + offset;
187
188                 /* first chunk, usually the only one */
189                 left = copyout(buf, from, copy);
190                 copy -= left;
191                 skip += copy;
192                 from += copy;
193                 bytes -= copy;
194
195                 while (unlikely(!left && bytes)) {
196                         iov++;
197                         buf = iov->iov_base;
198                         copy = min(bytes, iov->iov_len);
199                         left = copyout(buf, from, copy);
200                         copy -= left;
201                         skip = copy;
202                         from += copy;
203                         bytes -= copy;
204                 }
205                 if (likely(!bytes)) {
206                         kunmap_atomic(kaddr);
207                         goto done;
208                 }
209                 offset = from - kaddr;
210                 buf += copy;
211                 kunmap_atomic(kaddr);
212                 copy = min(bytes, iov->iov_len - skip);
213         }
214         /* Too bad - revert to non-atomic kmap */
215
216         kaddr = kmap(page);
217         from = kaddr + offset;
218         left = copyout(buf, from, copy);
219         copy -= left;
220         skip += copy;
221         from += copy;
222         bytes -= copy;
223         while (unlikely(!left && bytes)) {
224                 iov++;
225                 buf = iov->iov_base;
226                 copy = min(bytes, iov->iov_len);
227                 left = copyout(buf, from, copy);
228                 copy -= left;
229                 skip = copy;
230                 from += copy;
231                 bytes -= copy;
232         }
233         kunmap(page);
234
235 done:
236         if (skip == iov->iov_len) {
237                 iov++;
238                 skip = 0;
239         }
240         i->count -= wanted - bytes;
241         i->nr_segs -= iov - i->iov;
242         i->iov = iov;
243         i->iov_offset = skip;
244         return wanted - bytes;
245 }
246
247 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
248                          struct iov_iter *i)
249 {
250         size_t skip, copy, left, wanted;
251         const struct iovec *iov;
252         char __user *buf;
253         void *kaddr, *to;
254
255         if (unlikely(bytes > i->count))
256                 bytes = i->count;
257
258         if (unlikely(!bytes))
259                 return 0;
260
261         might_fault();
262         wanted = bytes;
263         iov = i->iov;
264         skip = i->iov_offset;
265         buf = iov->iov_base + skip;
266         copy = min(bytes, iov->iov_len - skip);
267
268         if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
269                 kaddr = kmap_atomic(page);
270                 to = kaddr + offset;
271
272                 /* first chunk, usually the only one */
273                 left = copyin(to, buf, copy);
274                 copy -= left;
275                 skip += copy;
276                 to += copy;
277                 bytes -= copy;
278
279                 while (unlikely(!left && bytes)) {
280                         iov++;
281                         buf = iov->iov_base;
282                         copy = min(bytes, iov->iov_len);
283                         left = copyin(to, buf, copy);
284                         copy -= left;
285                         skip = copy;
286                         to += copy;
287                         bytes -= copy;
288                 }
289                 if (likely(!bytes)) {
290                         kunmap_atomic(kaddr);
291                         goto done;
292                 }
293                 offset = to - kaddr;
294                 buf += copy;
295                 kunmap_atomic(kaddr);
296                 copy = min(bytes, iov->iov_len - skip);
297         }
298         /* Too bad - revert to non-atomic kmap */
299
300         kaddr = kmap(page);
301         to = kaddr + offset;
302         left = copyin(to, buf, copy);
303         copy -= left;
304         skip += copy;
305         to += copy;
306         bytes -= copy;
307         while (unlikely(!left && bytes)) {
308                 iov++;
309                 buf = iov->iov_base;
310                 copy = min(bytes, iov->iov_len);
311                 left = copyin(to, buf, copy);
312                 copy -= left;
313                 skip = copy;
314                 to += copy;
315                 bytes -= copy;
316         }
317         kunmap(page);
318
319 done:
320         if (skip == iov->iov_len) {
321                 iov++;
322                 skip = 0;
323         }
324         i->count -= wanted - bytes;
325         i->nr_segs -= iov - i->iov;
326         i->iov = iov;
327         i->iov_offset = skip;
328         return wanted - bytes;
329 }
330
331 #ifdef PIPE_PARANOIA
332 static bool sanity(const struct iov_iter *i)
333 {
334         struct pipe_inode_info *pipe = i->pipe;
335         unsigned int p_head = pipe->head;
336         unsigned int p_tail = pipe->tail;
337         unsigned int p_mask = pipe->ring_size - 1;
338         unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
339         unsigned int i_head = i->head;
340         unsigned int idx;
341
342         if (i->iov_offset) {
343                 struct pipe_buffer *p;
344                 if (unlikely(p_occupancy == 0))
345                         goto Bad;       // pipe must be non-empty
346                 if (unlikely(i_head != p_head - 1))
347                         goto Bad;       // must be at the last buffer...
348
349                 p = &pipe->bufs[i_head & p_mask];
350                 if (unlikely(p->offset + p->len != i->iov_offset))
351                         goto Bad;       // ... at the end of segment
352         } else {
353                 if (i_head != p_head)
354                         goto Bad;       // must be right after the last buffer
355         }
356         return true;
357 Bad:
358         printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
359         printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
360                         p_head, p_tail, pipe->ring_size);
361         for (idx = 0; idx < pipe->ring_size; idx++)
362                 printk(KERN_ERR "[%p %p %d %d]\n",
363                         pipe->bufs[idx].ops,
364                         pipe->bufs[idx].page,
365                         pipe->bufs[idx].offset,
366                         pipe->bufs[idx].len);
367         WARN_ON(1);
368         return false;
369 }
370 #else
371 #define sanity(i) true
372 #endif
373
374 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
375                          struct iov_iter *i)
376 {
377         struct pipe_inode_info *pipe = i->pipe;
378         struct pipe_buffer *buf;
379         unsigned int p_tail = pipe->tail;
380         unsigned int p_mask = pipe->ring_size - 1;
381         unsigned int i_head = i->head;
382         size_t off;
383
384         if (unlikely(bytes > i->count))
385                 bytes = i->count;
386
387         if (unlikely(!bytes))
388                 return 0;
389
390         if (!sanity(i))
391                 return 0;
392
393         off = i->iov_offset;
394         buf = &pipe->bufs[i_head & p_mask];
395         if (off) {
396                 if (offset == off && buf->page == page) {
397                         /* merge with the last one */
398                         buf->len += bytes;
399                         i->iov_offset += bytes;
400                         goto out;
401                 }
402                 i_head++;
403                 buf = &pipe->bufs[i_head & p_mask];
404         }
405         if (pipe_full(i_head, p_tail, pipe->max_usage))
406                 return 0;
407
408         buf->ops = &page_cache_pipe_buf_ops;
409         get_page(page);
410         buf->page = page;
411         buf->offset = offset;
412         buf->len = bytes;
413
414         pipe->head = i_head + 1;
415         i->iov_offset = offset + bytes;
416         i->head = i_head;
417 out:
418         i->count -= bytes;
419         return bytes;
420 }
421
422 /*
423  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
424  * bytes.  For each iovec, fault in each page that constitutes the iovec.
425  *
426  * Return 0 on success, or non-zero if the memory could not be accessed (e.g.
427  * because it is an invalid address).
428  */
429 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
430 {
431         if (iter_is_iovec(i)) {
432                 const struct iovec *p;
433                 size_t skip;
434
435                 if (bytes > i->count)
436                         bytes = i->count;
437                 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
438                         size_t len = min(bytes, p->iov_len - skip);
439                         int err;
440
441                         if (unlikely(!len))
442                                 continue;
443                         err = fault_in_pages_readable(p->iov_base + skip, len);
444                         if (unlikely(err))
445                                 return err;
446                         bytes -= len;
447                 }
448         }
449         return 0;
450 }
451 EXPORT_SYMBOL(iov_iter_fault_in_readable);
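/*
 * Illustrative sketch (hypothetical, not from this file): a typical user of
 * the helper above pre-faults the user buffer before copying with page
 * faults disabled (e.g. under the page lock), and lets the caller retry if
 * the atomic copy still comes up short.  'example_fill_page' and its
 * arguments are stand-ins for a real write path.
 */
static __maybe_unused ssize_t example_fill_page(struct page *page, loff_t pos,
                                                size_t bytes, struct iov_iter *from)
{
        if (iov_iter_fault_in_readable(from, bytes))
                return -EFAULT;         /* user memory is not accessible */

        /* page faults are assumed to be disabled around the copy */
        return copy_page_from_iter_atomic(page, offset_in_page(pos), bytes, from);
        /* a short return here means: unlock, re-fault and try again */
}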
452
453 void iov_iter_init(struct iov_iter *i, unsigned int direction,
454                         const struct iovec *iov, unsigned long nr_segs,
455                         size_t count)
456 {
457         WARN_ON(direction & ~(READ | WRITE));
458         WARN_ON_ONCE(uaccess_kernel());
459         *i = (struct iov_iter) {
460                 .iter_type = ITER_IOVEC,
461                 .data_source = direction,
462                 .iov = iov,
463                 .nr_segs = nr_segs,
464                 .iov_offset = 0,
465                 .count = count
466         };
467 }
468 EXPORT_SYMBOL(iov_iter_init);
469
470 static inline bool allocated(struct pipe_buffer *buf)
471 {
472         return buf->ops == &default_pipe_buf_ops;
473 }
474
475 static inline void data_start(const struct iov_iter *i,
476                               unsigned int *iter_headp, size_t *offp)
477 {
478         unsigned int p_mask = i->pipe->ring_size - 1;
479         unsigned int iter_head = i->head;
480         size_t off = i->iov_offset;
481
482         if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
483                     off == PAGE_SIZE)) {
484                 iter_head++;
485                 off = 0;
486         }
487         *iter_headp = iter_head;
488         *offp = off;
489 }
490
491 static size_t push_pipe(struct iov_iter *i, size_t size,
492                         int *iter_headp, size_t *offp)
493 {
494         struct pipe_inode_info *pipe = i->pipe;
495         unsigned int p_tail = pipe->tail;
496         unsigned int p_mask = pipe->ring_size - 1;
497         unsigned int iter_head;
498         size_t off;
499         ssize_t left;
500
501         if (unlikely(size > i->count))
502                 size = i->count;
503         if (unlikely(!size))
504                 return 0;
505
506         left = size;
507         data_start(i, &iter_head, &off);
508         *iter_headp = iter_head;
509         *offp = off;
510         if (off) {
511                 left -= PAGE_SIZE - off;
512                 if (left <= 0) {
513                         pipe->bufs[iter_head & p_mask].len += size;
514                         return size;
515                 }
516                 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
517                 iter_head++;
518         }
519         while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
520                 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
521                 struct page *page = alloc_page(GFP_USER);
522                 if (!page)
523                         break;
524
525                 buf->ops = &default_pipe_buf_ops;
526                 buf->page = page;
527                 buf->offset = 0;
528                 buf->len = min_t(ssize_t, left, PAGE_SIZE);
529                 left -= buf->len;
530                 iter_head++;
531                 pipe->head = iter_head;
532
533                 if (left == 0)
534                         return size;
535         }
536         return size - left;
537 }
538
539 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
540                                 struct iov_iter *i)
541 {
542         struct pipe_inode_info *pipe = i->pipe;
543         unsigned int p_mask = pipe->ring_size - 1;
544         unsigned int i_head;
545         size_t n, off;
546
547         if (!sanity(i))
548                 return 0;
549
550         bytes = n = push_pipe(i, bytes, &i_head, &off);
551         if (unlikely(!n))
552                 return 0;
553         do {
554                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
555                 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
556                 i->head = i_head;
557                 i->iov_offset = off + chunk;
558                 n -= chunk;
559                 addr += chunk;
560                 off = 0;
561                 i_head++;
562         } while (n);
563         i->count -= bytes;
564         return bytes;
565 }
566
567 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
568                               __wsum sum, size_t off)
569 {
570         __wsum next = csum_partial_copy_nocheck(from, to, len);
571         return csum_block_add(sum, next, off);
572 }
573
574 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
575                                          struct csum_state *csstate,
576                                          struct iov_iter *i)
577 {
578         struct pipe_inode_info *pipe = i->pipe;
579         unsigned int p_mask = pipe->ring_size - 1;
580         __wsum sum = csstate->csum;
581         size_t off = csstate->off;
582         unsigned int i_head;
583         size_t n, r;
584
585         if (!sanity(i))
586                 return 0;
587
588         bytes = n = push_pipe(i, bytes, &i_head, &r);
589         if (unlikely(!n))
590                 return 0;
591         do {
592                 size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
593                 char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
594                 sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
595                 kunmap_atomic(p);
596                 i->head = i_head;
597                 i->iov_offset = r + chunk;
598                 n -= chunk;
599                 off += chunk;
600                 addr += chunk;
601                 r = 0;
602                 i_head++;
603         } while (n);
604         i->count -= bytes;
605         csstate->csum = sum;
606         csstate->off = off;
607         return bytes;
608 }
609
610 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
611 {
612         if (unlikely(iov_iter_is_pipe(i)))
613                 return copy_pipe_to_iter(addr, bytes, i);
614         if (iter_is_iovec(i))
615                 might_fault();
616         iterate_and_advance(i, bytes, v, off,
617                 copyout(v.iov_base, addr + off, v.iov_len),
618                 memcpy(v.iov_base, addr + off, v.iov_len)
619         )
620
621         return bytes;
622 }
623 EXPORT_SYMBOL(_copy_to_iter);
624
625 #ifdef CONFIG_ARCH_HAS_COPY_MC
626 static int copyout_mc(void __user *to, const void *from, size_t n)
627 {
628         if (access_ok(to, n)) {
629                 instrument_copy_to_user(to, from, n);
630                 n = copy_mc_to_user((__force void *) to, from, n);
631         }
632         return n;
633 }
634
635 static unsigned long copy_mc_to_page(struct page *page, size_t offset,
636                 const char *from, size_t len)
637 {
638         unsigned long ret;
639         char *to;
640
641         to = kmap_atomic(page);
642         ret = copy_mc_to_kernel(to + offset, from, len);
643         kunmap_atomic(to);
644
645         return ret;
646 }
647
648 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
649                                 struct iov_iter *i)
650 {
651         struct pipe_inode_info *pipe = i->pipe;
652         unsigned int p_mask = pipe->ring_size - 1;
653         unsigned int i_head;
654         size_t n, off, xfer = 0;
655
656         if (!sanity(i))
657                 return 0;
658
659         bytes = n = push_pipe(i, bytes, &i_head, &off);
660         if (unlikely(!n))
661                 return 0;
662         do {
663                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
664                 unsigned long rem;
665
666                 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
667                                             off, addr, chunk);
668                 i->head = i_head;
669                 i->iov_offset = off + chunk - rem;
670                 xfer += chunk - rem;
671                 if (rem)
672                         break;
673                 n -= chunk;
674                 addr += chunk;
675                 off = 0;
676                 i_head++;
677         } while (n);
678         i->count -= xfer;
679         return xfer;
680 }
681
682 /**
683  * _copy_mc_to_iter - copy to iter with source memory error exception handling
684  * @addr: source kernel address
685  * @bytes: total transfer length
686  * @i: destination iterator
687  *
688  * The pmem driver deploys this for the dax operation
689  * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
690  * block layer). Upon #MC, read(2) aborts and returns EIO or the number
691  * of bytes successfully copied.
692  *
693  * The main differences between this and typical _copy_to_iter().
694  *
695  * * Typical tail/residue handling after a fault retries the copy
696  *   byte-by-byte until the fault happens again. Re-triggering machine
697  *   checks is potentially fatal so the implementation uses source
698  *   alignment and poison alignment assumptions to avoid re-triggering
699  *   hardware exceptions.
700  *
701  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
702  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
703  *   a short copy.
704  */
705 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
706 {
707         if (unlikely(iov_iter_is_pipe(i)))
708                 return copy_mc_pipe_to_iter(addr, bytes, i);
709         if (iter_is_iovec(i))
710                 might_fault();
711         __iterate_and_advance(i, bytes, v, off,
712                 copyout_mc(v.iov_base, addr + off, v.iov_len),
713                 copy_mc_to_kernel(v.iov_base, addr + off, v.iov_len)
714         )
715
716         return bytes;
717 }
718 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
719 #endif /* CONFIG_ARCH_HAS_COPY_MC */
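/*
 * Illustrative sketch (hypothetical): as the kernel-doc above explains,
 * _copy_mc_to_iter() may return a short count when poisoned memory is hit,
 * so a read-side caller checks the return value and turns a completely
 * failed copy into EIO.  Assumes CONFIG_ARCH_HAS_COPY_MC; 'src' and 'len'
 * are stand-ins for a dax/pmem source.
 */
static __maybe_unused ssize_t example_mc_read(const void *src, size_t len,
                                              struct iov_iter *to)
{
        size_t copied = _copy_mc_to_iter(src, len, to);

        if (!copied && len)
                return -EIO;            /* nothing could be read */
        return copied;                  /* possibly short: stop at the poison */
}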
720
721 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
722 {
723         if (unlikely(iov_iter_is_pipe(i))) {
724                 WARN_ON(1);
725                 return 0;
726         }
727         if (iter_is_iovec(i))
728                 might_fault();
729         iterate_and_advance(i, bytes, v, off,
730                 copyin(addr + off, v.iov_base, v.iov_len),
731                 memcpy(addr + off, v.iov_base, v.iov_len)
732         )
733
734         return bytes;
735 }
736 EXPORT_SYMBOL(_copy_from_iter);
737
738 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
739 {
740         if (unlikely(iov_iter_is_pipe(i))) {
741                 WARN_ON(1);
742                 return 0;
743         }
744         iterate_and_advance(i, bytes, v, off,
745                 __copy_from_user_inatomic_nocache(addr + off,
746                                          v.iov_base, v.iov_len),
747                 memcpy(addr + off, v.iov_base, v.iov_len)
748         )
749
750         return bytes;
751 }
752 EXPORT_SYMBOL(_copy_from_iter_nocache);
753
754 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
755 /**
756  * _copy_from_iter_flushcache - write destination through cpu cache
757  * @addr: destination kernel address
758  * @bytes: total transfer length
759  * @i: source iterator
760  *
761  * The pmem driver arranges for filesystem-dax to use this facility via
762  * dax_copy_from_iter() for ensuring that writes to persistent memory
763  * are flushed through the CPU cache. It is differentiated from
764  * _copy_from_iter_nocache() in that it guarantees all data is flushed for
765  * all iterator types. _copy_from_iter_nocache() only attempts to
766  * bypass the cache for the ITER_IOVEC case, and on some archs may use
767  * instructions that strand dirty-data in the cache.
768  */
769 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
770 {
771         if (unlikely(iov_iter_is_pipe(i))) {
772                 WARN_ON(1);
773                 return 0;
774         }
775         iterate_and_advance(i, bytes, v, off,
776                 __copy_from_user_flushcache(addr + off, v.iov_base, v.iov_len),
777                 memcpy_flushcache(addr + off, v.iov_base, v.iov_len)
778         )
779
780         return bytes;
781 }
782 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
783 #endif
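/*
 * Illustrative sketch (hypothetical): a persistent-memory write path would
 * use the flushcache variant so that data copied in from the iterator is
 * not left dirty in the CPU cache.  Assumes
 * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE; 'dst' is a stand-in for a mapped
 * pmem buffer.
 */
static __maybe_unused size_t example_pmem_write(void *dst, size_t len,
                                                struct iov_iter *from)
{
        /* returns how many bytes were copied (and written through the cache) */
        return _copy_from_iter_flushcache(dst, len, from);
}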
784
785 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
786 {
787         struct page *head;
788         size_t v = n + offset;
789
790         /*
791          * The general case needs to access the page order in order
792          * to compute the page size.
793          * However, we mostly deal with order-0 pages, so we can avoid
794          * a possible cache line miss for requests that fit within a
795          * single page regardless of its order.
796          */
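        /*
         * Worked example (added for clarity): on a 4KiB page, offset == 512
         * and n == 1024 give v == 1536, so the check below passes without
         * touching the compound head; offset == 3072 and n == 2048 give
         * v == 5120 > PAGE_SIZE and fall through to the compound_head()
         * path.
         */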
797         if (n <= v && v <= PAGE_SIZE)
798                 return true;
799
800         head = compound_head(page);
801         v += (page - head) << PAGE_SHIFT;
802
803         if (likely(n <= v && v <= (page_size(head))))
804                 return true;
805         WARN_ON(1);
806         return false;
807 }
808
809 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
810                          struct iov_iter *i)
811 {
812         if (likely(iter_is_iovec(i)))
813                 return copy_page_to_iter_iovec(page, offset, bytes, i);
814         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
815                 void *kaddr = kmap_atomic(page);
816                 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
817                 kunmap_atomic(kaddr);
818                 return wanted;
819         }
820         if (iov_iter_is_pipe(i))
821                 return copy_page_to_iter_pipe(page, offset, bytes, i);
822         if (unlikely(iov_iter_is_discard(i))) {
823                 if (unlikely(i->count < bytes))
824                         bytes = i->count;
825                 i->count -= bytes;
826                 return bytes;
827         }
828         WARN_ON(1);
829         return 0;
830 }
831
832 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
833                          struct iov_iter *i)
834 {
835         size_t res = 0;
836         if (unlikely(!page_copy_sane(page, offset, bytes)))
837                 return 0;
838         page += offset / PAGE_SIZE; // first subpage
839         offset %= PAGE_SIZE;
840         while (1) {
841                 size_t n = __copy_page_to_iter(page, offset,
842                                 min(bytes, (size_t)PAGE_SIZE - offset), i);
843                 res += n;
844                 bytes -= n;
845                 if (!bytes || !n)
846                         break;
847                 offset += n;
848                 if (offset == PAGE_SIZE) {
849                         page++;
850                         offset = 0;
851                 }
852         }
853         return res;
854 }
855 EXPORT_SYMBOL(copy_page_to_iter);
856
857 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
858                          struct iov_iter *i)
859 {
860         if (unlikely(!page_copy_sane(page, offset, bytes)))
861                 return 0;
862         if (likely(iter_is_iovec(i)))
863                 return copy_page_from_iter_iovec(page, offset, bytes, i);
864         if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
865                 void *kaddr = kmap_atomic(page);
866                 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
867                 kunmap_atomic(kaddr);
868                 return wanted;
869         }
870         WARN_ON(1);
871         return 0;
872 }
873 EXPORT_SYMBOL(copy_page_from_iter);
874
875 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
876 {
877         struct pipe_inode_info *pipe = i->pipe;
878         unsigned int p_mask = pipe->ring_size - 1;
879         unsigned int i_head;
880         size_t n, off;
881
882         if (!sanity(i))
883                 return 0;
884
885         bytes = n = push_pipe(i, bytes, &i_head, &off);
886         if (unlikely(!n))
887                 return 0;
888
889         do {
890                 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
891                 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
892                 i->head = i_head;
893                 i->iov_offset = off + chunk;
894                 n -= chunk;
895                 off = 0;
896                 i_head++;
897         } while (n);
898         i->count -= bytes;
899         return bytes;
900 }
901
902 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
903 {
904         if (unlikely(iov_iter_is_pipe(i)))
905                 return pipe_zero(bytes, i);
906         iterate_and_advance(i, bytes, v, count,
907                 clear_user(v.iov_base, v.iov_len),
908                 memset(v.iov_base, 0, v.iov_len)
909         )
910
911         return bytes;
912 }
913 EXPORT_SYMBOL(iov_iter_zero);
914
915 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
916                                   struct iov_iter *i)
917 {
918         char *kaddr = kmap_atomic(page), *p = kaddr + offset;
919         if (unlikely(!page_copy_sane(page, offset, bytes))) {
920                 kunmap_atomic(kaddr);
921                 return 0;
922         }
923         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
924                 kunmap_atomic(kaddr);
925                 WARN_ON(1);
926                 return 0;
927         }
928         iterate_and_advance(i, bytes, v, off,
929                 copyin(p + off, v.iov_base, v.iov_len),
930                 memcpy(p + off, v.iov_base, v.iov_len)
931         )
932         kunmap_atomic(kaddr);
933         return bytes;
934 }
935 EXPORT_SYMBOL(copy_page_from_iter_atomic);
936
937 static inline void pipe_truncate(struct iov_iter *i)
938 {
939         struct pipe_inode_info *pipe = i->pipe;
940         unsigned int p_tail = pipe->tail;
941         unsigned int p_head = pipe->head;
942         unsigned int p_mask = pipe->ring_size - 1;
943
944         if (!pipe_empty(p_head, p_tail)) {
945                 struct pipe_buffer *buf;
946                 unsigned int i_head = i->head;
947                 size_t off = i->iov_offset;
948
949                 if (off) {
950                         buf = &pipe->bufs[i_head & p_mask];
951                         buf->len = off - buf->offset;
952                         i_head++;
953                 }
954                 while (p_head != i_head) {
955                         p_head--;
956                         pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
957                 }
958
959                 pipe->head = p_head;
960         }
961 }
962
963 static void pipe_advance(struct iov_iter *i, size_t size)
964 {
965         struct pipe_inode_info *pipe = i->pipe;
966         if (size) {
967                 struct pipe_buffer *buf;
968                 unsigned int p_mask = pipe->ring_size - 1;
969                 unsigned int i_head = i->head;
970                 size_t off = i->iov_offset, left = size;
971
972                 if (off) /* make it relative to the beginning of buffer */
973                         left += off - pipe->bufs[i_head & p_mask].offset;
974                 while (1) {
975                         buf = &pipe->bufs[i_head & p_mask];
976                         if (left <= buf->len)
977                                 break;
978                         left -= buf->len;
979                         i_head++;
980                 }
981                 i->head = i_head;
982                 i->iov_offset = buf->offset + left;
983         }
984         i->count -= size;
985         /* ... and discard everything past that point */
986         pipe_truncate(i);
987 }
988
989 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
990 {
991         struct bvec_iter bi;
992
993         bi.bi_size = i->count;
994         bi.bi_bvec_done = i->iov_offset;
995         bi.bi_idx = 0;
996         bvec_iter_advance(i->bvec, &bi, size);
997
998         i->bvec += bi.bi_idx;
999         i->nr_segs -= bi.bi_idx;
1000         i->count = bi.bi_size;
1001         i->iov_offset = bi.bi_bvec_done;
1002 }
1003
1004 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1005 {
1006         const struct iovec *iov, *end;
1007
1008         if (!i->count)
1009                 return;
1010         i->count -= size;
1011
1012         size += i->iov_offset; // from beginning of current segment
1013         for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1014                 if (likely(size < iov->iov_len))
1015                         break;
1016                 size -= iov->iov_len;
1017         }
1018         i->iov_offset = size;
1019         i->nr_segs -= iov - i->iov;
1020         i->iov = iov;
1021 }
1022
1023 void iov_iter_advance(struct iov_iter *i, size_t size)
1024 {
1025         if (unlikely(i->count < size))
1026                 size = i->count;
1027         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1028                 /* iovec and kvec have identical layouts */
1029                 iov_iter_iovec_advance(i, size);
1030         } else if (iov_iter_is_bvec(i)) {
1031                 iov_iter_bvec_advance(i, size);
1032         } else if (iov_iter_is_pipe(i)) {
1033                 pipe_advance(i, size);
1034         } else if (unlikely(iov_iter_is_xarray(i))) {
1035                 i->iov_offset += size;
1036                 i->count -= size;
1037         } else if (iov_iter_is_discard(i)) {
1038                 i->count -= size;
1039         }
1040 }
1041 EXPORT_SYMBOL(iov_iter_advance);
1042
1043 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1044 {
1045         if (!unroll)
1046                 return;
1047         if (WARN_ON(unroll > MAX_RW_COUNT))
1048                 return;
1049         i->count += unroll;
1050         if (unlikely(iov_iter_is_pipe(i))) {
1051                 struct pipe_inode_info *pipe = i->pipe;
1052                 unsigned int p_mask = pipe->ring_size - 1;
1053                 unsigned int i_head = i->head;
1054                 size_t off = i->iov_offset;
1055                 while (1) {
1056                         struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1057                         size_t n = off - b->offset;
1058                         if (unroll < n) {
1059                                 off -= unroll;
1060                                 break;
1061                         }
1062                         unroll -= n;
1063                         if (!unroll && i_head == i->start_head) {
1064                                 off = 0;
1065                                 break;
1066                         }
1067                         i_head--;
1068                         b = &pipe->bufs[i_head & p_mask];
1069                         off = b->offset + b->len;
1070                 }
1071                 i->iov_offset = off;
1072                 i->head = i_head;
1073                 pipe_truncate(i);
1074                 return;
1075         }
1076         if (unlikely(iov_iter_is_discard(i)))
1077                 return;
1078         if (unroll <= i->iov_offset) {
1079                 i->iov_offset -= unroll;
1080                 return;
1081         }
1082         unroll -= i->iov_offset;
1083         if (iov_iter_is_xarray(i)) {
1084                 BUG(); /* We should never go beyond the start of the specified
1085                         * range since we might then be straying into pages that
1086                         * aren't pinned.
1087                         */
1088         } else if (iov_iter_is_bvec(i)) {
1089                 const struct bio_vec *bvec = i->bvec;
1090                 while (1) {
1091                         size_t n = (--bvec)->bv_len;
1092                         i->nr_segs++;
1093                         if (unroll <= n) {
1094                                 i->bvec = bvec;
1095                                 i->iov_offset = n - unroll;
1096                                 return;
1097                         }
1098                         unroll -= n;
1099                 }
1100         } else { /* same logics for iovec and kvec */
1101                 const struct iovec *iov = i->iov;
1102                 while (1) {
1103                         size_t n = (--iov)->iov_len;
1104                         i->nr_segs++;
1105                         if (unroll <= n) {
1106                                 i->iov = iov;
1107                                 i->iov_offset = n - unroll;
1108                                 return;
1109                         }
1110                         unroll -= n;
1111                 }
1112         }
1113 }
1114 EXPORT_SYMBOL(iov_iter_revert);
1115
1116 /*
1117  * Return the count of just the current iov_iter segment.
1118  */
1119 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1120 {
1121         if (i->nr_segs > 1) {
1122                 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1123                         return min(i->count, i->iov->iov_len - i->iov_offset);
1124                 if (iov_iter_is_bvec(i))
1125                         return min(i->count, i->bvec->bv_len - i->iov_offset);
1126         }
1127         return i->count;
1128 }
1129 EXPORT_SYMBOL(iov_iter_single_seg_count);
1130
1131 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1132                         const struct kvec *kvec, unsigned long nr_segs,
1133                         size_t count)
1134 {
1135         WARN_ON(direction & ~(READ | WRITE));
1136         *i = (struct iov_iter){
1137                 .iter_type = ITER_KVEC,
1138                 .data_source = direction,
1139                 .kvec = kvec,
1140                 .nr_segs = nr_segs,
1141                 .iov_offset = 0,
1142                 .count = count
1143         };
1144 }
1145 EXPORT_SYMBOL(iov_iter_kvec);
1146
1147 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1148                         const struct bio_vec *bvec, unsigned long nr_segs,
1149                         size_t count)
1150 {
1151         WARN_ON(direction & ~(READ | WRITE));
1152         *i = (struct iov_iter){
1153                 .iter_type = ITER_BVEC,
1154                 .data_source = direction,
1155                 .bvec = bvec,
1156                 .nr_segs = nr_segs,
1157                 .iov_offset = 0,
1158                 .count = count
1159         };
1160 }
1161 EXPORT_SYMBOL(iov_iter_bvec);
1162
1163 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1164                         struct pipe_inode_info *pipe,
1165                         size_t count)
1166 {
1167         BUG_ON(direction != READ);
1168         WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1169         *i = (struct iov_iter){
1170                 .iter_type = ITER_PIPE,
1171                 .data_source = false,
1172                 .pipe = pipe,
1173                 .head = pipe->head,
1174                 .start_head = pipe->head,
1175                 .iov_offset = 0,
1176                 .count = count
1177         };
1178 }
1179 EXPORT_SYMBOL(iov_iter_pipe);
1180
1181 /**
1182  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1183  * @i: The iterator to initialise.
1184  * @direction: The direction of the transfer.
1185  * @xarray: The xarray to access.
1186  * @start: The start file position.
1187  * @count: The size of the I/O buffer in bytes.
1188  *
1189  * Set up an I/O iterator to either draw data out of the pages attached to an
1190  * inode or to inject data into those pages.  The pages *must* be prevented
1191  * from evaporation by the caller, either by taking a ref on them or by
1192  * locking them.
1193  */
1194 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1195                      struct xarray *xarray, loff_t start, size_t count)
1196 {
1197         BUG_ON(direction & ~1);
1198         *i = (struct iov_iter) {
1199                 .iter_type = ITER_XARRAY,
1200                 .data_source = direction,
1201                 .xarray = xarray,
1202                 .xarray_start = start,
1203                 .count = count,
1204                 .iov_offset = 0
1205         };
1206 }
1207 EXPORT_SYMBOL(iov_iter_xarray);
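/*
 * Illustrative sketch (hypothetical): a filesystem or network-fs read
 * helper could point an iterator at an inode's page cache and let the
 * transport copy data straight into those pages.  'mapping', 'pos' and
 * 'len' are stand-ins, and the pages are assumed to already be pinned or
 * locked as required by the comment above.
 */
static __maybe_unused void example_xarray_dest(struct address_space *mapping,
                                               loff_t pos, size_t len,
                                               struct iov_iter *iter)
{
        iov_iter_xarray(iter, READ, &mapping->i_pages, pos, len);
}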
1208
1209 /**
1210  * iov_iter_discard - Initialise an I/O iterator that discards data
1211  * @i: The iterator to initialise.
1212  * @direction: The direction of the transfer.
1213  * @count: The size of the I/O buffer in bytes.
1214  *
1215  * Set up an I/O iterator that just discards everything that's written to it.
1216  * It's only available as a READ iterator.
1217  */
1218 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1219 {
1220         BUG_ON(direction != READ);
1221         *i = (struct iov_iter){
1222                 .iter_type = ITER_DISCARD,
1223                 .data_source = false,
1224                 .count = count,
1225                 .iov_offset = 0
1226         };
1227 }
1228 EXPORT_SYMBOL(iov_iter_discard);
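/*
 * Illustrative sketch (hypothetical): a caller that only needs to drain
 * and throw away 'len' bytes from some source (for instance to skip part
 * of a message) can hand the producer a discard iterator.
 */
static __maybe_unused void example_skip_bytes(size_t len)
{
        struct iov_iter iter;

        iov_iter_discard(&iter, READ, len);
        /* feed &iter to the read path; everything copied to it is dropped */
}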
1229
1230 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1231 {
1232         unsigned long res = 0;
1233         size_t size = i->count;
1234         size_t skip = i->iov_offset;
1235         unsigned k;
1236
1237         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1238                 size_t len = i->iov[k].iov_len - skip;
1239                 if (len) {
1240                         res |= (unsigned long)i->iov[k].iov_base + skip;
1241                         if (len > size)
1242                                 len = size;
1243                         res |= len;
1244                         size -= len;
1245                         if (!size)
1246                                 break;
1247                 }
1248         }
1249         return res;
1250 }
1251
1252 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1253 {
1254         unsigned res = 0;
1255         size_t size = i->count;
1256         unsigned skip = i->iov_offset;
1257         unsigned k;
1258
1259         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1260                 size_t len = i->bvec[k].bv_len - skip;
1261                 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1262                 if (len > size)
1263                         len = size;
1264                 res |= len;
1265                 size -= len;
1266                 if (!size)
1267                         break;
1268         }
1269         return res;
1270 }
1271
1272 unsigned long iov_iter_alignment(const struct iov_iter *i)
1273 {
1274         /* iovec and kvec have identical layouts */
1275         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1276                 return iov_iter_alignment_iovec(i);
1277
1278         if (iov_iter_is_bvec(i))
1279                 return iov_iter_alignment_bvec(i);
1280
1281         if (iov_iter_is_pipe(i)) {
1282                 unsigned int p_mask = i->pipe->ring_size - 1;
1283                 size_t size = i->count;
1284
1285                 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1286                         return size | i->iov_offset;
1287                 return size;
1288         }
1289
1290         if (iov_iter_is_xarray(i))
1291                 return (i->xarray_start + i->iov_offset) | i->count;
1292
1293         return 0;
1294 }
1295 EXPORT_SYMBOL(iov_iter_alignment);
1296
1297 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1298 {
1299         unsigned long res = 0;
1300         unsigned long v = 0;
1301         size_t size = i->count;
1302         unsigned k;
1303
1304         if (WARN_ON(!iter_is_iovec(i)))
1305                 return ~0U;
1306
1307         for (k = 0; k < i->nr_segs; k++) {
1308                 if (i->iov[k].iov_len) {
1309                         unsigned long base = (unsigned long)i->iov[k].iov_base;
1310                         if (v) // if not the first one
1311                                 res |= base | v; // this start | previous end
1312                         v = base + i->iov[k].iov_len;
1313                         if (size <= i->iov[k].iov_len)
1314                                 break;
1315                         size -= i->iov[k].iov_len;
1316                 }
1317         }
1318         return res;
1319 }
1320 EXPORT_SYMBOL(iov_iter_gap_alignment);
1321
1322 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1323                                 size_t maxsize,
1324                                 struct page **pages,
1325                                 int iter_head,
1326                                 size_t *start)
1327 {
1328         struct pipe_inode_info *pipe = i->pipe;
1329         unsigned int p_mask = pipe->ring_size - 1;
1330         ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1331         if (!n)
1332                 return -EFAULT;
1333
1334         maxsize = n;
1335         n += *start;
1336         while (n > 0) {
1337                 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1338                 iter_head++;
1339                 n -= PAGE_SIZE;
1340         }
1341
1342         return maxsize;
1343 }
1344
1345 static ssize_t pipe_get_pages(struct iov_iter *i,
1346                    struct page **pages, size_t maxsize, unsigned maxpages,
1347                    size_t *start)
1348 {
1349         unsigned int iter_head, npages;
1350         size_t capacity;
1351
1352         if (!sanity(i))
1353                 return -EFAULT;
1354
1355         data_start(i, &iter_head, start);
1356         /* Amount of free space: some of this one + all after this one */
1357         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1358         capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1359
1360         return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1361 }
1362
1363 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1364                                           pgoff_t index, unsigned int nr_pages)
1365 {
1366         XA_STATE(xas, xa, index);
1367         struct page *page;
1368         unsigned int ret = 0;
1369
1370         rcu_read_lock();
1371         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1372                 if (xas_retry(&xas, page))
1373                         continue;
1374
1375                 /* Has the page moved or been split? */
1376                 if (unlikely(page != xas_reload(&xas))) {
1377                         xas_reset(&xas);
1378                         continue;
1379                 }
1380
1381                 pages[ret] = find_subpage(page, xas.xa_index);
1382                 get_page(pages[ret]);
1383                 if (++ret == nr_pages)
1384                         break;
1385         }
1386         rcu_read_unlock();
1387         return ret;
1388 }
1389
1390 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1391                                      struct page **pages, size_t maxsize,
1392                                      unsigned maxpages, size_t *_start_offset)
1393 {
1394         unsigned nr, offset;
1395         pgoff_t index, count;
1396         size_t size = maxsize, actual;
1397         loff_t pos;
1398
1399         if (!size || !maxpages)
1400                 return 0;
1401
1402         pos = i->xarray_start + i->iov_offset;
1403         index = pos >> PAGE_SHIFT;
1404         offset = pos & ~PAGE_MASK;
1405         *_start_offset = offset;
1406
1407         count = 1;
1408         if (size > PAGE_SIZE - offset) {
1409                 size -= PAGE_SIZE - offset;
1410                 count += size >> PAGE_SHIFT;
1411                 size &= ~PAGE_MASK;
1412                 if (size)
1413                         count++;
1414         }
1415
1416         if (count > maxpages)
1417                 count = maxpages;
1418
1419         nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1420         if (nr == 0)
1421                 return 0;
1422
1423         actual = PAGE_SIZE * nr;
1424         actual -= offset;
1425         if (nr == count && size > 0) {
1426                 unsigned last_offset = (nr > 1) ? 0 : offset;
1427                 actual -= PAGE_SIZE - (last_offset + size);
1428         }
1429         return actual;
1430 }
1431
1432 /* must be done on non-empty ITER_IOVEC one */
1433 static unsigned long first_iovec_segment(const struct iov_iter *i,
1434                                          size_t *size, size_t *start,
1435                                          size_t maxsize, unsigned maxpages)
1436 {
1437         size_t skip;
1438         long k;
1439
1440         for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1441                 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1442                 size_t len = i->iov[k].iov_len - skip;
1443
1444                 if (unlikely(!len))
1445                         continue;
1446                 if (len > maxsize)
1447                         len = maxsize;
1448                 len += (*start = addr % PAGE_SIZE);
1449                 if (len > maxpages * PAGE_SIZE)
1450                         len = maxpages * PAGE_SIZE;
1451                 *size = len;
1452                 return addr & PAGE_MASK;
1453         }
1454         BUG(); /* if it had been empty, we wouldn't get called */
1455 }
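
/*
 * Example of the arithmetic above (a hedged illustration, assuming
 * PAGE_SIZE == 4096): for a first non-empty segment at user address
 * 0x10000234 with 0x1000 bytes left, first_iovec_segment() stores
 * *start = 0x234, *size = 0x1234 (the length plus the sub-page offset)
 * and returns 0x10000000, so the caller pins DIV_ROUND_UP(0x1234, 4096),
 * i.e. two pages.
 */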
1456
1457 /* must only be called on a non-empty ITER_BVEC iterator */
1458 static struct page *first_bvec_segment(const struct iov_iter *i,
1459                                        size_t *size, size_t *start,
1460                                        size_t maxsize, unsigned maxpages)
1461 {
1462         struct page *page;
1463         size_t skip = i->iov_offset, len;
1464
1465         len = i->bvec->bv_len - skip;
1466         if (len > maxsize)
1467                 len = maxsize;
1468         skip += i->bvec->bv_offset;
1469         page = i->bvec->bv_page + skip / PAGE_SIZE;
1470         len += (*start = skip % PAGE_SIZE);
1471         if (len > maxpages * PAGE_SIZE)
1472                 len = maxpages * PAGE_SIZE;
1473         *size = len;
1474         return page;
1475 }
1476
1477 ssize_t iov_iter_get_pages(struct iov_iter *i,
1478                    struct page **pages, size_t maxsize, unsigned maxpages,
1479                    size_t *start)
1480 {
1481         size_t len;
1482         int n, res;
1483
1484         if (maxsize > i->count)
1485                 maxsize = i->count;
1486         if (!maxsize)
1487                 return 0;
1488
1489         if (likely(iter_is_iovec(i))) {
1490                 unsigned long addr;
1491
1492                 addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1493                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1494                 res = get_user_pages_fast(addr, n,
1495                                 iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1496                                 pages);
1497                 if (unlikely(res < 0))
1498                         return res;
1499                 return (res == n ? len : res * PAGE_SIZE) - *start;
1500         }
1501         if (iov_iter_is_bvec(i)) {
1502                 struct page *page;
1503
1504                 page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1505                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1506                 while (n--)
1507                         get_page(*pages++ = page++);
1508                 return len - *start;
1509         }
1510         if (iov_iter_is_pipe(i))
1511                 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1512         if (iov_iter_is_xarray(i))
1513                 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1514         return -EFAULT;
1515 }
1516 EXPORT_SYMBOL(iov_iter_get_pages);
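
/*
 * Example usage (a minimal sketch; foo_pin_first_chunk() and its callers are
 * hypothetical and not part of this file, and a user-backed ITER_IOVEC
 * iterator is assumed): pin the start of an iterator into a fixed on-stack
 * page array, as a direct-I/O style caller would.  The helper takes a
 * reference on each page but does not advance the iterator, so the caller
 * does both the put_page() and the iov_iter_advance().
 */
static ssize_t foo_pin_first_chunk(struct iov_iter *iter)
{
        struct page *pages[16];
        size_t offset;
        ssize_t bytes;
        int n;

        bytes = iov_iter_get_pages(iter, pages, SIZE_MAX, ARRAY_SIZE(pages),
                                   &offset);
        if (bytes <= 0)
                return bytes;

        n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);
        /* ... hand pages[0..n-1] to the device; data starts at @offset ... */
        while (n--)
                put_page(pages[n]);
        iov_iter_advance(iter, bytes);
        return bytes;
}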
1517
1518 static struct page **get_pages_array(size_t n)
1519 {
1520         return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1521 }
1522
1523 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1524                    struct page ***pages, size_t maxsize,
1525                    size_t *start)
1526 {
1527         struct page **p;
1528         unsigned int iter_head, npages;
1529         ssize_t n;
1530
1531         if (!sanity(i))
1532                 return -EFAULT;
1533
1534         data_start(i, &iter_head, start);
1535         /* Amount of free space: some of this one + all after this one */
1536         npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1537         n = npages * PAGE_SIZE - *start;
1538         if (maxsize > n)
1539                 maxsize = n;
1540         else
1541                 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1542         p = get_pages_array(npages);
1543         if (!p)
1544                 return -ENOMEM;
1545         n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1546         if (n > 0)
1547                 *pages = p;
1548         else
1549                 kvfree(p);
1550         return n;
1551 }
1552
1553 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1554                                            struct page ***pages, size_t maxsize,
1555                                            size_t *_start_offset)
1556 {
1557         struct page **p;
1558         unsigned nr, offset;
1559         pgoff_t index, count;
1560         size_t size = maxsize, actual;
1561         loff_t pos;
1562
1563         if (!size)
1564                 return 0;
1565
1566         pos = i->xarray_start + i->iov_offset;
1567         index = pos >> PAGE_SHIFT;
1568         offset = pos & ~PAGE_MASK;
1569         *_start_offset = offset;
1570
1571         count = 1;
1572         if (size > PAGE_SIZE - offset) {
1573                 size -= PAGE_SIZE - offset;
1574                 count += size >> PAGE_SHIFT;
1575                 size &= ~PAGE_MASK;
1576                 if (size)
1577                         count++;
1578         }
1579
1580         p = get_pages_array(count);
1581         if (!p)
1582                 return -ENOMEM;
1583         *pages = p;
1584
1585         nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1586         if (nr == 0)
1587                 return 0;
1588
1589         actual = PAGE_SIZE * nr;
1590         actual -= offset;
1591         if (nr == count && size > 0) {
1592                 unsigned last_offset = (nr > 1) ? 0 : offset;
1593                 actual -= PAGE_SIZE - (last_offset + size);
1594         }
1595         return actual;
1596 }
1597
1598 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1599                    struct page ***pages, size_t maxsize,
1600                    size_t *start)
1601 {
1602         struct page **p;
1603         size_t len;
1604         int n, res;
1605
1606         if (maxsize > i->count)
1607                 maxsize = i->count;
1608         if (!maxsize)
1609                 return 0;
1610
1611         if (likely(iter_is_iovec(i))) {
1612                 unsigned long addr;
1613
1614                 addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1615                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1616                 p = get_pages_array(n);
1617                 if (!p)
1618                         return -ENOMEM;
1619                 res = get_user_pages_fast(addr, n,
1620                                 iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1621                 if (unlikely(res < 0)) {
1622                         kvfree(p);
1623                         return res;
1624                 }
1625                 *pages = p;
1626                 return (res == n ? len : res * PAGE_SIZE) - *start;
1627         }
1628         if (iov_iter_is_bvec(i)) {
1629                 struct page *page;
1630
1631                 page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1632                 n = DIV_ROUND_UP(len, PAGE_SIZE);
1633                 *pages = p = get_pages_array(n);
1634                 if (!p)
1635                         return -ENOMEM;
1636                 while (n--)
1637                         get_page(*p++ = page++);
1638                 return len - *start;
1639         }
1640         if (iov_iter_is_pipe(i))
1641                 return pipe_get_pages_alloc(i, pages, maxsize, start);
1642         if (iov_iter_is_xarray(i))
1643                 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1644         return -EFAULT;
1645 }
1646 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
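
/*
 * Example usage (a minimal sketch; foo_pin_chunk_alloc() is hypothetical and
 * not part of this file): pin the next contiguous chunk of an iterator,
 * letting the helper size and allocate the page array.  On success the
 * caller owns that array: each entry is put_page()d and the array itself
 * released with kvfree(), since it came from kvmalloc_array().
 */
static ssize_t foo_pin_chunk_alloc(struct iov_iter *iter)
{
        struct page **pages;
        size_t offset;
        ssize_t bytes;
        int n;

        bytes = iov_iter_get_pages_alloc(iter, &pages, SIZE_MAX, &offset);
        if (bytes <= 0)
                return bytes;

        n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);
        /* ... use pages[0..n-1] ... */
        while (n--)
                put_page(pages[n]);
        kvfree(pages);
        return bytes;
}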
1647
1648 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1649                                struct iov_iter *i)
1650 {
1651         __wsum sum, next;
1652         sum = *csum;
1653         if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1654                 WARN_ON(1);
1655                 return 0;
1656         }
1657         iterate_and_advance(i, bytes, v, off, ({
1658                 next = csum_and_copy_from_user(v.iov_base,
1659                                                addr + off,
1660                                                v.iov_len);
1661                 if (next)
1662                         sum = csum_block_add(sum, next, off);
1663                 next ? 0 : v.iov_len;
1664         }), ({
1665                 sum = csum_and_memcpy(addr + off, v.iov_base, v.iov_len,
1666                                       sum, off);
1667         })
1668         )
1669         *csum = sum;
1670         return bytes;
1671 }
1672 EXPORT_SYMBOL(csum_and_copy_from_iter);
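
/*
 * Example usage (a minimal sketch; foo_copy_in_csummed() is hypothetical and
 * not part of this file): pull @len bytes out of a source iterator into a
 * kernel buffer while folding them into a running checksum, the way
 * networking getfrag-style callers use this helper.  A short return means a
 * fault; the caller then typically reverts the iterator and bails out.
 */
static bool foo_copy_in_csummed(void *kbuf, size_t len, struct iov_iter *from,
                                __wsum *csump)
{
        __wsum csum = *csump;

        if (csum_and_copy_from_iter(kbuf, len, &csum, from) != len)
                return false;
        *csump = csum;
        return true;
}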
1673
1674 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1675                              struct iov_iter *i)
1676 {
1677         struct csum_state *csstate = _csstate;
1678         __wsum sum, next;
1679
1680         if (unlikely(iov_iter_is_pipe(i)))
1681                 return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1682
1683         sum = csum_shift(csstate->csum, csstate->off);
1684         if (unlikely(iov_iter_is_discard(i))) {
1685                 WARN_ON(1);     /* for now */
1686                 return 0;
1687         }
1688         iterate_and_advance(i, bytes, v, off, ({
1689                 next = csum_and_copy_to_user(addr + off,
1690                                              v.iov_base,
1691                                              v.iov_len);
1692                 if (next)
1693                         sum = csum_block_add(sum, next, off);
1694                 next ? 0 : v.iov_len;
1695         }), ({
1696                 sum = csum_and_memcpy(v.iov_base,
1697                                      addr + off,
1698                                      v.iov_len, sum, off);
1699         })
1700         )
1701         csstate->csum = csum_shift(sum, csstate->off);
1702         csstate->off += bytes;
1703         return bytes;
1704 }
1705 EXPORT_SYMBOL(csum_and_copy_to_iter);
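
/*
 * Example usage (a minimal sketch; foo_send_two_chunks() is hypothetical and
 * not part of this file): copy two kernel buffers out to an iterator while
 * accumulating one checksum across both calls.  struct csum_state carries
 * the running csum and the byte offset reached so far, so odd-length chunks
 * chain correctly; real code would also check the returned byte counts.
 */
static __wsum foo_send_two_chunks(const void *a, size_t alen,
                                  const void *b, size_t blen,
                                  struct iov_iter *to)
{
        struct csum_state csstate = { .csum = 0, .off = 0 };

        csum_and_copy_to_iter(a, alen, &csstate, to);
        csum_and_copy_to_iter(b, blen, &csstate, to);
        return csstate.csum;
}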
1706
1707 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1708                 struct iov_iter *i)
1709 {
1710 #ifdef CONFIG_CRYPTO_HASH
1711         struct ahash_request *hash = hashp;
1712         struct scatterlist sg;
1713         size_t copied;
1714
1715         copied = copy_to_iter(addr, bytes, i);
1716         sg_init_one(&sg, addr, copied);
1717         ahash_request_set_crypt(hash, &sg, NULL, copied);
1718         crypto_ahash_update(hash);
1719         return copied;
1720 #else
1721         return 0;
1722 #endif
1723 }
1724 EXPORT_SYMBOL(hash_and_copy_to_iter);
1725
1726 static int iov_npages(const struct iov_iter *i, int maxpages)
1727 {
1728         size_t skip = i->iov_offset, size = i->count;
1729         const struct iovec *p;
1730         int npages = 0;
1731
1732         for (p = i->iov; size; skip = 0, p++) {
1733                 unsigned offs = offset_in_page(p->iov_base + skip);
1734                 size_t len = min(p->iov_len - skip, size);
1735
1736                 if (len) {
1737                         size -= len;
1738                         npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1739                         if (unlikely(npages > maxpages))
1740                                 return maxpages;
1741                 }
1742         }
1743         return npages;
1744 }
1745
1746 static int bvec_npages(const struct iov_iter *i, int maxpages)
1747 {
1748         size_t skip = i->iov_offset, size = i->count;
1749         const struct bio_vec *p;
1750         int npages = 0;
1751
1752         for (p = i->bvec; size; skip = 0, p++) {
1753                 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1754                 size_t len = min(p->bv_len - skip, size);
1755
1756                 size -= len;
1757                 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1758                 if (unlikely(npages > maxpages))
1759                         return maxpages;
1760         }
1761         return npages;
1762 }
1763
1764 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1765 {
1766         if (unlikely(!i->count))
1767                 return 0;
1768         /* iovec and kvec have identical layouts */
1769         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1770                 return iov_npages(i, maxpages);
1771         if (iov_iter_is_bvec(i))
1772                 return bvec_npages(i, maxpages);
1773         if (iov_iter_is_pipe(i)) {
1774                 unsigned int iter_head;
1775                 int npages;
1776                 size_t off;
1777
1778                 if (!sanity(i))
1779                         return 0;
1780
1781                 data_start(i, &iter_head, &off);
1782                 /* some of this one + all after this one */
1783                 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1784                 return min(npages, maxpages);
1785         }
1786         if (iov_iter_is_xarray(i)) {
1787                 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1788                 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1789                 return min(npages, maxpages);
1790         }
1791         return 0;
1792 }
1793 EXPORT_SYMBOL(iov_iter_npages);
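
/*
 * Example usage (a minimal sketch; foo_map_iter() is hypothetical and not
 * part of this file): use iov_iter_npages() to size a page-pointer array
 * that is guaranteed to be large enough for whatever iov_iter_get_pages()
 * returns with the same cap.
 */
static ssize_t foo_map_iter(struct iov_iter *iter, struct page ***pagesp,
                            size_t *offset)
{
        int npages = iov_iter_npages(iter, 512);        /* arbitrary cap */
        struct page **pages;
        ssize_t bytes;

        if (!npages)
                return 0;
        pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;
        bytes = iov_iter_get_pages(iter, pages, SIZE_MAX, npages, offset);
        if (bytes <= 0) {
                kvfree(pages);
                return bytes;
        }
        *pagesp = pages;
        /* caller put_page()s each entry and kvfree()s the array when done */
        return bytes;
}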
1794
1795 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1796 {
1797         *new = *old;
1798         if (unlikely(iov_iter_is_pipe(new))) {
1799                 WARN_ON(1);
1800                 return NULL;
1801         }
1802         if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1803                 return NULL;
1804         if (iov_iter_is_bvec(new))
1805                 return new->bvec = kmemdup(new->bvec,
1806                                     new->nr_segs * sizeof(struct bio_vec),
1807                                     flags);
1808         else
1809                 /* iovec and kvec have identical layout */
1810                 return new->iov = kmemdup(new->iov,
1811                                    new->nr_segs * sizeof(struct iovec),
1812                                    flags);
1813 }
1814 EXPORT_SYMBOL(dup_iter);
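
/*
 * Example usage (a minimal sketch; foo_save_iter() is hypothetical and not
 * part of this file, and it assumes an iovec/kvec/bvec-backed iterator):
 * take a private copy of an iterator so it can still be walked after the
 * submitter's segment array has gone away, e.g. for async completion.  The
 * returned pointer is the duplicated segment array and must be kfree()d
 * once the saved iterator is no longer needed.
 */
static int foo_save_iter(struct iov_iter *saved, struct iov_iter *src,
                         const void **to_free)
{
        *to_free = dup_iter(saved, src, GFP_KERNEL);
        return *to_free ? 0 : -ENOMEM;
}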
1815
1816 static int copy_compat_iovec_from_user(struct iovec *iov,
1817                 const struct iovec __user *uvec, unsigned long nr_segs)
1818 {
1819         const struct compat_iovec __user *uiov =
1820                 (const struct compat_iovec __user *)uvec;
1821         int ret = -EFAULT, i;
1822
1823         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1824                 return -EFAULT;
1825
1826         for (i = 0; i < nr_segs; i++) {
1827                 compat_uptr_t buf;
1828                 compat_ssize_t len;
1829
1830                 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1831                 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1832
1833                 /* check for compat_size_t not fitting in compat_ssize_t */
1834                 if (len < 0) {
1835                         ret = -EINVAL;
1836                         goto uaccess_end;
1837                 }
1838                 iov[i].iov_base = compat_ptr(buf);
1839                 iov[i].iov_len = len;
1840         }
1841
1842         ret = 0;
1843 uaccess_end:
1844         user_access_end();
1845         return ret;
1846 }
1847
1848 static int copy_iovec_from_user(struct iovec *iov,
1849                 const struct iovec __user *uvec, unsigned long nr_segs)
1850 {
1851         unsigned long seg;
1852
1853         if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1854                 return -EFAULT;
1855         for (seg = 0; seg < nr_segs; seg++) {
1856                 if ((ssize_t)iov[seg].iov_len < 0)
1857                         return -EINVAL;
1858         }
1859
1860         return 0;
1861 }
1862
1863 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1864                 unsigned long nr_segs, unsigned long fast_segs,
1865                 struct iovec *fast_iov, bool compat)
1866 {
1867         struct iovec *iov = fast_iov;
1868         int ret;
1869
1870         /*
1871          * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1872          * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
1873          * traditionally returned zero for zero segments, so...
1874          */
1875         if (nr_segs == 0)
1876                 return iov;
1877         if (nr_segs > UIO_MAXIOV)
1878                 return ERR_PTR(-EINVAL);
1879         if (nr_segs > fast_segs) {
1880                 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1881                 if (!iov)
1882                         return ERR_PTR(-ENOMEM);
1883         }
1884
1885         if (compat)
1886                 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1887         else
1888                 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1889         if (ret) {
1890                 if (iov != fast_iov)
1891                         kfree(iov);
1892                 return ERR_PTR(ret);
1893         }
1894
1895         return iov;
1896 }
1897
1898 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1899                  unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1900                  struct iov_iter *i, bool compat)
1901 {
1902         ssize_t total_len = 0;
1903         unsigned long seg;
1904         struct iovec *iov;
1905
1906         iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1907         if (IS_ERR(iov)) {
1908                 *iovp = NULL;
1909                 return PTR_ERR(iov);
1910         }
1911
1912         /*
1913          * According to the Single Unix Specification we should return EINVAL if
1914          * an element length is < 0 when cast to ssize_t or if the total length
1915          * would overflow the ssize_t return value of the system call.
1916          *
1917          * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1918          * overflow case.
1919          */
1920         for (seg = 0; seg < nr_segs; seg++) {
1921                 ssize_t len = (ssize_t)iov[seg].iov_len;
1922
1923                 if (!access_ok(iov[seg].iov_base, len)) {
1924                         if (iov != *iovp)
1925                                 kfree(iov);
1926                         *iovp = NULL;
1927                         return -EFAULT;
1928                 }
1929
1930                 if (len > MAX_RW_COUNT - total_len) {
1931                         len = MAX_RW_COUNT - total_len;
1932                         iov[seg].iov_len = len;
1933                 }
1934                 total_len += len;
1935         }
1936
1937         iov_iter_init(i, type, iov, nr_segs, total_len);
1938         if (iov == *iovp)
1939                 *iovp = NULL;
1940         else
1941                 *iovp = iov;
1942         return total_len;
1943 }
1944
1945 /**
1946  * import_iovec() - Copy an array of &struct iovec from userspace
1947  *     into the kernel, check that it is valid, and initialize a new
1948  *     &struct iov_iter iterator to access it.
1949  *
1950  * @type: One of %READ or %WRITE.
1951  * @uvec: Pointer to the userspace array.
1952  * @nr_segs: Number of elements in userspace array.
1953  * @fast_segs: Number of elements in *@iovp.
1954  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1955  *     on-stack) kernel array.
1956  * @i: Pointer to iterator that will be initialized on success.
1957  *
1958  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1959  * then this function places %NULL in *@iovp on return. Otherwise, a new
1960  * array will be allocated and the result placed in *@iovp. This means that
1961  * the caller may call kfree() on *@iovp regardless of whether the small
1962  * on-stack array was used or not (and regardless of whether this function
1963  * returns an error or not).
1964  *
1965  * Return: Negative error code on error, bytes imported on success
1966  */
1967 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1968                  unsigned nr_segs, unsigned fast_segs,
1969                  struct iovec **iovp, struct iov_iter *i)
1970 {
1971         return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1972                               in_compat_syscall());
1973 }
1974 EXPORT_SYMBOL(import_iovec);
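
/*
 * Example usage (a minimal sketch; foo_readv() is hypothetical and not part
 * of this file, and the actual I/O step is elided): the classic readv()-style
 * pattern.  A small on-stack array is offered via @fast_segs/@iovp; whether
 * or not it ends up being used, kfree() of the returned pointer is safe.
 */
static ssize_t foo_readv(struct file *file, const struct iovec __user *uvec,
                         unsigned long nr_segs, loff_t *pos)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                return ret;

        /* ... do the actual I/O on &iter, e.g. via ->read_iter() ... */

        kfree(iov);     /* safe whether or not iovstack was used */
        return ret;
}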
1975
1976 int import_single_range(int rw, void __user *buf, size_t len,
1977                  struct iovec *iov, struct iov_iter *i)
1978 {
1979         if (len > MAX_RW_COUNT)
1980                 len = MAX_RW_COUNT;
1981         if (unlikely(!access_ok(buf, len)))
1982                 return -EFAULT;
1983
1984         iov->iov_base = buf;
1985         iov->iov_len = len;
1986         iov_iter_init(i, rw, iov, 1, len);
1987         return 0;
1988 }
1989 EXPORT_SYMBOL(import_single_range);
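
/*
 * Example usage (a minimal sketch; foo_read() is hypothetical and not part
 * of this file, and the actual I/O step is elided): the single-buffer
 * counterpart of the pattern above.  The struct iovec lives on the caller's
 * stack for as long as the iterator is in use, and nothing needs to be
 * freed afterwards.
 */
static ssize_t foo_read(struct file *file, char __user *buf, size_t len,
                        loff_t *pos)
{
        struct iovec iov;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_single_range(READ, buf, len, &iov, &iter);
        if (unlikely(ret))
                return ret;
        /* ... do the actual I/O on &iter, e.g. via ->read_iter() ... */
        return iov_iter_count(&iter);   /* len clamped to MAX_RW_COUNT */
}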