fs: don't allow splice read/write without explicit ops
[linux-2.6-microblaze.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/security.h>
34 #include <linux/gfp.h>
35 #include <linux/socket.h>
36 #include <linux/compat.h>
37 #include <linux/sched/signal.h>
38
39 #include "internal.h"
40
41 /*
42  * Attempt to steal a page from a pipe buffer. This should perhaps go into
43  * a vm helper function, it's already simplified quite a bit by the
44  * addition of remove_mapping(). If success is returned, the caller may
45  * attempt to reuse this page for another destination.
46  */
47 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
48                 struct pipe_buffer *buf)
49 {
50         struct page *page = buf->page;
51         struct address_space *mapping;
52
53         lock_page(page);
54
55         mapping = page_mapping(page);
56         if (mapping) {
57                 WARN_ON(!PageUptodate(page));
58
59                 /*
60                  * At least for ext2 with nobh option, we need to wait on
61                  * writeback completing on this page, since we'll remove it
62                  * from the pagecache.  Otherwise truncate wont wait on the
63                  * page, allowing the disk blocks to be reused by someone else
64                  * before we actually wrote our data to them. fs corruption
65                  * ensues.
66                  */
67                 wait_on_page_writeback(page);
68
69                 if (page_has_private(page) &&
70                     !try_to_release_page(page, GFP_KERNEL))
71                         goto out_unlock;
72
73                 /*
74                  * If we succeeded in removing the mapping, set LRU flag
75                  * and return good.
76                  */
77                 if (remove_mapping(mapping, page)) {
78                         buf->flags |= PIPE_BUF_FLAG_LRU;
79                         return true;
80                 }
81         }
82
83         /*
84          * Raced with truncate or failed to remove page from current
85          * address space, unlock and return failure.
86          */
87 out_unlock:
88         unlock_page(page);
89         return false;
90 }
91
92 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
93                                         struct pipe_buffer *buf)
94 {
95         put_page(buf->page);
96         buf->flags &= ~PIPE_BUF_FLAG_LRU;
97 }
98
99 /*
100  * Check whether the contents of buf is OK to access. Since the content
101  * is a page cache page, IO may be in flight.
102  */
103 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
104                                        struct pipe_buffer *buf)
105 {
106         struct page *page = buf->page;
107         int err;
108
109         if (!PageUptodate(page)) {
110                 lock_page(page);
111
112                 /*
113                  * Page got truncated/unhashed. This will cause a 0-byte
114                  * splice, if this is the first page.
115                  */
116                 if (!page->mapping) {
117                         err = -ENODATA;
118                         goto error;
119                 }
120
121                 /*
122                  * Uh oh, read-error from disk.
123                  */
124                 if (!PageUptodate(page)) {
125                         err = -EIO;
126                         goto error;
127                 }
128
129                 /*
130                  * Page is ok afterall, we are done.
131                  */
132                 unlock_page(page);
133         }
134
135         return 0;
136 error:
137         unlock_page(page);
138         return err;
139 }
140
141 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
142         .confirm        = page_cache_pipe_buf_confirm,
143         .release        = page_cache_pipe_buf_release,
144         .try_steal      = page_cache_pipe_buf_try_steal,
145         .get            = generic_pipe_buf_get,
146 };
147
148 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
149                 struct pipe_buffer *buf)
150 {
151         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
152                 return false;
153
154         buf->flags |= PIPE_BUF_FLAG_LRU;
155         return generic_pipe_buf_try_steal(pipe, buf);
156 }
157
158 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
159         .release        = page_cache_pipe_buf_release,
160         .try_steal      = user_page_pipe_buf_try_steal,
161         .get            = generic_pipe_buf_get,
162 };
163
164 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
165 {
166         smp_mb();
167         if (waitqueue_active(&pipe->rd_wait))
168                 wake_up_interruptible(&pipe->rd_wait);
169         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
170 }
171
172 /**
173  * splice_to_pipe - fill passed data into a pipe
174  * @pipe:       pipe to fill
175  * @spd:        data to fill
176  *
177  * Description:
178  *    @spd contains a map of pages and len/offset tuples, along with
179  *    the struct pipe_buf_operations associated with these pages. This
180  *    function will link that data to the pipe.
181  *
182  */
183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
184                        struct splice_pipe_desc *spd)
185 {
186         unsigned int spd_pages = spd->nr_pages;
187         unsigned int tail = pipe->tail;
188         unsigned int head = pipe->head;
189         unsigned int mask = pipe->ring_size - 1;
190         int ret = 0, page_nr = 0;
191
192         if (!spd_pages)
193                 return 0;
194
195         if (unlikely(!pipe->readers)) {
196                 send_sig(SIGPIPE, current, 0);
197                 ret = -EPIPE;
198                 goto out;
199         }
200
201         while (!pipe_full(head, tail, pipe->max_usage)) {
202                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
203
204                 buf->page = spd->pages[page_nr];
205                 buf->offset = spd->partial[page_nr].offset;
206                 buf->len = spd->partial[page_nr].len;
207                 buf->private = spd->partial[page_nr].private;
208                 buf->ops = spd->ops;
209                 buf->flags = 0;
210
211                 head++;
212                 pipe->head = head;
213                 page_nr++;
214                 ret += buf->len;
215
216                 if (!--spd->nr_pages)
217                         break;
218         }
219
220         if (!ret)
221                 ret = -EAGAIN;
222
223 out:
224         while (page_nr < spd_pages)
225                 spd->spd_release(spd, page_nr++);
226
227         return ret;
228 }
229 EXPORT_SYMBOL_GPL(splice_to_pipe);
230
231 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
232 {
233         unsigned int head = pipe->head;
234         unsigned int tail = pipe->tail;
235         unsigned int mask = pipe->ring_size - 1;
236         int ret;
237
238         if (unlikely(!pipe->readers)) {
239                 send_sig(SIGPIPE, current, 0);
240                 ret = -EPIPE;
241         } else if (pipe_full(head, tail, pipe->max_usage)) {
242                 ret = -EAGAIN;
243         } else {
244                 pipe->bufs[head & mask] = *buf;
245                 pipe->head = head + 1;
246                 return buf->len;
247         }
248         pipe_buf_release(pipe, buf);
249         return ret;
250 }
251 EXPORT_SYMBOL(add_to_pipe);
252
253 /*
254  * Check if we need to grow the arrays holding pages and partial page
255  * descriptions.
256  */
257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
258 {
259         unsigned int max_usage = READ_ONCE(pipe->max_usage);
260
261         spd->nr_pages_max = max_usage;
262         if (max_usage <= PIPE_DEF_BUFFERS)
263                 return 0;
264
265         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
266         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
267                                      GFP_KERNEL);
268
269         if (spd->pages && spd->partial)
270                 return 0;
271
272         kfree(spd->pages);
273         kfree(spd->partial);
274         return -ENOMEM;
275 }
276
277 void splice_shrink_spd(struct splice_pipe_desc *spd)
278 {
279         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
280                 return;
281
282         kfree(spd->pages);
283         kfree(spd->partial);
284 }
285
286 /**
287  * generic_file_splice_read - splice data from file to a pipe
288  * @in:         file to splice from
289  * @ppos:       position in @in
290  * @pipe:       pipe to splice to
291  * @len:        number of bytes to splice
292  * @flags:      splice modifier flags
293  *
294  * Description:
295  *    Will read pages from given file and fill them into a pipe. Can be
296  *    used as long as it has more or less sane ->read_iter().
297  *
298  */
299 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
300                                  struct pipe_inode_info *pipe, size_t len,
301                                  unsigned int flags)
302 {
303         struct iov_iter to;
304         struct kiocb kiocb;
305         unsigned int i_head;
306         int ret;
307
308         iov_iter_pipe(&to, READ, pipe, len);
309         i_head = to.head;
310         init_sync_kiocb(&kiocb, in);
311         kiocb.ki_pos = *ppos;
312         ret = call_read_iter(in, &kiocb, &to);
313         if (ret > 0) {
314                 *ppos = kiocb.ki_pos;
315                 file_accessed(in);
316         } else if (ret < 0) {
317                 to.head = i_head;
318                 to.iov_offset = 0;
319                 iov_iter_advance(&to, 0); /* to free what was emitted */
320                 /*
321                  * callers of ->splice_read() expect -EAGAIN on
322                  * "can't put anything in there", rather than -EFAULT.
323                  */
324                 if (ret == -EFAULT)
325                         ret = -EAGAIN;
326         }
327
328         return ret;
329 }
330 EXPORT_SYMBOL(generic_file_splice_read);
331
332 const struct pipe_buf_operations default_pipe_buf_ops = {
333         .release        = generic_pipe_buf_release,
334         .try_steal      = generic_pipe_buf_try_steal,
335         .get            = generic_pipe_buf_get,
336 };
337
338 /* Pipe buffer operations for a socket and similar. */
339 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
340         .release        = generic_pipe_buf_release,
341         .get            = generic_pipe_buf_get,
342 };
343 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
344
345 /*
346  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
347  * using sendpage(). Return the number of bytes sent.
348  */
349 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
350                             struct pipe_buffer *buf, struct splice_desc *sd)
351 {
352         struct file *file = sd->u.file;
353         loff_t pos = sd->pos;
354         int more;
355
356         if (!likely(file->f_op->sendpage))
357                 return -EINVAL;
358
359         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
360
361         if (sd->len < sd->total_len &&
362             pipe_occupancy(pipe->head, pipe->tail) > 1)
363                 more |= MSG_SENDPAGE_NOTLAST;
364
365         return file->f_op->sendpage(file, buf->page, buf->offset,
366                                     sd->len, &pos, more);
367 }
368
369 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
370 {
371         smp_mb();
372         if (waitqueue_active(&pipe->wr_wait))
373                 wake_up_interruptible(&pipe->wr_wait);
374         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
375 }
376
377 /**
378  * splice_from_pipe_feed - feed available data from a pipe to a file
379  * @pipe:       pipe to splice from
380  * @sd:         information to @actor
381  * @actor:      handler that splices the data
382  *
383  * Description:
384  *    This function loops over the pipe and calls @actor to do the
385  *    actual moving of a single struct pipe_buffer to the desired
386  *    destination.  It returns when there's no more buffers left in
387  *    the pipe or if the requested number of bytes (@sd->total_len)
388  *    have been copied.  It returns a positive number (one) if the
389  *    pipe needs to be filled with more data, zero if the required
390  *    number of bytes have been copied and -errno on error.
391  *
392  *    This, together with splice_from_pipe_{begin,end,next}, may be
393  *    used to implement the functionality of __splice_from_pipe() when
394  *    locking is required around copying the pipe buffers to the
395  *    destination.
396  */
397 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
398                           splice_actor *actor)
399 {
400         unsigned int head = pipe->head;
401         unsigned int tail = pipe->tail;
402         unsigned int mask = pipe->ring_size - 1;
403         int ret;
404
405         while (!pipe_empty(head, tail)) {
406                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
407
408                 sd->len = buf->len;
409                 if (sd->len > sd->total_len)
410                         sd->len = sd->total_len;
411
412                 ret = pipe_buf_confirm(pipe, buf);
413                 if (unlikely(ret)) {
414                         if (ret == -ENODATA)
415                                 ret = 0;
416                         return ret;
417                 }
418
419                 ret = actor(pipe, buf, sd);
420                 if (ret <= 0)
421                         return ret;
422
423                 buf->offset += ret;
424                 buf->len -= ret;
425
426                 sd->num_spliced += ret;
427                 sd->len -= ret;
428                 sd->pos += ret;
429                 sd->total_len -= ret;
430
431                 if (!buf->len) {
432                         pipe_buf_release(pipe, buf);
433                         tail++;
434                         pipe->tail = tail;
435                         if (pipe->files)
436                                 sd->need_wakeup = true;
437                 }
438
439                 if (!sd->total_len)
440                         return 0;
441         }
442
443         return 1;
444 }
445
446 /**
447  * splice_from_pipe_next - wait for some data to splice from
448  * @pipe:       pipe to splice from
449  * @sd:         information about the splice operation
450  *
451  * Description:
452  *    This function will wait for some data and return a positive
453  *    value (one) if pipe buffers are available.  It will return zero
454  *    or -errno if no more data needs to be spliced.
455  */
456 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
457 {
458         /*
459          * Check for signal early to make process killable when there are
460          * always buffers available
461          */
462         if (signal_pending(current))
463                 return -ERESTARTSYS;
464
465         while (pipe_empty(pipe->head, pipe->tail)) {
466                 if (!pipe->writers)
467                         return 0;
468
469                 if (sd->num_spliced)
470                         return 0;
471
472                 if (sd->flags & SPLICE_F_NONBLOCK)
473                         return -EAGAIN;
474
475                 if (signal_pending(current))
476                         return -ERESTARTSYS;
477
478                 if (sd->need_wakeup) {
479                         wakeup_pipe_writers(pipe);
480                         sd->need_wakeup = false;
481                 }
482
483                 pipe_wait(pipe);
484         }
485
486         return 1;
487 }
488
489 /**
490  * splice_from_pipe_begin - start splicing from pipe
491  * @sd:         information about the splice operation
492  *
493  * Description:
494  *    This function should be called before a loop containing
495  *    splice_from_pipe_next() and splice_from_pipe_feed() to
496  *    initialize the necessary fields of @sd.
497  */
498 static void splice_from_pipe_begin(struct splice_desc *sd)
499 {
500         sd->num_spliced = 0;
501         sd->need_wakeup = false;
502 }
503
504 /**
505  * splice_from_pipe_end - finish splicing from pipe
506  * @pipe:       pipe to splice from
507  * @sd:         information about the splice operation
508  *
509  * Description:
510  *    This function will wake up pipe writers if necessary.  It should
511  *    be called after a loop containing splice_from_pipe_next() and
512  *    splice_from_pipe_feed().
513  */
514 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
515 {
516         if (sd->need_wakeup)
517                 wakeup_pipe_writers(pipe);
518 }
519
520 /**
521  * __splice_from_pipe - splice data from a pipe to given actor
522  * @pipe:       pipe to splice from
523  * @sd:         information to @actor
524  * @actor:      handler that splices the data
525  *
526  * Description:
527  *    This function does little more than loop over the pipe and call
528  *    @actor to do the actual moving of a single struct pipe_buffer to
529  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
530  *    pipe_to_user.
531  *
532  */
533 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
534                            splice_actor *actor)
535 {
536         int ret;
537
538         splice_from_pipe_begin(sd);
539         do {
540                 cond_resched();
541                 ret = splice_from_pipe_next(pipe, sd);
542                 if (ret > 0)
543                         ret = splice_from_pipe_feed(pipe, sd, actor);
544         } while (ret > 0);
545         splice_from_pipe_end(pipe, sd);
546
547         return sd->num_spliced ? sd->num_spliced : ret;
548 }
549 EXPORT_SYMBOL(__splice_from_pipe);
550
551 /**
552  * splice_from_pipe - splice data from a pipe to a file
553  * @pipe:       pipe to splice from
554  * @out:        file to splice to
555  * @ppos:       position in @out
556  * @len:        how many bytes to splice
557  * @flags:      splice modifier flags
558  * @actor:      handler that splices the data
559  *
560  * Description:
561  *    See __splice_from_pipe. This function locks the pipe inode,
562  *    otherwise it's identical to __splice_from_pipe().
563  *
564  */
565 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
566                          loff_t *ppos, size_t len, unsigned int flags,
567                          splice_actor *actor)
568 {
569         ssize_t ret;
570         struct splice_desc sd = {
571                 .total_len = len,
572                 .flags = flags,
573                 .pos = *ppos,
574                 .u.file = out,
575         };
576
577         pipe_lock(pipe);
578         ret = __splice_from_pipe(pipe, &sd, actor);
579         pipe_unlock(pipe);
580
581         return ret;
582 }
583
584 /**
585  * iter_file_splice_write - splice data from a pipe to a file
586  * @pipe:       pipe info
587  * @out:        file to write to
588  * @ppos:       position in @out
589  * @len:        number of bytes to splice
590  * @flags:      splice modifier flags
591  *
592  * Description:
593  *    Will either move or copy pages (determined by @flags options) from
594  *    the given pipe inode to the given file.
595  *    This one is ->write_iter-based.
596  *
597  */
598 ssize_t
599 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
600                           loff_t *ppos, size_t len, unsigned int flags)
601 {
602         struct splice_desc sd = {
603                 .total_len = len,
604                 .flags = flags,
605                 .pos = *ppos,
606                 .u.file = out,
607         };
608         int nbufs = pipe->max_usage;
609         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
610                                         GFP_KERNEL);
611         ssize_t ret;
612
613         if (unlikely(!array))
614                 return -ENOMEM;
615
616         pipe_lock(pipe);
617
618         splice_from_pipe_begin(&sd);
619         while (sd.total_len) {
620                 struct iov_iter from;
621                 unsigned int head, tail, mask;
622                 size_t left;
623                 int n;
624
625                 ret = splice_from_pipe_next(pipe, &sd);
626                 if (ret <= 0)
627                         break;
628
629                 if (unlikely(nbufs < pipe->max_usage)) {
630                         kfree(array);
631                         nbufs = pipe->max_usage;
632                         array = kcalloc(nbufs, sizeof(struct bio_vec),
633                                         GFP_KERNEL);
634                         if (!array) {
635                                 ret = -ENOMEM;
636                                 break;
637                         }
638                 }
639
640                 head = pipe->head;
641                 tail = pipe->tail;
642                 mask = pipe->ring_size - 1;
643
644                 /* build the vector */
645                 left = sd.total_len;
646                 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
647                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
648                         size_t this_len = buf->len;
649
650                         if (this_len > left)
651                                 this_len = left;
652
653                         ret = pipe_buf_confirm(pipe, buf);
654                         if (unlikely(ret)) {
655                                 if (ret == -ENODATA)
656                                         ret = 0;
657                                 goto done;
658                         }
659
660                         array[n].bv_page = buf->page;
661                         array[n].bv_len = this_len;
662                         array[n].bv_offset = buf->offset;
663                         left -= this_len;
664                 }
665
666                 iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
667                 ret = vfs_iter_write(out, &from, &sd.pos, 0);
668                 if (ret <= 0)
669                         break;
670
671                 sd.num_spliced += ret;
672                 sd.total_len -= ret;
673                 *ppos = sd.pos;
674
675                 /* dismiss the fully eaten buffers, adjust the partial one */
676                 tail = pipe->tail;
677                 while (ret) {
678                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
679                         if (ret >= buf->len) {
680                                 ret -= buf->len;
681                                 buf->len = 0;
682                                 pipe_buf_release(pipe, buf);
683                                 tail++;
684                                 pipe->tail = tail;
685                                 if (pipe->files)
686                                         sd.need_wakeup = true;
687                         } else {
688                                 buf->offset += ret;
689                                 buf->len -= ret;
690                                 ret = 0;
691                         }
692                 }
693         }
694 done:
695         kfree(array);
696         splice_from_pipe_end(pipe, &sd);
697
698         pipe_unlock(pipe);
699
700         if (sd.num_spliced)
701                 ret = sd.num_spliced;
702
703         return ret;
704 }
705
706 EXPORT_SYMBOL(iter_file_splice_write);
707
708 /**
709  * generic_splice_sendpage - splice data from a pipe to a socket
710  * @pipe:       pipe to splice from
711  * @out:        socket to write to
712  * @ppos:       position in @out
713  * @len:        number of bytes to splice
714  * @flags:      splice modifier flags
715  *
716  * Description:
717  *    Will send @len bytes from the pipe to a network socket. No data copying
718  *    is involved.
719  *
720  */
721 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
722                                 loff_t *ppos, size_t len, unsigned int flags)
723 {
724         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
725 }
726
727 EXPORT_SYMBOL(generic_splice_sendpage);
728
729 static int warn_unsupported(struct file *file, const char *op)
730 {
731         pr_debug_ratelimited(
732                 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
733                 op, file, current->pid, current->comm);
734         return -EINVAL;
735 }
736
737 /*
738  * Attempt to initiate a splice from pipe to file.
739  */
740 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
741                            loff_t *ppos, size_t len, unsigned int flags)
742 {
743         if (unlikely(!out->f_op->splice_write))
744                 return warn_unsupported(out, "write");
745         return out->f_op->splice_write(pipe, out, ppos, len, flags);
746 }
747
748 /*
749  * Attempt to initiate a splice from a file to a pipe.
750  */
751 static long do_splice_to(struct file *in, loff_t *ppos,
752                          struct pipe_inode_info *pipe, size_t len,
753                          unsigned int flags)
754 {
755         int ret;
756
757         if (unlikely(!(in->f_mode & FMODE_READ)))
758                 return -EBADF;
759
760         ret = rw_verify_area(READ, in, ppos, len);
761         if (unlikely(ret < 0))
762                 return ret;
763
764         if (unlikely(len > MAX_RW_COUNT))
765                 len = MAX_RW_COUNT;
766
767         if (unlikely(!in->f_op->splice_read))
768                 return warn_unsupported(in, "read");
769         return in->f_op->splice_read(in, ppos, pipe, len, flags);
770 }
771
772 /**
773  * splice_direct_to_actor - splices data directly between two non-pipes
774  * @in:         file to splice from
775  * @sd:         actor information on where to splice to
776  * @actor:      handles the data splicing
777  *
778  * Description:
779  *    This is a special case helper to splice directly between two
780  *    points, without requiring an explicit pipe. Internally an allocated
781  *    pipe is cached in the process, and reused during the lifetime of
782  *    that process.
783  *
784  */
785 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
786                                splice_direct_actor *actor)
787 {
788         struct pipe_inode_info *pipe;
789         long ret, bytes;
790         umode_t i_mode;
791         size_t len;
792         int i, flags, more;
793
794         /*
795          * We require the input being a regular file, as we don't want to
796          * randomly drop data for eg socket -> socket splicing. Use the
797          * piped splicing for that!
798          */
799         i_mode = file_inode(in)->i_mode;
800         if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
801                 return -EINVAL;
802
803         /*
804          * neither in nor out is a pipe, setup an internal pipe attached to
805          * 'out' and transfer the wanted data from 'in' to 'out' through that
806          */
807         pipe = current->splice_pipe;
808         if (unlikely(!pipe)) {
809                 pipe = alloc_pipe_info();
810                 if (!pipe)
811                         return -ENOMEM;
812
813                 /*
814                  * We don't have an immediate reader, but we'll read the stuff
815                  * out of the pipe right after the splice_to_pipe(). So set
816                  * PIPE_READERS appropriately.
817                  */
818                 pipe->readers = 1;
819
820                 current->splice_pipe = pipe;
821         }
822
823         /*
824          * Do the splice.
825          */
826         ret = 0;
827         bytes = 0;
828         len = sd->total_len;
829         flags = sd->flags;
830
831         /*
832          * Don't block on output, we have to drain the direct pipe.
833          */
834         sd->flags &= ~SPLICE_F_NONBLOCK;
835         more = sd->flags & SPLICE_F_MORE;
836
837         WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
838
839         while (len) {
840                 unsigned int p_space;
841                 size_t read_len;
842                 loff_t pos = sd->pos, prev_pos = pos;
843
844                 /* Don't try to read more the pipe has space for. */
845                 p_space = pipe->max_usage -
846                         pipe_occupancy(pipe->head, pipe->tail);
847                 read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
848                 ret = do_splice_to(in, &pos, pipe, read_len, flags);
849                 if (unlikely(ret <= 0))
850                         goto out_release;
851
852                 read_len = ret;
853                 sd->total_len = read_len;
854
855                 /*
856                  * If more data is pending, set SPLICE_F_MORE
857                  * If this is the last data and SPLICE_F_MORE was not set
858                  * initially, clears it.
859                  */
860                 if (read_len < len)
861                         sd->flags |= SPLICE_F_MORE;
862                 else if (!more)
863                         sd->flags &= ~SPLICE_F_MORE;
864                 /*
865                  * NOTE: nonblocking mode only applies to the input. We
866                  * must not do the output in nonblocking mode as then we
867                  * could get stuck data in the internal pipe:
868                  */
869                 ret = actor(pipe, sd);
870                 if (unlikely(ret <= 0)) {
871                         sd->pos = prev_pos;
872                         goto out_release;
873                 }
874
875                 bytes += ret;
876                 len -= ret;
877                 sd->pos = pos;
878
879                 if (ret < read_len) {
880                         sd->pos = prev_pos + ret;
881                         goto out_release;
882                 }
883         }
884
885 done:
886         pipe->tail = pipe->head = 0;
887         file_accessed(in);
888         return bytes;
889
890 out_release:
891         /*
892          * If we did an incomplete transfer we must release
893          * the pipe buffers in question:
894          */
895         for (i = 0; i < pipe->ring_size; i++) {
896                 struct pipe_buffer *buf = &pipe->bufs[i];
897
898                 if (buf->ops)
899                         pipe_buf_release(pipe, buf);
900         }
901
902         if (!bytes)
903                 bytes = ret;
904
905         goto done;
906 }
907 EXPORT_SYMBOL(splice_direct_to_actor);
908
909 static int direct_splice_actor(struct pipe_inode_info *pipe,
910                                struct splice_desc *sd)
911 {
912         struct file *file = sd->u.file;
913
914         return do_splice_from(pipe, file, sd->opos, sd->total_len,
915                               sd->flags);
916 }
917
918 /**
919  * do_splice_direct - splices data directly between two files
920  * @in:         file to splice from
921  * @ppos:       input file offset
922  * @out:        file to splice to
923  * @opos:       output file offset
924  * @len:        number of bytes to splice
925  * @flags:      splice modifier flags
926  *
927  * Description:
928  *    For use by do_sendfile(). splice can easily emulate sendfile, but
929  *    doing it in the application would incur an extra system call
930  *    (splice in + splice out, as compared to just sendfile()). So this helper
931  *    can splice directly through a process-private pipe.
932  *
933  */
934 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
935                       loff_t *opos, size_t len, unsigned int flags)
936 {
937         struct splice_desc sd = {
938                 .len            = len,
939                 .total_len      = len,
940                 .flags          = flags,
941                 .pos            = *ppos,
942                 .u.file         = out,
943                 .opos           = opos,
944         };
945         long ret;
946
947         if (unlikely(!(out->f_mode & FMODE_WRITE)))
948                 return -EBADF;
949
950         if (unlikely(out->f_flags & O_APPEND))
951                 return -EINVAL;
952
953         ret = rw_verify_area(WRITE, out, opos, len);
954         if (unlikely(ret < 0))
955                 return ret;
956
957         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
958         if (ret > 0)
959                 *ppos = sd.pos;
960
961         return ret;
962 }
963 EXPORT_SYMBOL(do_splice_direct);
964
965 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
966 {
967         for (;;) {
968                 if (unlikely(!pipe->readers)) {
969                         send_sig(SIGPIPE, current, 0);
970                         return -EPIPE;
971                 }
972                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
973                         return 0;
974                 if (flags & SPLICE_F_NONBLOCK)
975                         return -EAGAIN;
976                 if (signal_pending(current))
977                         return -ERESTARTSYS;
978                 pipe_wait(pipe);
979         }
980 }
981
982 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
983                                struct pipe_inode_info *opipe,
984                                size_t len, unsigned int flags);
985
986 /*
987  * Determine where to splice to/from.
988  */
989 long do_splice(struct file *in, loff_t __user *off_in,
990                 struct file *out, loff_t __user *off_out,
991                 size_t len, unsigned int flags)
992 {
993         struct pipe_inode_info *ipipe;
994         struct pipe_inode_info *opipe;
995         loff_t offset;
996         long ret;
997
998         if (unlikely(!(in->f_mode & FMODE_READ) ||
999                      !(out->f_mode & FMODE_WRITE)))
1000                 return -EBADF;
1001
1002         ipipe = get_pipe_info(in, true);
1003         opipe = get_pipe_info(out, true);
1004
1005         if (ipipe && opipe) {
1006                 if (off_in || off_out)
1007                         return -ESPIPE;
1008
1009                 /* Splicing to self would be fun, but... */
1010                 if (ipipe == opipe)
1011                         return -EINVAL;
1012
1013                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1014                         flags |= SPLICE_F_NONBLOCK;
1015
1016                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1017         }
1018
1019         if (ipipe) {
1020                 if (off_in)
1021                         return -ESPIPE;
1022                 if (off_out) {
1023                         if (!(out->f_mode & FMODE_PWRITE))
1024                                 return -EINVAL;
1025                         if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1026                                 return -EFAULT;
1027                 } else {
1028                         offset = out->f_pos;
1029                 }
1030
1031                 if (unlikely(out->f_flags & O_APPEND))
1032                         return -EINVAL;
1033
1034                 ret = rw_verify_area(WRITE, out, &offset, len);
1035                 if (unlikely(ret < 0))
1036                         return ret;
1037
1038                 if (in->f_flags & O_NONBLOCK)
1039                         flags |= SPLICE_F_NONBLOCK;
1040
1041                 file_start_write(out);
1042                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1043                 file_end_write(out);
1044
1045                 if (!off_out)
1046                         out->f_pos = offset;
1047                 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1048                         ret = -EFAULT;
1049
1050                 return ret;
1051         }
1052
1053         if (opipe) {
1054                 if (off_out)
1055                         return -ESPIPE;
1056                 if (off_in) {
1057                         if (!(in->f_mode & FMODE_PREAD))
1058                                 return -EINVAL;
1059                         if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1060                                 return -EFAULT;
1061                 } else {
1062                         offset = in->f_pos;
1063                 }
1064
1065                 if (out->f_flags & O_NONBLOCK)
1066                         flags |= SPLICE_F_NONBLOCK;
1067
1068                 pipe_lock(opipe);
1069                 ret = wait_for_space(opipe, flags);
1070                 if (!ret) {
1071                         unsigned int p_space;
1072
1073                         /* Don't try to read more the pipe has space for. */
1074                         p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
1075                         len = min_t(size_t, len, p_space << PAGE_SHIFT);
1076
1077                         ret = do_splice_to(in, &offset, opipe, len, flags);
1078                 }
1079                 pipe_unlock(opipe);
1080                 if (ret > 0)
1081                         wakeup_pipe_readers(opipe);
1082                 if (!off_in)
1083                         in->f_pos = offset;
1084                 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1085                         ret = -EFAULT;
1086
1087                 return ret;
1088         }
1089
1090         return -EINVAL;
1091 }
1092
1093 static int iter_to_pipe(struct iov_iter *from,
1094                         struct pipe_inode_info *pipe,
1095                         unsigned flags)
1096 {
1097         struct pipe_buffer buf = {
1098                 .ops = &user_page_pipe_buf_ops,
1099                 .flags = flags
1100         };
1101         size_t total = 0;
1102         int ret = 0;
1103         bool failed = false;
1104
1105         while (iov_iter_count(from) && !failed) {
1106                 struct page *pages[16];
1107                 ssize_t copied;
1108                 size_t start;
1109                 int n;
1110
1111                 copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1112                 if (copied <= 0) {
1113                         ret = copied;
1114                         break;
1115                 }
1116
1117                 for (n = 0; copied; n++, start = 0) {
1118                         int size = min_t(int, copied, PAGE_SIZE - start);
1119                         if (!failed) {
1120                                 buf.page = pages[n];
1121                                 buf.offset = start;
1122                                 buf.len = size;
1123                                 ret = add_to_pipe(pipe, &buf);
1124                                 if (unlikely(ret < 0)) {
1125                                         failed = true;
1126                                 } else {
1127                                         iov_iter_advance(from, ret);
1128                                         total += ret;
1129                                 }
1130                         } else {
1131                                 put_page(pages[n]);
1132                         }
1133                         copied -= size;
1134                 }
1135         }
1136         return total ? total : ret;
1137 }
1138
1139 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1140                         struct splice_desc *sd)
1141 {
1142         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1143         return n == sd->len ? n : -EFAULT;
1144 }
1145
1146 /*
1147  * For lack of a better implementation, implement vmsplice() to userspace
1148  * as a simple copy of the pipes pages to the user iov.
1149  */
1150 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1151                              unsigned int flags)
1152 {
1153         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1154         struct splice_desc sd = {
1155                 .total_len = iov_iter_count(iter),
1156                 .flags = flags,
1157                 .u.data = iter
1158         };
1159         long ret = 0;
1160
1161         if (!pipe)
1162                 return -EBADF;
1163
1164         if (sd.total_len) {
1165                 pipe_lock(pipe);
1166                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1167                 pipe_unlock(pipe);
1168         }
1169
1170         return ret;
1171 }
1172
1173 /*
1174  * vmsplice splices a user address range into a pipe. It can be thought of
1175  * as splice-from-memory, where the regular splice is splice-from-file (or
1176  * to file). In both cases the output is a pipe, naturally.
1177  */
1178 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1179                              unsigned int flags)
1180 {
1181         struct pipe_inode_info *pipe;
1182         long ret = 0;
1183         unsigned buf_flag = 0;
1184
1185         if (flags & SPLICE_F_GIFT)
1186                 buf_flag = PIPE_BUF_FLAG_GIFT;
1187
1188         pipe = get_pipe_info(file, true);
1189         if (!pipe)
1190                 return -EBADF;
1191
1192         pipe_lock(pipe);
1193         ret = wait_for_space(pipe, flags);
1194         if (!ret)
1195                 ret = iter_to_pipe(iter, pipe, buf_flag);
1196         pipe_unlock(pipe);
1197         if (ret > 0)
1198                 wakeup_pipe_readers(pipe);
1199         return ret;
1200 }
1201
1202 static int vmsplice_type(struct fd f, int *type)
1203 {
1204         if (!f.file)
1205                 return -EBADF;
1206         if (f.file->f_mode & FMODE_WRITE) {
1207                 *type = WRITE;
1208         } else if (f.file->f_mode & FMODE_READ) {
1209                 *type = READ;
1210         } else {
1211                 fdput(f);
1212                 return -EBADF;
1213         }
1214         return 0;
1215 }
1216
1217 /*
1218  * Note that vmsplice only really supports true splicing _from_ user memory
1219  * to a pipe, not the other way around. Splicing from user memory is a simple
1220  * operation that can be supported without any funky alignment restrictions
1221  * or nasty vm tricks. We simply map in the user memory and fill them into
1222  * a pipe. The reverse isn't quite as easy, though. There are two possible
1223  * solutions for that:
1224  *
1225  *      - memcpy() the data internally, at which point we might as well just
1226  *        do a regular read() on the buffer anyway.
1227  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1228  *        has restriction limitations on both ends of the pipe).
1229  *
1230  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1231  *
1232  */
1233 static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
1234 {
1235         if (unlikely(flags & ~SPLICE_F_ALL))
1236                 return -EINVAL;
1237
1238         if (!iov_iter_count(iter))
1239                 return 0;
1240
1241         if (iov_iter_rw(iter) == WRITE)
1242                 return vmsplice_to_pipe(f, iter, flags);
1243         else
1244                 return vmsplice_to_user(f, iter, flags);
1245 }
1246
1247 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1248                 unsigned long, nr_segs, unsigned int, flags)
1249 {
1250         struct iovec iovstack[UIO_FASTIOV];
1251         struct iovec *iov = iovstack;
1252         struct iov_iter iter;
1253         ssize_t error;
1254         struct fd f;
1255         int type;
1256
1257         f = fdget(fd);
1258         error = vmsplice_type(f, &type);
1259         if (error)
1260                 return error;
1261
1262         error = import_iovec(type, uiov, nr_segs,
1263                              ARRAY_SIZE(iovstack), &iov, &iter);
1264         if (error >= 0) {
1265                 error = do_vmsplice(f.file, &iter, flags);
1266                 kfree(iov);
1267         }
1268         fdput(f);
1269         return error;
1270 }
1271
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of vmsplice(2): identical to the native entry point
 * except that the user iovec is imported with compat_import_iovec().
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct fd f = fdget(fd);
	ssize_t ret;
	int type;

	/* On failure vmsplice_type() leaves nothing for us to put. */
	ret = vmsplice_type(f, &type);
	if (ret)
		return ret;

	ret = compat_import_iovec(type, iov32, nr_segs,
				  ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out_fdput;

	ret = do_vmsplice(f.file, &iter, flags);
	kfree(iov);

out_fdput:
	fdput(f);
	return ret;
}
#endif
1298
1299 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1300                 int, fd_out, loff_t __user *, off_out,
1301                 size_t, len, unsigned int, flags)
1302 {
1303         struct fd in, out;
1304         long error;
1305
1306         if (unlikely(!len))
1307                 return 0;
1308
1309         if (unlikely(flags & ~SPLICE_F_ALL))
1310                 return -EINVAL;
1311
1312         error = -EBADF;
1313         in = fdget(fd_in);
1314         if (in.file) {
1315                 out = fdget(fd_out);
1316                 if (out.file) {
1317                         error = do_splice(in.file, off_in, out.file, off_out,
1318                                           len, flags);
1319                         fdput(out);
1320                 }
1321                 fdput(in);
1322         }
1323         return error;
1324 }
1325
1326 /*
1327  * Make sure there's data to read. Wait for input if we can, otherwise
1328  * return an appropriate error.
1329  */
1330 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1331 {
1332         int ret;
1333
1334         /*
1335          * Check the pipe occupancy without the inode lock first. This function
1336          * is speculative anyways, so missing one is ok.
1337          */
1338         if (!pipe_empty(pipe->head, pipe->tail))
1339                 return 0;
1340
1341         ret = 0;
1342         pipe_lock(pipe);
1343
1344         while (pipe_empty(pipe->head, pipe->tail)) {
1345                 if (signal_pending(current)) {
1346                         ret = -ERESTARTSYS;
1347                         break;
1348                 }
1349                 if (!pipe->writers)
1350                         break;
1351                 if (flags & SPLICE_F_NONBLOCK) {
1352                         ret = -EAGAIN;
1353                         break;
1354                 }
1355                 pipe_wait(pipe);
1356         }
1357
1358         pipe_unlock(pipe);
1359         return ret;
1360 }
1361
1362 /*
1363  * Make sure there's writeable room. Wait for room if we can, otherwise
1364  * return an appropriate error.
1365  */
1366 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1367 {
1368         int ret;
1369
1370         /*
1371          * Check pipe occupancy without the inode lock first. This function
1372          * is speculative anyways, so missing one is ok.
1373          */
1374         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1375                 return 0;
1376
1377         ret = 0;
1378         pipe_lock(pipe);
1379
1380         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1381                 if (!pipe->readers) {
1382                         send_sig(SIGPIPE, current, 0);
1383                         ret = -EPIPE;
1384                         break;
1385                 }
1386                 if (flags & SPLICE_F_NONBLOCK) {
1387                         ret = -EAGAIN;
1388                         break;
1389                 }
1390                 if (signal_pending(current)) {
1391                         ret = -ERESTARTSYS;
1392                         break;
1393                 }
1394                 pipe_wait(pipe);
1395         }
1396
1397         pipe_unlock(pipe);
1398         return ret;
1399 }
1400
1401 /*
1402  * Splice contents of ipipe to opipe.
1403  */
1404 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1405                                struct pipe_inode_info *opipe,
1406                                size_t len, unsigned int flags)
1407 {
1408         struct pipe_buffer *ibuf, *obuf;
1409         unsigned int i_head, o_head;
1410         unsigned int i_tail, o_tail;
1411         unsigned int i_mask, o_mask;
1412         int ret = 0;
1413         bool input_wakeup = false;
1414
1415
1416 retry:
1417         ret = ipipe_prep(ipipe, flags);
1418         if (ret)
1419                 return ret;
1420
1421         ret = opipe_prep(opipe, flags);
1422         if (ret)
1423                 return ret;
1424
1425         /*
1426          * Potential ABBA deadlock, work around it by ordering lock
1427          * grabbing by pipe info address. Otherwise two different processes
1428          * could deadlock (one doing tee from A -> B, the other from B -> A).
1429          */
1430         pipe_double_lock(ipipe, opipe);
1431
1432         i_tail = ipipe->tail;
1433         i_mask = ipipe->ring_size - 1;
1434         o_head = opipe->head;
1435         o_mask = opipe->ring_size - 1;
1436
1437         do {
1438                 size_t o_len;
1439
1440                 if (!opipe->readers) {
1441                         send_sig(SIGPIPE, current, 0);
1442                         if (!ret)
1443                                 ret = -EPIPE;
1444                         break;
1445                 }
1446
1447                 i_head = ipipe->head;
1448                 o_tail = opipe->tail;
1449
1450                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1451                         break;
1452
1453                 /*
1454                  * Cannot make any progress, because either the input
1455                  * pipe is empty or the output pipe is full.
1456                  */
1457                 if (pipe_empty(i_head, i_tail) ||
1458                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1459                         /* Already processed some buffers, break */
1460                         if (ret)
1461                                 break;
1462
1463                         if (flags & SPLICE_F_NONBLOCK) {
1464                                 ret = -EAGAIN;
1465                                 break;
1466                         }
1467
1468                         /*
1469                          * We raced with another reader/writer and haven't
1470                          * managed to process any buffers.  A zero return
1471                          * value means EOF, so retry instead.
1472                          */
1473                         pipe_unlock(ipipe);
1474                         pipe_unlock(opipe);
1475                         goto retry;
1476                 }
1477
1478                 ibuf = &ipipe->bufs[i_tail & i_mask];
1479                 obuf = &opipe->bufs[o_head & o_mask];
1480
1481                 if (len >= ibuf->len) {
1482                         /*
1483                          * Simply move the whole buffer from ipipe to opipe
1484                          */
1485                         *obuf = *ibuf;
1486                         ibuf->ops = NULL;
1487                         i_tail++;
1488                         ipipe->tail = i_tail;
1489                         input_wakeup = true;
1490                         o_len = obuf->len;
1491                         o_head++;
1492                         opipe->head = o_head;
1493                 } else {
1494                         /*
1495                          * Get a reference to this pipe buffer,
1496                          * so we can copy the contents over.
1497                          */
1498                         if (!pipe_buf_get(ipipe, ibuf)) {
1499                                 if (ret == 0)
1500                                         ret = -EFAULT;
1501                                 break;
1502                         }
1503                         *obuf = *ibuf;
1504
1505                         /*
1506                          * Don't inherit the gift and merge flags, we need to
1507                          * prevent multiple steals of this page.
1508                          */
1509                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1510                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1511
1512                         obuf->len = len;
1513                         ibuf->offset += len;
1514                         ibuf->len -= len;
1515                         o_len = len;
1516                         o_head++;
1517                         opipe->head = o_head;
1518                 }
1519                 ret += o_len;
1520                 len -= o_len;
1521         } while (len);
1522
1523         pipe_unlock(ipipe);
1524         pipe_unlock(opipe);
1525
1526         /*
1527          * If we put data in the output pipe, wakeup any potential readers.
1528          */
1529         if (ret > 0)
1530                 wakeup_pipe_readers(opipe);
1531
1532         if (input_wakeup)
1533                 wakeup_pipe_writers(ipipe);
1534
1535         return ret;
1536 }
1537
1538 /*
1539  * Link contents of ipipe to opipe.
1540  */
1541 static int link_pipe(struct pipe_inode_info *ipipe,
1542                      struct pipe_inode_info *opipe,
1543                      size_t len, unsigned int flags)
1544 {
1545         struct pipe_buffer *ibuf, *obuf;
1546         unsigned int i_head, o_head;
1547         unsigned int i_tail, o_tail;
1548         unsigned int i_mask, o_mask;
1549         int ret = 0;
1550
1551         /*
1552          * Potential ABBA deadlock, work around it by ordering lock
1553          * grabbing by pipe info address. Otherwise two different processes
1554          * could deadlock (one doing tee from A -> B, the other from B -> A).
1555          */
1556         pipe_double_lock(ipipe, opipe);
1557
1558         i_tail = ipipe->tail;
1559         i_mask = ipipe->ring_size - 1;
1560         o_head = opipe->head;
1561         o_mask = opipe->ring_size - 1;
1562
1563         do {
1564                 if (!opipe->readers) {
1565                         send_sig(SIGPIPE, current, 0);
1566                         if (!ret)
1567                                 ret = -EPIPE;
1568                         break;
1569                 }
1570
1571                 i_head = ipipe->head;
1572                 o_tail = opipe->tail;
1573
1574                 /*
1575                  * If we have iterated all input buffers or run out of
1576                  * output room, break.
1577                  */
1578                 if (pipe_empty(i_head, i_tail) ||
1579                     pipe_full(o_head, o_tail, opipe->max_usage))
1580                         break;
1581
1582                 ibuf = &ipipe->bufs[i_tail & i_mask];
1583                 obuf = &opipe->bufs[o_head & o_mask];
1584
1585                 /*
1586                  * Get a reference to this pipe buffer,
1587                  * so we can copy the contents over.
1588                  */
1589                 if (!pipe_buf_get(ipipe, ibuf)) {
1590                         if (ret == 0)
1591                                 ret = -EFAULT;
1592                         break;
1593                 }
1594
1595                 *obuf = *ibuf;
1596
1597                 /*
1598                  * Don't inherit the gift and merge flag, we need to prevent
1599                  * multiple steals of this page.
1600                  */
1601                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1602                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1603
1604                 if (obuf->len > len)
1605                         obuf->len = len;
1606                 ret += obuf->len;
1607                 len -= obuf->len;
1608
1609                 o_head++;
1610                 opipe->head = o_head;
1611                 i_tail++;
1612         } while (len);
1613
1614         pipe_unlock(ipipe);
1615         pipe_unlock(opipe);
1616
1617         /*
1618          * If we put data in the output pipe, wakeup any potential readers.
1619          */
1620         if (ret > 0)
1621                 wakeup_pipe_readers(opipe);
1622
1623         return ret;
1624 }
1625
1626 /*
1627  * This is a tee(1) implementation that works on pipes. It doesn't copy
1628  * any data, it simply references the 'in' pages on the 'out' pipe.
1629  * The 'flags' used are the SPLICE_F_* variants, currently the only
1630  * applicable one is SPLICE_F_NONBLOCK.
1631  */
1632 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1633 {
1634         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1635         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1636         int ret = -EINVAL;
1637
1638         if (unlikely(!(in->f_mode & FMODE_READ) ||
1639                      !(out->f_mode & FMODE_WRITE)))
1640                 return -EBADF;
1641
1642         /*
1643          * Duplicate the contents of ipipe to opipe without actually
1644          * copying the data.
1645          */
1646         if (ipipe && opipe && ipipe != opipe) {
1647                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1648                         flags |= SPLICE_F_NONBLOCK;
1649
1650                 /*
1651                  * Keep going, unless we encounter an error. The ipipe/opipe
1652                  * ordering doesn't really matter.
1653                  */
1654                 ret = ipipe_prep(ipipe, flags);
1655                 if (!ret) {
1656                         ret = opipe_prep(opipe, flags);
1657                         if (!ret)
1658                                 ret = link_pipe(ipipe, opipe, len, flags);
1659                 }
1660         }
1661
1662         return ret;
1663 }
1664
1665 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1666 {
1667         struct fd in, out;
1668         int error;
1669
1670         if (unlikely(flags & ~SPLICE_F_ALL))
1671                 return -EINVAL;
1672
1673         if (unlikely(!len))
1674                 return 0;
1675
1676         error = -EBADF;
1677         in = fdget(fdin);
1678         if (in.file) {
1679                 out = fdget(fdout);
1680                 if (out.file) {
1681                         error = do_tee(in.file, out.file, len, flags);
1682                         fdput(out);
1683                 }
1684                 fdput(in);
1685         }
1686
1687         return error;
1688 }