Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * "splice": joining two ropes together by interweaving their strands.
4 : *
5 : * This is the "extended pipe" functionality, where a pipe is used as
6 : * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7 : * buffer that you can use to transfer data from one end to the other.
8 : *
9 : * The traditional unix read/write is extended with a "splice()" operation
10 : * that transfers data buffers to or from a pipe buffer.
11 : *
12 : * Named by Larry McVoy, original implementation from Linus, extended by
13 : * Jens to support splicing to files, network, direct splicing, etc and
14 : * fixing lots of bugs.
15 : *
16 : * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17 : * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18 : * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19 : *
20 : */
21 : #include <linux/bvec.h>
22 : #include <linux/fs.h>
23 : #include <linux/file.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/splice.h>
26 : #include <linux/memcontrol.h>
27 : #include <linux/mm_inline.h>
28 : #include <linux/swap.h>
29 : #include <linux/writeback.h>
30 : #include <linux/export.h>
31 : #include <linux/syscalls.h>
32 : #include <linux/uio.h>
33 : #include <linux/fsnotify.h>
34 : #include <linux/security.h>
35 : #include <linux/gfp.h>
36 : #include <linux/net.h>
37 : #include <linux/socket.h>
38 : #include <linux/sched/signal.h>
39 :
40 : #include "internal.h"
41 :
42 : /*
43 : * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44 : * indicate they support non-blocking reads or writes, we must clear it
45 : * here if set to avoid blocking other users of this pipe if splice is
46 : * being done on it.
47 : */
48 2567300 : static noinline void noinline pipe_clear_nowait(struct file *file)
49 : {
50 2567300 : fmode_t fmode = READ_ONCE(file->f_mode);
51 :
52 2567300 : do {
53 2567300 : if (!(fmode & FMODE_NOWAIT))
54 : break;
55 2443298 : } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 2567301 : }
57 :
58 : /*
59 : * Attempt to steal a page from a pipe buffer. This should perhaps go into
60 : * a vm helper function, it's already simplified quite a bit by the
61 : * addition of remove_mapping(). If success is returned, the caller may
62 : * attempt to reuse this page for another destination.
63 : */
64 0 : static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65 : struct pipe_buffer *buf)
66 : {
67 0 : struct folio *folio = page_folio(buf->page);
68 0 : struct address_space *mapping;
69 :
70 0 : folio_lock(folio);
71 :
72 0 : mapping = folio_mapping(folio);
73 0 : if (mapping) {
74 0 : WARN_ON(!folio_test_uptodate(folio));
75 :
76 : /*
77 : * At least for ext2 with nobh option, we need to wait on
78 : * writeback completing on this folio, since we'll remove it
79 : * from the pagecache. Otherwise truncate wont wait on the
80 : * folio, allowing the disk blocks to be reused by someone else
81 : * before we actually wrote our data to them. fs corruption
82 : * ensues.
83 : */
84 0 : folio_wait_writeback(folio);
85 :
86 0 : if (folio_has_private(folio) &&
87 0 : !filemap_release_folio(folio, GFP_KERNEL))
88 0 : goto out_unlock;
89 :
90 : /*
91 : * If we succeeded in removing the mapping, set LRU flag
92 : * and return good.
93 : */
94 0 : if (remove_mapping(mapping, folio)) {
95 0 : buf->flags |= PIPE_BUF_FLAG_LRU;
96 0 : return true;
97 : }
98 : }
99 :
100 : /*
101 : * Raced with truncate or failed to remove folio from current
102 : * address space, unlock and return failure.
103 : */
104 0 : out_unlock:
105 0 : folio_unlock(folio);
106 0 : return false;
107 : }
108 :
109 12548559 : static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
110 : struct pipe_buffer *buf)
111 : {
112 12548559 : put_page(buf->page);
113 12548609 : buf->flags &= ~PIPE_BUF_FLAG_LRU;
114 12548609 : }
115 :
116 : /*
117 : * Check whether the contents of buf is OK to access. Since the content
118 : * is a page cache page, IO may be in flight.
119 : */
120 12546465 : static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
121 : struct pipe_buffer *buf)
122 : {
123 12546465 : struct page *page = buf->page;
124 12546465 : int err;
125 :
126 12546465 : if (!PageUptodate(page)) {
127 0 : lock_page(page);
128 :
129 : /*
130 : * Page got truncated/unhashed. This will cause a 0-byte
131 : * splice, if this is the first page.
132 : */
133 0 : if (!page->mapping) {
134 0 : err = -ENODATA;
135 0 : goto error;
136 : }
137 :
138 : /*
139 : * Uh oh, read-error from disk.
140 : */
141 0 : if (!PageUptodate(page)) {
142 0 : err = -EIO;
143 0 : goto error;
144 : }
145 :
146 : /*
147 : * Page is ok afterall, we are done.
148 : */
149 0 : unlock_page(page);
150 : }
151 :
152 : return 0;
153 0 : error:
154 0 : unlock_page(page);
155 0 : return err;
156 : }
157 :
158 : const struct pipe_buf_operations page_cache_pipe_buf_ops = {
159 : .confirm = page_cache_pipe_buf_confirm,
160 : .release = page_cache_pipe_buf_release,
161 : .try_steal = page_cache_pipe_buf_try_steal,
162 : .get = generic_pipe_buf_get,
163 : };
164 :
165 0 : static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
166 : struct pipe_buffer *buf)
167 : {
168 0 : if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
169 : return false;
170 :
171 0 : buf->flags |= PIPE_BUF_FLAG_LRU;
172 0 : return generic_pipe_buf_try_steal(pipe, buf);
173 : }
174 :
175 : static const struct pipe_buf_operations user_page_pipe_buf_ops = {
176 : .release = page_cache_pipe_buf_release,
177 : .try_steal = user_page_pipe_buf_try_steal,
178 : .get = generic_pipe_buf_get,
179 : };
180 :
181 1238626 : static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
182 : {
183 1238626 : smp_mb();
184 1238626 : if (waitqueue_active(&pipe->rd_wait))
185 0 : wake_up_interruptible(&pipe->rd_wait);
186 1238626 : kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
187 1238626 : }
188 :
189 : /**
190 : * splice_to_pipe - fill passed data into a pipe
191 : * @pipe: pipe to fill
192 : * @spd: data to fill
193 : *
194 : * Description:
195 : * @spd contains a map of pages and len/offset tuples, along with
196 : * the struct pipe_buf_operations associated with these pages. This
197 : * function will link that data to the pipe.
198 : *
199 : */
200 64624 : ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201 : struct splice_pipe_desc *spd)
202 : {
203 64624 : unsigned int spd_pages = spd->nr_pages;
204 64624 : unsigned int tail = pipe->tail;
205 64624 : unsigned int head = pipe->head;
206 64624 : unsigned int mask = pipe->ring_size - 1;
207 64624 : int ret = 0, page_nr = 0;
208 :
209 64624 : if (!spd_pages)
210 : return 0;
211 :
212 64624 : if (unlikely(!pipe->readers)) {
213 0 : send_sig(SIGPIPE, current, 0);
214 0 : ret = -EPIPE;
215 0 : goto out;
216 : }
217 :
218 71436 : while (!pipe_full(head, tail, pipe->max_usage)) {
219 71434 : struct pipe_buffer *buf = &pipe->bufs[head & mask];
220 :
221 71434 : buf->page = spd->pages[page_nr];
222 71434 : buf->offset = spd->partial[page_nr].offset;
223 71434 : buf->len = spd->partial[page_nr].len;
224 71434 : buf->private = spd->partial[page_nr].private;
225 71434 : buf->ops = spd->ops;
226 71434 : buf->flags = 0;
227 :
228 71434 : head++;
229 71434 : pipe->head = head;
230 71434 : page_nr++;
231 71434 : ret += buf->len;
232 :
233 71434 : if (!--spd->nr_pages)
234 : break;
235 : }
236 :
237 64624 : if (!ret)
238 1 : ret = -EAGAIN;
239 :
240 64624 : out:
241 64626 : while (page_nr < spd_pages)
242 2 : spd->spd_release(spd, page_nr++);
243 :
244 64624 : return ret;
245 : }
246 : EXPORT_SYMBOL_GPL(splice_to_pipe);
247 :
248 2560 : ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
249 : {
250 2560 : unsigned int head = pipe->head;
251 2560 : unsigned int tail = pipe->tail;
252 2560 : unsigned int mask = pipe->ring_size - 1;
253 2560 : int ret;
254 :
255 2560 : if (unlikely(!pipe->readers)) {
256 0 : send_sig(SIGPIPE, current, 0);
257 0 : ret = -EPIPE;
258 2560 : } else if (pipe_full(head, tail, pipe->max_usage)) {
259 : ret = -EAGAIN;
260 : } else {
261 2559 : pipe->bufs[head & mask] = *buf;
262 2559 : pipe->head = head + 1;
263 2559 : return buf->len;
264 : }
265 1 : pipe_buf_release(pipe, buf);
266 0 : return ret;
267 : }
268 : EXPORT_SYMBOL(add_to_pipe);
269 :
270 : /*
271 : * Check if we need to grow the arrays holding pages and partial page
272 : * descriptions.
273 : */
274 0 : int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
275 : {
276 0 : unsigned int max_usage = READ_ONCE(pipe->max_usage);
277 :
278 0 : spd->nr_pages_max = max_usage;
279 0 : if (max_usage <= PIPE_DEF_BUFFERS)
280 : return 0;
281 :
282 0 : spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
283 0 : spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
284 : GFP_KERNEL);
285 :
286 0 : if (spd->pages && spd->partial)
287 : return 0;
288 :
289 0 : kfree(spd->pages);
290 0 : kfree(spd->partial);
291 0 : return -ENOMEM;
292 : }
293 :
294 0 : void splice_shrink_spd(struct splice_pipe_desc *spd)
295 : {
296 0 : if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
297 : return;
298 :
299 0 : kfree(spd->pages);
300 0 : kfree(spd->partial);
301 : }
302 :
303 : /**
304 : * copy_splice_read - Copy data from a file and splice the copy into a pipe
305 : * @in: The file to read from
306 : * @ppos: Pointer to the file position to read from
307 : * @pipe: The pipe to splice into
308 : * @len: The amount to splice
309 : * @flags: The SPLICE_F_* flags
310 : *
311 : * This function allocates a bunch of pages sufficient to hold the requested
312 : * amount of data (but limited by the remaining pipe capacity), passes it to
313 : * the file's ->read_iter() to read into and then splices the used pages into
314 : * the pipe.
315 : *
316 : * Return: On success, the number of bytes read will be returned and *@ppos
317 : * will be updated if appropriate; 0 will be returned if there is no more data
318 : * to be read; -EAGAIN will be returned if the pipe had no space, and some
319 : * other negative error code will be returned on error. A short read may occur
320 : * if the pipe has insufficient space, we reach the end of the data or we hit a
321 : * hole.
322 : */
323 918 : ssize_t copy_splice_read(struct file *in, loff_t *ppos,
324 : struct pipe_inode_info *pipe,
325 : size_t len, unsigned int flags)
326 : {
327 918 : struct iov_iter to;
328 918 : struct bio_vec *bv;
329 918 : struct kiocb kiocb;
330 918 : struct page **pages;
331 918 : ssize_t ret;
332 918 : size_t used, npages, chunk, remain, keep = 0;
333 918 : int i;
334 :
335 : /* Work out how much data we can actually add into the pipe */
336 918 : used = pipe_occupancy(pipe->head, pipe->tail);
337 918 : npages = max_t(ssize_t, pipe->max_usage - used, 0);
338 918 : len = min_t(size_t, len, npages * PAGE_SIZE);
339 918 : npages = DIV_ROUND_UP(len, PAGE_SIZE);
340 :
341 918 : bv = kzalloc(array_size(npages, sizeof(bv[0])) +
342 : array_size(npages, sizeof(struct page *)), GFP_KERNEL);
343 918 : if (!bv)
344 : return -ENOMEM;
345 :
346 918 : pages = (struct page **)(bv + npages);
347 918 : npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
348 918 : if (!npages) {
349 0 : kfree(bv);
350 0 : return -ENOMEM;
351 : }
352 :
353 918 : remain = len = min_t(size_t, len, npages * PAGE_SIZE);
354 :
355 1840 : for (i = 0; i < npages; i++) {
356 922 : chunk = min_t(size_t, PAGE_SIZE, remain);
357 922 : bv[i].bv_page = pages[i];
358 922 : bv[i].bv_offset = 0;
359 922 : bv[i].bv_len = chunk;
360 922 : remain -= chunk;
361 : }
362 :
363 : /* Do the I/O */
364 918 : iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
365 918 : init_sync_kiocb(&kiocb, in);
366 918 : kiocb.ki_pos = *ppos;
367 918 : ret = call_read_iter(in, &kiocb, &to);
368 :
369 918 : if (ret > 0) {
370 918 : keep = DIV_ROUND_UP(ret, PAGE_SIZE);
371 918 : *ppos = kiocb.ki_pos;
372 : }
373 :
374 : /*
375 : * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
376 : * there", rather than -EFAULT.
377 : */
378 918 : if (ret == -EFAULT)
379 0 : ret = -EAGAIN;
380 :
381 : /* Free any pages that didn't get touched at all. */
382 918 : if (keep < npages)
383 0 : release_pages(pages + keep, npages - keep);
384 :
385 : /* Push the remaining pages into the pipe. */
386 918 : remain = ret;
387 1840 : for (i = 0; i < keep; i++) {
388 922 : struct pipe_buffer *buf = pipe_head_buf(pipe);
389 :
390 922 : chunk = min_t(size_t, remain, PAGE_SIZE);
391 922 : *buf = (struct pipe_buffer) {
392 : .ops = &default_pipe_buf_ops,
393 922 : .page = bv[i].bv_page,
394 : .offset = 0,
395 : .len = chunk,
396 : };
397 922 : pipe->head++;
398 922 : remain -= chunk;
399 : }
400 :
401 918 : kfree(bv);
402 918 : return ret;
403 : }
404 : EXPORT_SYMBOL(copy_splice_read);
405 :
406 : const struct pipe_buf_operations default_pipe_buf_ops = {
407 : .release = generic_pipe_buf_release,
408 : .try_steal = generic_pipe_buf_try_steal,
409 : .get = generic_pipe_buf_get,
410 : };
411 :
412 : /* Pipe buffer operations for a socket and similar. */
413 : const struct pipe_buf_operations nosteal_pipe_buf_ops = {
414 : .release = generic_pipe_buf_release,
415 : .get = generic_pipe_buf_get,
416 : };
417 : EXPORT_SYMBOL(nosteal_pipe_buf_ops);
418 :
419 1218379 : static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
420 : {
421 1218379 : smp_mb();
422 1218379 : if (waitqueue_active(&pipe->wr_wait))
423 0 : wake_up_interruptible(&pipe->wr_wait);
424 1218379 : kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
425 1218379 : }
426 :
427 : /**
428 : * splice_from_pipe_feed - feed available data from a pipe to a file
429 : * @pipe: pipe to splice from
430 : * @sd: information to @actor
431 : * @actor: handler that splices the data
432 : *
433 : * Description:
434 : * This function loops over the pipe and calls @actor to do the
435 : * actual moving of a single struct pipe_buffer to the desired
436 : * destination. It returns when there's no more buffers left in
437 : * the pipe or if the requested number of bytes (@sd->total_len)
438 : * have been copied. It returns a positive number (one) if the
439 : * pipe needs to be filled with more data, zero if the required
440 : * number of bytes have been copied and -errno on error.
441 : *
442 : * This, together with splice_from_pipe_{begin,end,next}, may be
443 : * used to implement the functionality of __splice_from_pipe() when
444 : * locking is required around copying the pipe buffers to the
445 : * destination.
446 : */
447 397 : static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
448 : splice_actor *actor)
449 : {
450 397 : unsigned int head = pipe->head;
451 397 : unsigned int tail = pipe->tail;
452 397 : unsigned int mask = pipe->ring_size - 1;
453 397 : int ret;
454 :
455 794 : while (!pipe_empty(head, tail)) {
456 397 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
457 :
458 397 : sd->len = buf->len;
459 397 : if (sd->len > sd->total_len)
460 0 : sd->len = sd->total_len;
461 :
462 397 : ret = pipe_buf_confirm(pipe, buf);
463 397 : if (unlikely(ret)) {
464 0 : if (ret == -ENODATA)
465 0 : ret = 0;
466 0 : return ret;
467 : }
468 :
469 397 : ret = actor(pipe, buf, sd);
470 397 : if (ret <= 0)
471 0 : return ret;
472 :
473 397 : buf->offset += ret;
474 397 : buf->len -= ret;
475 :
476 397 : sd->num_spliced += ret;
477 397 : sd->len -= ret;
478 397 : sd->pos += ret;
479 397 : sd->total_len -= ret;
480 :
481 397 : if (!buf->len) {
482 397 : pipe_buf_release(pipe, buf);
483 397 : tail++;
484 397 : pipe->tail = tail;
485 397 : if (pipe->files)
486 397 : sd->need_wakeup = true;
487 : }
488 :
489 397 : if (!sd->total_len)
490 : return 0;
491 : }
492 :
493 : return 1;
494 : }
495 :
496 : /* We know we have a pipe buffer, but maybe it's empty? */
497 6349755 : static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
498 : {
499 6349755 : unsigned int tail = pipe->tail;
500 6349755 : unsigned int mask = pipe->ring_size - 1;
501 6349755 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
502 :
503 6349755 : if (unlikely(!buf->len)) {
504 0 : pipe_buf_release(pipe, buf);
505 0 : pipe->tail = tail+1;
506 0 : return true;
507 : }
508 :
509 : return false;
510 : }
511 :
512 : /**
513 : * splice_from_pipe_next - wait for some data to splice from
514 : * @pipe: pipe to splice from
515 : * @sd: information about the splice operation
516 : *
517 : * Description:
518 : * This function will wait for some data and return a positive
519 : * value (one) if pipe buffers are available. It will return zero
520 : * or -errno if no more data needs to be spliced.
521 : */
522 6427142 : static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
523 : {
524 : /*
525 : * Check for signal early to make process killable when there are
526 : * always buffers available
527 : */
528 6427142 : if (signal_pending(current))
529 : return -ERESTARTSYS;
530 :
531 6427183 : repeat:
532 6462813 : while (pipe_empty(pipe->head, pipe->tail)) {
533 113057 : if (!pipe->writers)
534 : return 0;
535 :
536 35979 : if (sd->num_spliced)
537 : return 0;
538 :
539 35631 : if (sd->flags & SPLICE_F_NONBLOCK)
540 : return -EAGAIN;
541 :
542 35631 : if (signal_pending(current))
543 : return -ERESTARTSYS;
544 :
545 35631 : if (sd->need_wakeup) {
546 0 : wakeup_pipe_writers(pipe);
547 0 : sd->need_wakeup = false;
548 : }
549 :
550 35631 : pipe_wait_readable(pipe);
551 : }
552 :
553 6349756 : if (eat_empty_buffer(pipe))
554 0 : goto repeat;
555 :
556 : return 1;
557 : }
558 :
559 : /**
560 : * splice_from_pipe_begin - start splicing from pipe
561 : * @sd: information about the splice operation
562 : *
563 : * Description:
564 : * This function should be called before a loop containing
565 : * splice_from_pipe_next() and splice_from_pipe_feed() to
566 : * initialize the necessary fields of @sd.
567 : */
568 : static void splice_from_pipe_begin(struct splice_desc *sd)
569 : {
570 6426504 : sd->num_spliced = 0;
571 6426504 : sd->need_wakeup = false;
572 6426504 : }
573 :
574 : /**
575 : * splice_from_pipe_end - finish splicing from pipe
576 : * @pipe: pipe to splice from
577 : * @sd: information about the splice operation
578 : *
579 : * Description:
580 : * This function will wake up pipe writers if necessary. It should
581 : * be called after a loop containing splice_from_pipe_next() and
582 : * splice_from_pipe_feed().
583 : */
584 : static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
585 : {
586 6426527 : if (sd->need_wakeup)
587 1154002 : wakeup_pipe_writers(pipe);
588 : }
589 :
590 : /**
591 : * __splice_from_pipe - splice data from a pipe to given actor
592 : * @pipe: pipe to splice from
593 : * @sd: information to @actor
594 : * @actor: handler that splices the data
595 : *
596 : * Description:
597 : * This function does little more than loop over the pipe and call
598 : * @actor to do the actual moving of a single struct pipe_buffer to
599 : * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
600 : * pipe_to_user.
601 : *
602 : */
603 77433 : ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
604 : splice_actor *actor)
605 : {
606 77433 : int ret;
607 :
608 77433 : splice_from_pipe_begin(sd);
609 77830 : do {
610 77830 : cond_resched();
611 77830 : ret = splice_from_pipe_next(pipe, sd);
612 77824 : if (ret > 0)
613 397 : ret = splice_from_pipe_feed(pipe, sd, actor);
614 77824 : } while (ret > 0);
615 77427 : splice_from_pipe_end(pipe, sd);
616 :
617 77427 : return sd->num_spliced ? sd->num_spliced : ret;
618 : }
619 : EXPORT_SYMBOL(__splice_from_pipe);
620 :
621 : /**
622 : * splice_from_pipe - splice data from a pipe to a file
623 : * @pipe: pipe to splice from
624 : * @out: file to splice to
625 : * @ppos: position in @out
626 : * @len: how many bytes to splice
627 : * @flags: splice modifier flags
628 : * @actor: handler that splices the data
629 : *
630 : * Description:
631 : * See __splice_from_pipe. This function locks the pipe inode,
632 : * otherwise it's identical to __splice_from_pipe().
633 : *
634 : */
635 77432 : ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
636 : loff_t *ppos, size_t len, unsigned int flags,
637 : splice_actor *actor)
638 : {
639 77432 : ssize_t ret;
640 77432 : struct splice_desc sd = {
641 : .total_len = len,
642 : .flags = flags,
643 77432 : .pos = *ppos,
644 : .u.file = out,
645 : };
646 :
647 77432 : pipe_lock(pipe);
648 77431 : ret = __splice_from_pipe(pipe, &sd, actor);
649 77423 : pipe_unlock(pipe);
650 :
651 77424 : return ret;
652 : }
653 :
654 : /**
655 : * iter_file_splice_write - splice data from a pipe to a file
656 : * @pipe: pipe info
657 : * @out: file to write to
658 : * @ppos: position in @out
659 : * @len: number of bytes to splice
660 : * @flags: splice modifier flags
661 : *
662 : * Description:
663 : * Will either move or copy pages (determined by @flags options) from
664 : * the given pipe inode to the given file.
665 : * This one is ->write_iter-based.
666 : *
667 : */
668 : ssize_t
669 6349082 : iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
670 : loff_t *ppos, size_t len, unsigned int flags)
671 : {
672 6349082 : struct splice_desc sd = {
673 : .total_len = len,
674 : .flags = flags,
675 6349082 : .pos = *ppos,
676 : .u.file = out,
677 : };
678 6349082 : int nbufs = pipe->max_usage;
679 6349082 : struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
680 : GFP_KERNEL);
681 6349069 : ssize_t ret;
682 :
683 6349069 : if (unlikely(!array))
684 : return -ENOMEM;
685 :
686 6349069 : pipe_lock(pipe);
687 :
688 6349071 : splice_from_pipe_begin(&sd);
689 12594149 : while (sd.total_len) {
690 6349345 : struct iov_iter from;
691 6349345 : unsigned int head, tail, mask;
692 6349345 : size_t left;
693 6349345 : int n;
694 :
695 6349345 : ret = splice_from_pipe_next(pipe, &sd);
696 6349340 : if (ret <= 0)
697 : break;
698 :
699 6349334 : if (unlikely(nbufs < pipe->max_usage)) {
700 0 : kfree(array);
701 0 : nbufs = pipe->max_usage;
702 0 : array = kcalloc(nbufs, sizeof(struct bio_vec),
703 : GFP_KERNEL);
704 0 : if (!array) {
705 : ret = -ENOMEM;
706 : break;
707 : }
708 : }
709 :
710 6349334 : head = pipe->head;
711 6349334 : tail = pipe->tail;
712 6349334 : mask = pipe->ring_size - 1;
713 :
714 : /* build the vector */
715 6349334 : left = sd.total_len;
716 18899274 : for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
717 12549942 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
718 12549942 : size_t this_len = buf->len;
719 :
720 : /* zero-length bvecs are not supported, skip them */
721 12549942 : if (!this_len)
722 0 : continue;
723 12549942 : this_len = min(this_len, left);
724 :
725 12549942 : ret = pipe_buf_confirm(pipe, buf);
726 12549940 : if (unlikely(ret)) {
727 0 : if (ret == -ENODATA)
728 0 : ret = 0;
729 0 : goto done;
730 : }
731 :
732 12549940 : bvec_set_page(&array[n], buf->page, this_len,
733 : buf->offset);
734 12549940 : left -= this_len;
735 12549940 : n++;
736 : }
737 :
738 6349332 : iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
739 6349322 : ret = vfs_iter_write(out, &from, &sd.pos, 0);
740 6349348 : if (ret <= 0)
741 : break;
742 :
743 6245062 : sd.num_spliced += ret;
744 6245062 : sd.total_len -= ret;
745 6245062 : *ppos = sd.pos;
746 :
747 : /* dismiss the fully eaten buffers, adjust the partial one */
748 6245062 : tail = pipe->tail;
749 18574009 : while (ret) {
750 12329206 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
751 12329206 : if (ret >= buf->len) {
752 12328931 : ret -= buf->len;
753 12328931 : buf->len = 0;
754 12328931 : pipe_buf_release(pipe, buf);
755 12328947 : tail++;
756 12328947 : pipe->tail = tail;
757 12328947 : if (pipe->files)
758 2295503 : sd.need_wakeup = true;
759 : } else {
760 275 : buf->offset += ret;
761 275 : buf->len -= ret;
762 275 : ret = 0;
763 : }
764 : }
765 : }
766 6244804 : done:
767 6349096 : kfree(array);
768 6349100 : splice_from_pipe_end(pipe, &sd);
769 :
770 6349100 : pipe_unlock(pipe);
771 :
772 6349099 : if (sd.num_spliced)
773 6245029 : ret = sd.num_spliced;
774 :
775 : return ret;
776 : }
777 :
778 : EXPORT_SYMBOL(iter_file_splice_write);
779 :
#ifdef CONFIG_NET
/**
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket.  No data copying
 *    is involved.  Buffers are gathered, at most ARRAY_SIZE(bvec) at a time,
 *    and passed to sock_sendmsg() with MSG_SPLICE_PAGES set.
 *
 * Return: the number of bytes sent if non-zero, otherwise the last status
 * (zero or a negative errno).
 *
 * NOTE(review): if every queued buffer has zero length, the gather loop
 * collects nothing and the function appears to return the preset
 * -ERESTARTSYS even though no signal is pending - worth confirming.
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags)
{
	struct socket *sock = sock_from_file(out);
	struct bio_vec bvec[16];
	struct msghdr msg = {};
	ssize_t ret = 0;
	size_t spliced = 0;
	bool need_wakeup = false;

	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, bc = 0;
		size_t remain = len;

		/*
		 * Check for signal early to make process killable when there
		 * are always buffers available
		 */
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;

		/* Wait until there is something in the pipe to send. */
		while (pipe_empty(pipe->head, pipe->tail)) {
			if (!pipe->writers || spliced) {
				ret = 0;
				goto out;
			}

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				goto out;
			}

			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				goto out;
			}

			if (need_wakeup) {
				wakeup_pipe_writers(pipe);
				need_wakeup = false;
			}

			pipe_wait_readable(pipe);
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* Gather buffers into the bio_vec array. */
		while (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg;

			/* Empty buffers contribute nothing; step over them. */
			if (!buf->len) {
				tail++;
				continue;
			}

			seg = min_t(size_t, remain, buf->len);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				break;
			}

			bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
			remain -= seg;
			if (remain == 0 || bc >= ARRAY_SIZE(bvec))
				break;
			tail++;
		}

		if (!bc)
			break;

		msg.msg_flags = MSG_SPLICE_PAGES;
		if (flags & SPLICE_F_MORE)
			msg.msg_flags |= MSG_MORE;
		/* More is wanted and more is already queued in the pipe. */
		if (remain && pipe_occupancy(pipe->head, tail) > 0)
			msg.msg_flags |= MSG_MORE;
		if (out->f_flags & O_NONBLOCK)
			msg.msg_flags |= MSG_DONTWAIT;

		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
			      len - remain);
		ret = sock_sendmsg(sock, &msg);
		if (ret <= 0)
			break;

		spliced += ret;
		len -= ret;

		/* Trim or release the buffers covered by what was sent. */
		tail = pipe->tail;
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
			buf->len -= seg;
			ret -= seg;

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				tail++;
			}
		}

		if (tail != pipe->tail) {
			pipe->tail = tail;
			if (pipe->files)
				need_wakeup = true;
		}
	}

out:
	pipe_unlock(pipe);
	if (need_wakeup)
		wakeup_pipe_writers(pipe);
	return spliced ? spliced : ret;
}
#endif
919 :
920 : static int warn_unsupported(struct file *file, const char *op)
921 : {
922 : pr_debug_ratelimited(
923 : "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
924 : op, file, current->pid, current->comm);
925 : return -EINVAL;
926 : }
927 :
928 : /*
929 : * Attempt to initiate a splice from pipe to file.
930 : */
931 : static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
932 : loff_t *ppos, size_t len, unsigned int flags)
933 : {
934 6492031 : if (unlikely(!out->f_op->splice_write))
935 : return warn_unsupported(out, "write");
936 6492031 : return out->f_op->splice_write(pipe, out, ppos, len, flags);
937 : }
938 :
939 : /*
940 : * Indicate to the caller that there was a premature EOF when reading from the
941 : * source and the caller didn't indicate they would be sending more data after
942 : * this.
943 : */
944 : static void do_splice_eof(struct splice_desc *sd)
945 : {
946 0 : if (sd->splice_eof)
947 0 : sd->splice_eof(sd);
948 : }
949 :
950 : /**
951 : * vfs_splice_read - Read data from a file and splice it into a pipe
952 : * @in: File to splice from
953 : * @ppos: Input file offset
954 : * @pipe: Pipe to splice to
955 : * @len: Number of bytes to splice
956 : * @flags: Splice modifier flags (SPLICE_F_*)
957 : *
958 : * Splice the requested amount of data from the input file to the pipe. This
959 : * is synchronous as the caller must hold the pipe lock across the entire
960 : * operation.
961 : *
962 : * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
963 : * a hole and a negative error code otherwise.
964 : */
965 6422408 : long vfs_splice_read(struct file *in, loff_t *ppos,
966 : struct pipe_inode_info *pipe, size_t len,
967 : unsigned int flags)
968 : {
969 6422408 : unsigned int p_space;
970 6422408 : int ret;
971 :
972 6422408 : if (unlikely(!(in->f_mode & FMODE_READ)))
973 : return -EBADF;
974 6422408 : if (!len)
975 : return 0;
976 :
977 : /* Don't try to read more the pipe has space for. */
978 6422410 : p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
979 6422410 : len = min_t(size_t, len, p_space << PAGE_SHIFT);
980 :
981 6422410 : ret = rw_verify_area(READ, in, ppos, len);
982 6422372 : if (unlikely(ret < 0))
983 0 : return ret;
984 :
985 6422372 : if (unlikely(len > MAX_RW_COUNT))
986 0 : len = MAX_RW_COUNT;
987 :
988 6422372 : if (unlikely(!in->f_op->splice_read))
989 : return warn_unsupported(in, "read");
990 : /*
991 : * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
992 : * buffer, copy into it and splice that into the pipe.
993 : */
994 6422372 : if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
995 918 : return copy_splice_read(in, ppos, pipe, len, flags);
996 6421454 : return in->f_op->splice_read(in, ppos, pipe, len, flags);
997 : }
998 : EXPORT_SYMBOL_GPL(vfs_splice_read);
999 :
1000 : /**
1001 : * splice_direct_to_actor - splices data directly between two non-pipes
1002 : * @in: file to splice from
1003 : * @sd: actor information on where to splice to
1004 : * @actor: handles the data splicing
1005 : *
1006 : * Description:
1007 : * This is a special case helper to splice directly between two
1008 : * points, without requiring an explicit pipe. Internally an allocated
1009 : * pipe is cached in the process, and reused during the lifetime of
1010 : * that process.
1011 : *
1012 : */
1013 5174825 : ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1014 : splice_direct_actor *actor)
1015 : {
1016 5174825 : struct pipe_inode_info *pipe;
1017 5174825 : long ret, bytes;
1018 5174825 : size_t len;
1019 5174825 : int i, flags, more;
1020 :
1021 : /*
1022 : * We require the input to be seekable, as we don't want to randomly
1023 : * drop data for eg socket -> socket splicing. Use the piped splicing
1024 : * for that!
1025 : */
1026 5174825 : if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1027 : return -EINVAL;
1028 :
1029 : /*
1030 : * neither in nor out is a pipe, setup an internal pipe attached to
1031 : * 'out' and transfer the wanted data from 'in' to 'out' through that
1032 : */
1033 5174825 : pipe = current->splice_pipe;
1034 5174825 : if (unlikely(!pipe)) {
1035 31848 : pipe = alloc_pipe_info();
1036 31848 : if (!pipe)
1037 : return -ENOMEM;
1038 :
1039 : /*
1040 : * We don't have an immediate reader, but we'll read the stuff
1041 : * out of the pipe right after the splice_to_pipe(). So set
1042 : * PIPE_READERS appropriately.
1043 : */
1044 31848 : pipe->readers = 1;
1045 :
1046 31848 : current->splice_pipe = pipe;
1047 : }
1048 :
1049 : /*
1050 : * Do the splice.
1051 : */
1052 5174825 : bytes = 0;
1053 5174825 : len = sd->total_len;
1054 :
1055 : /* Don't block on output, we have to drain the direct pipe. */
1056 5174825 : flags = sd->flags;
1057 5174825 : sd->flags &= ~SPLICE_F_NONBLOCK;
1058 :
1059 : /*
1060 : * We signal MORE until we've read sufficient data to fulfill the
1061 : * request and we keep signalling it if the caller set it.
1062 : */
1063 5174825 : more = sd->flags & SPLICE_F_MORE;
1064 5174825 : sd->flags |= SPLICE_F_MORE;
1065 :
1066 5174825 : WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1067 :
1068 10266044 : while (len) {
1069 5174920 : size_t read_len;
1070 5174920 : loff_t pos = sd->pos, prev_pos = pos;
1071 :
1072 5174920 : ret = vfs_splice_read(in, &pos, pipe, len, flags);
1073 5174938 : if (unlikely(ret <= 0))
1074 117 : goto read_failure;
1075 :
1076 5174821 : read_len = ret;
1077 5174821 : sd->total_len = read_len;
1078 :
1079 : /*
1080 : * If we now have sufficient data to fulfill the request then
1081 : * we clear SPLICE_F_MORE if it was not set initially.
1082 : */
1083 5174821 : if (read_len >= len && !more)
1084 5174725 : sd->flags &= ~SPLICE_F_MORE;
1085 :
1086 : /*
1087 : * NOTE: nonblocking mode only applies to the input. We
1088 : * must not do the output in nonblocking mode as then we
1089 : * could get stuck data in the internal pipe:
1090 : */
1091 5174821 : ret = actor(pipe, sd);
1092 5174825 : if (unlikely(ret <= 0)) {
1093 83416 : sd->pos = prev_pos;
1094 83606 : goto out_release;
1095 : }
1096 :
1097 5091409 : bytes += ret;
1098 5091409 : len -= ret;
1099 5091409 : sd->pos = pos;
1100 :
1101 5091409 : if (ret < read_len) {
1102 190 : sd->pos = prev_pos + ret;
1103 190 : goto out_release;
1104 : }
1105 : }
1106 :
1107 5091124 : done:
1108 5174857 : pipe->tail = pipe->head = 0;
1109 5174857 : file_accessed(in);
1110 5174857 : return bytes;
1111 :
1112 : read_failure:
1113 : /*
1114 : * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1115 : * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1116 : * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1117 : * least 1 byte *then* we will also do the ->splice_eof() call.
1118 : */
1119 117 : if (ret == 0 && !more && len > 0 && bytes)
1120 0 : do_splice_eof(sd);
1121 117 : out_release:
1122 : /*
1123 : * If we did an incomplete transfer we must release
1124 : * the pipe buffers in question:
1125 : */
1126 1423447 : for (i = 0; i < pipe->ring_size; i++) {
1127 1339714 : struct pipe_buffer *buf = &pipe->bufs[i];
1128 :
1129 1339714 : if (buf->ops)
1130 176828 : pipe_buf_release(pipe, buf);
1131 : }
1132 :
1133 83733 : if (!bytes)
1134 83513 : bytes = ret;
1135 :
1136 83733 : goto done;
1137 : }
1138 : EXPORT_SYMBOL(splice_direct_to_actor);
1139 :
1140 5174804 : static int direct_splice_actor(struct pipe_inode_info *pipe,
1141 : struct splice_desc *sd)
1142 : {
1143 5174804 : struct file *file = sd->u.file;
1144 :
1145 5174804 : return do_splice_from(pipe, file, sd->opos, sd->total_len,
1146 : sd->flags);
1147 : }
1148 :
1149 0 : static void direct_file_splice_eof(struct splice_desc *sd)
1150 : {
1151 0 : struct file *file = sd->u.file;
1152 :
1153 0 : if (file->f_op->splice_eof)
1154 0 : file->f_op->splice_eof(file);
1155 0 : }
1156 :
1157 : /**
1158 : * do_splice_direct - splices data directly between two files
1159 : * @in: file to splice from
1160 : * @ppos: input file offset
1161 : * @out: file to splice to
1162 : * @opos: output file offset
1163 : * @len: number of bytes to splice
1164 : * @flags: splice modifier flags
1165 : *
1166 : * Description:
1167 : * For use by do_sendfile(). splice can easily emulate sendfile, but
1168 : * doing it in the application would incur an extra system call
1169 : * (splice in + splice out, as compared to just sendfile()). So this helper
1170 : * can splice directly through a process-private pipe.
1171 : *
1172 : */
1173 5174854 : long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1174 : loff_t *opos, size_t len, unsigned int flags)
1175 : {
1176 5174854 : struct splice_desc sd = {
1177 : .len = len,
1178 : .total_len = len,
1179 : .flags = flags,
1180 5174854 : .pos = *ppos,
1181 : .u.file = out,
1182 : .splice_eof = direct_file_splice_eof,
1183 : .opos = opos,
1184 : };
1185 5174854 : long ret;
1186 :
1187 5174854 : if (unlikely(!(out->f_mode & FMODE_WRITE)))
1188 : return -EBADF;
1189 :
1190 5174854 : if (unlikely(out->f_flags & O_APPEND))
1191 : return -EINVAL;
1192 :
1193 5174854 : ret = rw_verify_area(WRITE, out, opos, len);
1194 5174847 : if (unlikely(ret < 0))
1195 : return ret;
1196 :
1197 5174847 : ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1198 5174847 : if (ret > 0)
1199 5091337 : *ppos = sd.pos;
1200 :
1201 : return ret;
1202 : }
1203 : EXPORT_SYMBOL(do_splice_direct);
1204 :
1205 1250057 : static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1206 : {
1207 1250057 : for (;;) {
1208 1250057 : if (unlikely(!pipe->readers)) {
1209 0 : send_sig(SIGPIPE, current, 0);
1210 0 : return -EPIPE;
1211 : }
1212 1250057 : if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1213 : return 0;
1214 2 : if (flags & SPLICE_F_NONBLOCK)
1215 : return -EAGAIN;
1216 2 : if (signal_pending(current))
1217 : return -ERESTARTSYS;
1218 0 : pipe_wait_writable(pipe);
1219 : }
1220 : }
1221 :
1222 : static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1223 : struct pipe_inode_info *opipe,
1224 : size_t len, unsigned int flags);
1225 :
1226 1247498 : long splice_file_to_pipe(struct file *in,
1227 : struct pipe_inode_info *opipe,
1228 : loff_t *offset,
1229 : size_t len, unsigned int flags)
1230 : {
1231 1247498 : long ret;
1232 :
1233 1247498 : pipe_lock(opipe);
1234 1247503 : ret = wait_for_space(opipe, flags);
1235 1247495 : if (!ret)
1236 1247496 : ret = vfs_splice_read(in, offset, opipe, len, flags);
1237 1247503 : pipe_unlock(opipe);
1238 1247503 : if (ret > 0)
1239 1236065 : wakeup_pipe_readers(opipe);
1240 1247504 : return ret;
1241 : }
1242 :
1243 : /*
1244 : * Determine where to splice to/from.
1245 : */
1246 2564746 : long do_splice(struct file *in, loff_t *off_in, struct file *out,
1247 : loff_t *off_out, size_t len, unsigned int flags)
1248 : {
1249 2564746 : struct pipe_inode_info *ipipe;
1250 2564746 : struct pipe_inode_info *opipe;
1251 2564746 : loff_t offset;
1252 2564746 : long ret;
1253 :
1254 2564746 : if (unlikely(!(in->f_mode & FMODE_READ) ||
1255 : !(out->f_mode & FMODE_WRITE)))
1256 : return -EBADF;
1257 :
1258 2564746 : ipipe = get_pipe_info(in, true);
1259 2564734 : opipe = get_pipe_info(out, true);
1260 :
1261 2564735 : if (ipipe && opipe) {
1262 0 : if (off_in || off_out)
1263 : return -ESPIPE;
1264 :
1265 : /* Splicing to self would be fun, but... */
1266 0 : if (ipipe == opipe)
1267 : return -EINVAL;
1268 :
1269 0 : if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1270 0 : flags |= SPLICE_F_NONBLOCK;
1271 :
1272 0 : return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1273 : }
1274 :
1275 2564735 : if (ipipe) {
1276 1317235 : if (off_in)
1277 : return -ESPIPE;
1278 1317235 : if (off_out) {
1279 1174276 : if (!(out->f_mode & FMODE_PWRITE))
1280 : return -EINVAL;
1281 1174276 : offset = *off_out;
1282 : } else {
1283 142959 : offset = out->f_pos;
1284 : }
1285 :
1286 1317235 : if (unlikely(out->f_flags & O_APPEND))
1287 : return -EINVAL;
1288 :
1289 1317235 : ret = rw_verify_area(WRITE, out, &offset, len);
1290 1317232 : if (unlikely(ret < 0))
1291 : return ret;
1292 :
1293 1317232 : if (in->f_flags & O_NONBLOCK)
1294 65536 : flags |= SPLICE_F_NONBLOCK;
1295 :
1296 1317232 : file_start_write(out);
1297 1317227 : ret = do_splice_from(ipipe, out, &offset, len, flags);
1298 1317238 : file_end_write(out);
1299 :
1300 1317239 : if (ret > 0)
1301 1219556 : fsnotify_modify(out);
1302 :
1303 1317238 : if (!off_out)
1304 142960 : out->f_pos = offset;
1305 : else
1306 1174278 : *off_out = offset;
1307 :
1308 1317238 : return ret;
1309 : }
1310 :
1311 1247500 : if (opipe) {
1312 1247500 : if (off_out)
1313 : return -ESPIPE;
1314 1247500 : if (off_in) {
1315 1171706 : if (!(in->f_mode & FMODE_PREAD))
1316 : return -EINVAL;
1317 1171706 : offset = *off_in;
1318 : } else {
1319 75794 : offset = in->f_pos;
1320 : }
1321 :
1322 1247500 : if (out->f_flags & O_NONBLOCK)
1323 75786 : flags |= SPLICE_F_NONBLOCK;
1324 :
1325 1247500 : ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1326 :
1327 1247501 : if (ret > 0)
1328 1236067 : fsnotify_access(in);
1329 :
1330 1247501 : if (!off_in)
1331 75794 : in->f_pos = offset;
1332 : else
1333 1171707 : *off_in = offset;
1334 :
1335 1247501 : return ret;
1336 : }
1337 :
1338 : return -EINVAL;
1339 : }
1340 :
1341 2564742 : static long __do_splice(struct file *in, loff_t __user *off_in,
1342 : struct file *out, loff_t __user *off_out,
1343 : size_t len, unsigned int flags)
1344 : {
1345 2564742 : struct pipe_inode_info *ipipe;
1346 2564742 : struct pipe_inode_info *opipe;
1347 2564742 : loff_t offset, *__off_in = NULL, *__off_out = NULL;
1348 2564742 : long ret;
1349 :
1350 2564742 : ipipe = get_pipe_info(in, true);
1351 2564733 : opipe = get_pipe_info(out, true);
1352 :
1353 2564734 : if (ipipe) {
1354 1317235 : if (off_in)
1355 : return -ESPIPE;
1356 1317235 : pipe_clear_nowait(in);
1357 : }
1358 2564743 : if (opipe) {
1359 1247502 : if (off_out)
1360 : return -ESPIPE;
1361 1247502 : pipe_clear_nowait(out);
1362 : }
1363 :
1364 2564746 : if (off_out) {
1365 1174274 : if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1366 : return -EFAULT;
1367 : __off_out = &offset;
1368 : }
1369 2564746 : if (off_in) {
1370 1171707 : if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1371 : return -EFAULT;
1372 : __off_in = &offset;
1373 : }
1374 :
1375 2564746 : ret = do_splice(in, __off_in, out, __off_out, len, flags);
1376 2564738 : if (ret < 0)
1377 : return ret;
1378 :
1379 3697615 : if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1380 0 : return -EFAULT;
1381 3715678 : if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1382 0 : return -EFAULT;
1383 :
1384 : return ret;
1385 : }
1386 :
1387 2557 : static int iter_to_pipe(struct iov_iter *from,
1388 : struct pipe_inode_info *pipe,
1389 : unsigned flags)
1390 : {
1391 2557 : struct pipe_buffer buf = {
1392 : .ops = &user_page_pipe_buf_ops,
1393 : .flags = flags
1394 : };
1395 2557 : size_t total = 0;
1396 2557 : int ret = 0;
1397 :
1398 5117 : while (iov_iter_count(from)) {
1399 2557 : struct page *pages[16];
1400 2557 : ssize_t left;
1401 2557 : size_t start;
1402 2557 : int i, n;
1403 :
1404 2557 : left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1405 2560 : if (left <= 0) {
1406 0 : ret = left;
1407 0 : break;
1408 : }
1409 :
1410 2560 : n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1411 5118 : for (i = 0; i < n; i++) {
1412 2558 : int size = min_t(int, left, PAGE_SIZE - start);
1413 :
1414 2558 : buf.page = pages[i];
1415 2558 : buf.offset = start;
1416 2558 : buf.len = size;
1417 2558 : ret = add_to_pipe(pipe, &buf);
1418 2558 : if (unlikely(ret < 0)) {
1419 0 : iov_iter_revert(from, left);
1420 : // this one got dropped by add_to_pipe()
1421 0 : while (++i < n)
1422 0 : put_page(pages[i]);
1423 0 : goto out;
1424 : }
1425 2558 : total += ret;
1426 2558 : left -= size;
1427 2558 : start = 0;
1428 : }
1429 : }
1430 2560 : out:
1431 2560 : return total ? total : ret;
1432 : }
1433 :
1434 0 : static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1435 : struct splice_desc *sd)
1436 : {
1437 0 : int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1438 0 : return n == sd->len ? n : -EFAULT;
1439 : }
1440 :
1441 : /*
1442 : * For lack of a better implementation, implement vmsplice() to userspace
1443 : * as a simple copy of the pipes pages to the user iov.
1444 : */
1445 0 : static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1446 : unsigned int flags)
1447 : {
1448 0 : struct pipe_inode_info *pipe = get_pipe_info(file, true);
1449 0 : struct splice_desc sd = {
1450 : .total_len = iov_iter_count(iter),
1451 : .flags = flags,
1452 : .u.data = iter
1453 : };
1454 0 : long ret = 0;
1455 :
1456 0 : if (!pipe)
1457 : return -EBADF;
1458 :
1459 0 : pipe_clear_nowait(file);
1460 :
1461 0 : if (sd.total_len) {
1462 0 : pipe_lock(pipe);
1463 0 : ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1464 0 : pipe_unlock(pipe);
1465 : }
1466 :
1467 : return ret;
1468 : }
1469 :
1470 : /*
1471 : * vmsplice splices a user address range into a pipe. It can be thought of
1472 : * as splice-from-memory, where the regular splice is splice-from-file (or
1473 : * to file). In both cases the output is a pipe, naturally.
1474 : */
1475 2559 : static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1476 : unsigned int flags)
1477 : {
1478 2559 : struct pipe_inode_info *pipe;
1479 2559 : long ret = 0;
1480 2559 : unsigned buf_flag = 0;
1481 :
1482 2559 : if (flags & SPLICE_F_GIFT)
1483 0 : buf_flag = PIPE_BUF_FLAG_GIFT;
1484 :
1485 2559 : pipe = get_pipe_info(file, true);
1486 2557 : if (!pipe)
1487 : return -EBADF;
1488 :
1489 2557 : pipe_clear_nowait(file);
1490 :
1491 2557 : pipe_lock(pipe);
1492 2557 : ret = wait_for_space(pipe, flags);
1493 2560 : if (!ret)
1494 2557 : ret = iter_to_pipe(iter, pipe, buf_flag);
1495 2562 : pipe_unlock(pipe);
1496 2558 : if (ret > 0)
1497 2558 : wakeup_pipe_readers(pipe);
1498 : return ret;
1499 : }
1500 :
1501 2559 : static int vmsplice_type(struct fd f, int *type)
1502 : {
1503 2559 : if (!f.file)
1504 : return -EBADF;
1505 2559 : if (f.file->f_mode & FMODE_WRITE) {
1506 2559 : *type = ITER_SOURCE;
1507 0 : } else if (f.file->f_mode & FMODE_READ) {
1508 0 : *type = ITER_DEST;
1509 : } else {
1510 0 : fdput(f);
1511 0 : return -EBADF;
1512 : }
1513 : return 0;
1514 : }
1515 :
1516 : /*
1517 : * Note that vmsplice only really supports true splicing _from_ user memory
1518 : * to a pipe, not the other way around. Splicing from user memory is a simple
1519 : * operation that can be supported without any funky alignment restrictions
1520 : * or nasty vm tricks. We simply map in the user memory and fill them into
1521 : * a pipe. The reverse isn't quite as easy, though. There are two possible
1522 : * solutions for that:
1523 : *
1524 : * - memcpy() the data internally, at which point we might as well just
1525 : * do a regular read() on the buffer anyway.
1526 : * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1527 : * has restriction limitations on both ends of the pipe).
1528 : *
1529 : * Currently we punt and implement it as a normal copy, see pipe_to_user().
1530 : *
1531 : */
1532 5117 : SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1533 : unsigned long, nr_segs, unsigned int, flags)
1534 : {
1535 2560 : struct iovec iovstack[UIO_FASTIOV];
1536 2560 : struct iovec *iov = iovstack;
1537 2560 : struct iov_iter iter;
1538 2560 : ssize_t error;
1539 2560 : struct fd f;
1540 2560 : int type;
1541 :
1542 2560 : if (unlikely(flags & ~SPLICE_F_ALL))
1543 : return -EINVAL;
1544 :
1545 2560 : f = fdget(fd);
1546 2560 : error = vmsplice_type(f, &type);
1547 2558 : if (error)
1548 : return error;
1549 :
1550 2558 : error = import_iovec(type, uiov, nr_segs,
1551 : ARRAY_SIZE(iovstack), &iov, &iter);
1552 2557 : if (error < 0)
1553 0 : goto out_fdput;
1554 :
1555 2557 : if (!iov_iter_count(&iter))
1556 : error = 0;
1557 2557 : else if (type == ITER_SOURCE)
1558 2557 : error = vmsplice_to_pipe(f.file, &iter, flags);
1559 : else
1560 0 : error = vmsplice_to_user(f.file, &iter, flags);
1561 :
1562 2558 : kfree(iov);
1563 2558 : out_fdput:
1564 2558 : fdput(f);
1565 2558 : return error;
1566 : }
1567 :
1568 5129484 : SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1569 : int, fd_out, loff_t __user *, off_out,
1570 : size_t, len, unsigned int, flags)
1571 : {
1572 2564744 : struct fd in, out;
1573 2564744 : long error;
1574 :
1575 2564744 : if (unlikely(!len))
1576 : return 0;
1577 :
1578 2564744 : if (unlikely(flags & ~SPLICE_F_ALL))
1579 : return -EINVAL;
1580 :
1581 2564744 : error = -EBADF;
1582 2564744 : in = fdget(fd_in);
1583 2564744 : if (in.file) {
1584 2564744 : out = fdget(fd_out);
1585 2564727 : if (out.file) {
1586 2564727 : error = __do_splice(in.file, off_in, out.file, off_out,
1587 : len, flags);
1588 2564736 : fdput(out);
1589 : }
1590 2564736 : fdput(in);
1591 : }
1592 : return error;
1593 : }
1594 :
1595 : /*
1596 : * Make sure there's data to read. Wait for input if we can, otherwise
1597 : * return an appropriate error.
1598 : */
1599 0 : static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1600 : {
1601 0 : int ret;
1602 :
1603 : /*
1604 : * Check the pipe occupancy without the inode lock first. This function
1605 : * is speculative anyways, so missing one is ok.
1606 : */
1607 0 : if (!pipe_empty(pipe->head, pipe->tail))
1608 : return 0;
1609 :
1610 0 : ret = 0;
1611 0 : pipe_lock(pipe);
1612 :
1613 0 : while (pipe_empty(pipe->head, pipe->tail)) {
1614 0 : if (signal_pending(current)) {
1615 : ret = -ERESTARTSYS;
1616 : break;
1617 : }
1618 0 : if (!pipe->writers)
1619 : break;
1620 0 : if (flags & SPLICE_F_NONBLOCK) {
1621 : ret = -EAGAIN;
1622 : break;
1623 : }
1624 0 : pipe_wait_readable(pipe);
1625 : }
1626 :
1627 0 : pipe_unlock(pipe);
1628 0 : return ret;
1629 : }
1630 :
1631 : /*
1632 : * Make sure there's writeable room. Wait for room if we can, otherwise
1633 : * return an appropriate error.
1634 : */
1635 0 : static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1636 : {
1637 0 : int ret;
1638 :
1639 : /*
1640 : * Check pipe occupancy without the inode lock first. This function
1641 : * is speculative anyways, so missing one is ok.
1642 : */
1643 0 : if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1644 : return 0;
1645 :
1646 0 : ret = 0;
1647 0 : pipe_lock(pipe);
1648 :
1649 0 : while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1650 0 : if (!pipe->readers) {
1651 0 : send_sig(SIGPIPE, current, 0);
1652 0 : ret = -EPIPE;
1653 0 : break;
1654 : }
1655 0 : if (flags & SPLICE_F_NONBLOCK) {
1656 : ret = -EAGAIN;
1657 : break;
1658 : }
1659 0 : if (signal_pending(current)) {
1660 : ret = -ERESTARTSYS;
1661 : break;
1662 : }
1663 0 : pipe_wait_writable(pipe);
1664 : }
1665 :
1666 0 : pipe_unlock(pipe);
1667 0 : return ret;
1668 : }
1669 :
1670 : /*
1671 : * Splice contents of ipipe to opipe.
1672 : */
1673 0 : static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1674 : struct pipe_inode_info *opipe,
1675 : size_t len, unsigned int flags)
1676 : {
1677 0 : struct pipe_buffer *ibuf, *obuf;
1678 0 : unsigned int i_head, o_head;
1679 0 : unsigned int i_tail, o_tail;
1680 0 : unsigned int i_mask, o_mask;
1681 0 : int ret = 0;
1682 0 : bool input_wakeup = false;
1683 :
1684 :
1685 0 : retry:
1686 0 : ret = ipipe_prep(ipipe, flags);
1687 0 : if (ret)
1688 0 : return ret;
1689 :
1690 0 : ret = opipe_prep(opipe, flags);
1691 0 : if (ret)
1692 0 : return ret;
1693 :
1694 : /*
1695 : * Potential ABBA deadlock, work around it by ordering lock
1696 : * grabbing by pipe info address. Otherwise two different processes
1697 : * could deadlock (one doing tee from A -> B, the other from B -> A).
1698 : */
1699 0 : pipe_double_lock(ipipe, opipe);
1700 :
1701 0 : i_tail = ipipe->tail;
1702 0 : i_mask = ipipe->ring_size - 1;
1703 0 : o_head = opipe->head;
1704 0 : o_mask = opipe->ring_size - 1;
1705 :
1706 0 : do {
1707 0 : size_t o_len;
1708 :
1709 0 : if (!opipe->readers) {
1710 0 : send_sig(SIGPIPE, current, 0);
1711 0 : if (!ret)
1712 0 : ret = -EPIPE;
1713 : break;
1714 : }
1715 :
1716 0 : i_head = ipipe->head;
1717 0 : o_tail = opipe->tail;
1718 :
1719 0 : if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1720 : break;
1721 :
1722 : /*
1723 : * Cannot make any progress, because either the input
1724 : * pipe is empty or the output pipe is full.
1725 : */
1726 0 : if (pipe_empty(i_head, i_tail) ||
1727 0 : pipe_full(o_head, o_tail, opipe->max_usage)) {
1728 : /* Already processed some buffers, break */
1729 0 : if (ret)
1730 : break;
1731 :
1732 0 : if (flags & SPLICE_F_NONBLOCK) {
1733 : ret = -EAGAIN;
1734 : break;
1735 : }
1736 :
1737 : /*
1738 : * We raced with another reader/writer and haven't
1739 : * managed to process any buffers. A zero return
1740 : * value means EOF, so retry instead.
1741 : */
1742 0 : pipe_unlock(ipipe);
1743 0 : pipe_unlock(opipe);
1744 0 : goto retry;
1745 : }
1746 :
1747 0 : ibuf = &ipipe->bufs[i_tail & i_mask];
1748 0 : obuf = &opipe->bufs[o_head & o_mask];
1749 :
1750 0 : if (len >= ibuf->len) {
1751 : /*
1752 : * Simply move the whole buffer from ipipe to opipe
1753 : */
1754 0 : *obuf = *ibuf;
1755 0 : ibuf->ops = NULL;
1756 0 : i_tail++;
1757 0 : ipipe->tail = i_tail;
1758 0 : input_wakeup = true;
1759 0 : o_len = obuf->len;
1760 0 : o_head++;
1761 0 : opipe->head = o_head;
1762 : } else {
1763 : /*
1764 : * Get a reference to this pipe buffer,
1765 : * so we can copy the contents over.
1766 : */
1767 0 : if (!pipe_buf_get(ipipe, ibuf)) {
1768 0 : if (ret == 0)
1769 0 : ret = -EFAULT;
1770 : break;
1771 : }
1772 0 : *obuf = *ibuf;
1773 :
1774 : /*
1775 : * Don't inherit the gift and merge flags, we need to
1776 : * prevent multiple steals of this page.
1777 : */
1778 0 : obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1779 0 : obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1780 :
1781 0 : obuf->len = len;
1782 0 : ibuf->offset += len;
1783 0 : ibuf->len -= len;
1784 0 : o_len = len;
1785 0 : o_head++;
1786 0 : opipe->head = o_head;
1787 : }
1788 0 : ret += o_len;
1789 0 : len -= o_len;
1790 0 : } while (len);
1791 :
1792 0 : pipe_unlock(ipipe);
1793 0 : pipe_unlock(opipe);
1794 :
1795 : /*
1796 : * If we put data in the output pipe, wakeup any potential readers.
1797 : */
1798 0 : if (ret > 0)
1799 0 : wakeup_pipe_readers(opipe);
1800 :
1801 0 : if (input_wakeup)
1802 0 : wakeup_pipe_writers(ipipe);
1803 :
1804 : return ret;
1805 : }
1806 :
1807 : /*
1808 : * Link contents of ipipe to opipe.
1809 : */
1810 0 : static int link_pipe(struct pipe_inode_info *ipipe,
1811 : struct pipe_inode_info *opipe,
1812 : size_t len, unsigned int flags)
1813 : {
1814 0 : struct pipe_buffer *ibuf, *obuf;
1815 0 : unsigned int i_head, o_head;
1816 0 : unsigned int i_tail, o_tail;
1817 0 : unsigned int i_mask, o_mask;
1818 0 : int ret = 0;
1819 :
1820 : /*
1821 : * Potential ABBA deadlock, work around it by ordering lock
1822 : * grabbing by pipe info address. Otherwise two different processes
1823 : * could deadlock (one doing tee from A -> B, the other from B -> A).
1824 : */
1825 0 : pipe_double_lock(ipipe, opipe);
1826 :
1827 0 : i_tail = ipipe->tail;
1828 0 : i_mask = ipipe->ring_size - 1;
1829 0 : o_head = opipe->head;
1830 0 : o_mask = opipe->ring_size - 1;
1831 :
1832 0 : do {
1833 0 : if (!opipe->readers) {
1834 0 : send_sig(SIGPIPE, current, 0);
1835 0 : if (!ret)
1836 0 : ret = -EPIPE;
1837 : break;
1838 : }
1839 :
1840 0 : i_head = ipipe->head;
1841 0 : o_tail = opipe->tail;
1842 :
1843 : /*
1844 : * If we have iterated all input buffers or run out of
1845 : * output room, break.
1846 : */
1847 0 : if (pipe_empty(i_head, i_tail) ||
1848 0 : pipe_full(o_head, o_tail, opipe->max_usage))
1849 : break;
1850 :
1851 0 : ibuf = &ipipe->bufs[i_tail & i_mask];
1852 0 : obuf = &opipe->bufs[o_head & o_mask];
1853 :
1854 : /*
1855 : * Get a reference to this pipe buffer,
1856 : * so we can copy the contents over.
1857 : */
1858 0 : if (!pipe_buf_get(ipipe, ibuf)) {
1859 0 : if (ret == 0)
1860 0 : ret = -EFAULT;
1861 : break;
1862 : }
1863 :
1864 0 : *obuf = *ibuf;
1865 :
1866 : /*
1867 : * Don't inherit the gift and merge flag, we need to prevent
1868 : * multiple steals of this page.
1869 : */
1870 0 : obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1871 0 : obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1872 :
1873 0 : if (obuf->len > len)
1874 0 : obuf->len = len;
1875 0 : ret += obuf->len;
1876 0 : len -= obuf->len;
1877 :
1878 0 : o_head++;
1879 0 : opipe->head = o_head;
1880 0 : i_tail++;
1881 0 : } while (len);
1882 :
1883 0 : pipe_unlock(ipipe);
1884 0 : pipe_unlock(opipe);
1885 :
1886 : /*
1887 : * If we put data in the output pipe, wakeup any potential readers.
1888 : */
1889 0 : if (ret > 0)
1890 0 : wakeup_pipe_readers(opipe);
1891 :
1892 0 : return ret;
1893 : }
1894 :
1895 : /*
1896 : * This is a tee(1) implementation that works on pipes. It doesn't copy
1897 : * any data, it simply references the 'in' pages on the 'out' pipe.
1898 : * The 'flags' used are the SPLICE_F_* variants, currently the only
1899 : * applicable one is SPLICE_F_NONBLOCK.
1900 : */
1901 0 : long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1902 : {
1903 0 : struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1904 0 : struct pipe_inode_info *opipe = get_pipe_info(out, true);
1905 0 : int ret = -EINVAL;
1906 :
1907 0 : if (unlikely(!(in->f_mode & FMODE_READ) ||
1908 : !(out->f_mode & FMODE_WRITE)))
1909 : return -EBADF;
1910 :
1911 : /*
1912 : * Duplicate the contents of ipipe to opipe without actually
1913 : * copying the data.
1914 : */
1915 0 : if (ipipe && opipe && ipipe != opipe) {
1916 0 : if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1917 0 : flags |= SPLICE_F_NONBLOCK;
1918 :
1919 : /*
1920 : * Keep going, unless we encounter an error. The ipipe/opipe
1921 : * ordering doesn't really matter.
1922 : */
1923 0 : ret = ipipe_prep(ipipe, flags);
1924 0 : if (!ret) {
1925 0 : ret = opipe_prep(opipe, flags);
1926 0 : if (!ret)
1927 0 : ret = link_pipe(ipipe, opipe, len, flags);
1928 : }
1929 : }
1930 :
1931 0 : return ret;
1932 : }
1933 :
1934 0 : SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1935 : {
1936 0 : struct fd in, out;
1937 0 : int error;
1938 :
1939 0 : if (unlikely(flags & ~SPLICE_F_ALL))
1940 : return -EINVAL;
1941 :
1942 0 : if (unlikely(!len))
1943 : return 0;
1944 :
1945 0 : error = -EBADF;
1946 0 : in = fdget(fdin);
1947 0 : if (in.file) {
1948 0 : out = fdget(fdout);
1949 0 : if (out.file) {
1950 0 : error = do_tee(in.file, out.file, len, flags);
1951 0 : fdput(out);
1952 : }
1953 0 : fdput(in);
1954 : }
1955 :
1956 0 : return error;
1957 : }
|