Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * "splice": joining two ropes together by interweaving their strands.
4 : *
5 : * This is the "extended pipe" functionality, where a pipe is used as
6 : * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7 : * buffer that you can use to transfer data from one end to the other.
8 : *
9 : * The traditional unix read/write is extended with a "splice()" operation
10 : * that transfers data buffers to or from a pipe buffer.
11 : *
12 : * Named by Larry McVoy, original implementation from Linus, extended by
13 : * Jens to support splicing to files, network, direct splicing, etc and
14 : * fixing lots of bugs.
15 : *
16 : * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17 : * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18 : * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19 : *
20 : */
21 : #include <linux/bvec.h>
22 : #include <linux/fs.h>
23 : #include <linux/file.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/splice.h>
26 : #include <linux/memcontrol.h>
27 : #include <linux/mm_inline.h>
28 : #include <linux/swap.h>
29 : #include <linux/writeback.h>
30 : #include <linux/export.h>
31 : #include <linux/syscalls.h>
32 : #include <linux/uio.h>
33 : #include <linux/fsnotify.h>
34 : #include <linux/security.h>
35 : #include <linux/gfp.h>
36 : #include <linux/net.h>
37 : #include <linux/socket.h>
38 : #include <linux/sched/signal.h>
39 :
40 : #include "internal.h"
41 :
42 : /*
43 : * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44 : * indicate they support non-blocking reads or writes, we must clear it
45 : * here if set to avoid blocking other users of this pipe if splice is
46 : * being done on it.
47 : */
48 8950498 : static noinline void noinline pipe_clear_nowait(struct file *file)
49 : {
50 8950498 : fmode_t fmode = READ_ONCE(file->f_mode);
51 :
52 8950498 : do {
53 8950498 : if (!(fmode & FMODE_NOWAIT))
54 : break;
55 4094298 : } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 8950424 : }
57 :
58 : /*
59 : * Attempt to steal a page from a pipe buffer. This should perhaps go into
60 : * a vm helper function, it's already simplified quite a bit by the
61 : * addition of remove_mapping(). If success is returned, the caller may
62 : * attempt to reuse this page for another destination.
63 : */
64 0 : static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65 : struct pipe_buffer *buf)
66 : {
67 0 : struct folio *folio = page_folio(buf->page);
68 0 : struct address_space *mapping;
69 :
70 0 : folio_lock(folio);
71 :
72 0 : mapping = folio_mapping(folio);
73 0 : if (mapping) {
74 0 : WARN_ON(!folio_test_uptodate(folio));
75 :
76 : /*
77 : * At least for ext2 with nobh option, we need to wait on
78 : * writeback completing on this folio, since we'll remove it
79 : * from the pagecache. Otherwise truncate wont wait on the
80 : * folio, allowing the disk blocks to be reused by someone else
81 : * before we actually wrote our data to them. fs corruption
82 : * ensues.
83 : */
84 0 : folio_wait_writeback(folio);
85 :
86 0 : if (folio_has_private(folio) &&
87 0 : !filemap_release_folio(folio, GFP_KERNEL))
88 0 : goto out_unlock;
89 :
90 : /*
91 : * If we succeeded in removing the mapping, set LRU flag
92 : * and return good.
93 : */
94 0 : if (remove_mapping(mapping, folio)) {
95 0 : buf->flags |= PIPE_BUF_FLAG_LRU;
96 0 : return true;
97 : }
98 : }
99 :
100 : /*
101 : * Raced with truncate or failed to remove folio from current
102 : * address space, unlock and return failure.
103 : */
104 0 : out_unlock:
105 0 : folio_unlock(folio);
106 0 : return false;
107 : }
108 :
109 171462343 : static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
110 : struct pipe_buffer *buf)
111 : {
112 171462343 : put_page(buf->page);
113 171462650 : buf->flags &= ~PIPE_BUF_FLAG_LRU;
114 171462650 : }
115 :
116 : /*
117 : * Check whether the contents of buf is OK to access. Since the content
118 : * is a page cache page, IO may be in flight.
119 : */
120 171801140 : static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
121 : struct pipe_buffer *buf)
122 : {
123 171801140 : struct page *page = buf->page;
124 171801140 : int err;
125 :
126 171801140 : if (!PageUptodate(page)) {
127 0 : lock_page(page);
128 :
129 : /*
130 : * Page got truncated/unhashed. This will cause a 0-byte
131 : * splice, if this is the first page.
132 : */
133 0 : if (!page->mapping) {
134 0 : err = -ENODATA;
135 0 : goto error;
136 : }
137 :
138 : /*
139 : * Uh oh, read-error from disk.
140 : */
141 0 : if (!PageUptodate(page)) {
142 0 : err = -EIO;
143 0 : goto error;
144 : }
145 :
146 : /*
147 : * Page is ok afterall, we are done.
148 : */
149 0 : unlock_page(page);
150 : }
151 :
152 : return 0;
153 0 : error:
154 0 : unlock_page(page);
155 0 : return err;
156 : }
157 :
158 : const struct pipe_buf_operations page_cache_pipe_buf_ops = {
159 : .confirm = page_cache_pipe_buf_confirm,
160 : .release = page_cache_pipe_buf_release,
161 : .try_steal = page_cache_pipe_buf_try_steal,
162 : .get = generic_pipe_buf_get,
163 : };
164 :
165 0 : static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
166 : struct pipe_buffer *buf)
167 : {
168 0 : if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
169 : return false;
170 :
171 0 : buf->flags |= PIPE_BUF_FLAG_LRU;
172 0 : return generic_pipe_buf_try_steal(pipe, buf);
173 : }
174 :
175 : static const struct pipe_buf_operations user_page_pipe_buf_ops = {
176 : .release = page_cache_pipe_buf_release,
177 : .try_steal = user_page_pipe_buf_try_steal,
178 : .get = generic_pipe_buf_get,
179 : };
180 :
181 2935831 : static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
182 : {
183 2935831 : smp_mb();
184 2935854 : if (waitqueue_active(&pipe->rd_wait))
185 10254 : wake_up_interruptible(&pipe->rd_wait);
186 2935854 : kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
187 2935784 : }
188 :
189 : /**
190 : * splice_to_pipe - fill passed data into a pipe
191 : * @pipe: pipe to fill
192 : * @spd: data to fill
193 : *
194 : * Description:
195 : * @spd contains a map of pages and len/offset tuples, along with
196 : * the struct pipe_buf_operations associated with these pages. This
197 : * function will link that data to the pipe.
198 : *
199 : */
200 192056 : ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201 : struct splice_pipe_desc *spd)
202 : {
203 192056 : unsigned int spd_pages = spd->nr_pages;
204 192056 : unsigned int tail = pipe->tail;
205 192056 : unsigned int head = pipe->head;
206 192056 : unsigned int mask = pipe->ring_size - 1;
207 192056 : int ret = 0, page_nr = 0;
208 :
209 192056 : if (!spd_pages)
210 : return 0;
211 :
212 192056 : if (unlikely(!pipe->readers)) {
213 0 : send_sig(SIGPIPE, current, 0);
214 0 : ret = -EPIPE;
215 0 : goto out;
216 : }
217 :
218 214654 : while (!pipe_full(head, tail, pipe->max_usage)) {
219 214654 : struct pipe_buffer *buf = &pipe->bufs[head & mask];
220 :
221 214654 : buf->page = spd->pages[page_nr];
222 214654 : buf->offset = spd->partial[page_nr].offset;
223 214654 : buf->len = spd->partial[page_nr].len;
224 214654 : buf->private = spd->partial[page_nr].private;
225 214654 : buf->ops = spd->ops;
226 214654 : buf->flags = 0;
227 :
228 214654 : head++;
229 214654 : pipe->head = head;
230 214654 : page_nr++;
231 214654 : ret += buf->len;
232 :
233 214654 : if (!--spd->nr_pages)
234 : break;
235 : }
236 :
237 192056 : if (!ret)
238 0 : ret = -EAGAIN;
239 :
240 192056 : out:
241 192056 : while (page_nr < spd_pages)
242 0 : spd->spd_release(spd, page_nr++);
243 :
244 192056 : return ret;
245 : }
246 : EXPORT_SYMBOL_GPL(splice_to_pipe);
247 :
248 33065 : ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
249 : {
250 33065 : unsigned int head = pipe->head;
251 33065 : unsigned int tail = pipe->tail;
252 33065 : unsigned int mask = pipe->ring_size - 1;
253 33065 : int ret;
254 :
255 33065 : if (unlikely(!pipe->readers)) {
256 0 : send_sig(SIGPIPE, current, 0);
257 0 : ret = -EPIPE;
258 33065 : } else if (pipe_full(head, tail, pipe->max_usage)) {
259 : ret = -EAGAIN;
260 : } else {
261 33075 : pipe->bufs[head & mask] = *buf;
262 33075 : pipe->head = head + 1;
263 33075 : return buf->len;
264 : }
265 4 : pipe_buf_release(pipe, buf);
266 0 : return ret;
267 : }
268 : EXPORT_SYMBOL(add_to_pipe);
269 :
270 : /*
271 : * Check if we need to grow the arrays holding pages and partial page
272 : * descriptions.
273 : */
274 0 : int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
275 : {
276 0 : unsigned int max_usage = READ_ONCE(pipe->max_usage);
277 :
278 0 : spd->nr_pages_max = max_usage;
279 0 : if (max_usage <= PIPE_DEF_BUFFERS)
280 : return 0;
281 :
282 0 : spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
283 0 : spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
284 : GFP_KERNEL);
285 :
286 0 : if (spd->pages && spd->partial)
287 : return 0;
288 :
289 0 : kfree(spd->pages);
290 0 : kfree(spd->partial);
291 0 : return -ENOMEM;
292 : }
293 :
294 0 : void splice_shrink_spd(struct splice_pipe_desc *spd)
295 : {
296 0 : if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
297 : return;
298 :
299 0 : kfree(spd->pages);
300 0 : kfree(spd->partial);
301 : }
302 :
303 : /**
304 : * copy_splice_read - Copy data from a file and splice the copy into a pipe
305 : * @in: The file to read from
306 : * @ppos: Pointer to the file position to read from
307 : * @pipe: The pipe to splice into
308 : * @len: The amount to splice
309 : * @flags: The SPLICE_F_* flags
310 : *
311 : * This function allocates a bunch of pages sufficient to hold the requested
312 : * amount of data (but limited by the remaining pipe capacity), passes it to
313 : * the file's ->read_iter() to read into and then splices the used pages into
314 : * the pipe.
315 : *
316 : * Return: On success, the number of bytes read will be returned and *@ppos
317 : * will be updated if appropriate; 0 will be returned if there is no more data
318 : * to be read; -EAGAIN will be returned if the pipe had no space, and some
319 : * other negative error code will be returned on error. A short read may occur
320 : * if the pipe has insufficient space, we reach the end of the data or we hit a
321 : * hole.
322 : */
323 436468 : ssize_t copy_splice_read(struct file *in, loff_t *ppos,
324 : struct pipe_inode_info *pipe,
325 : size_t len, unsigned int flags)
326 : {
327 436468 : struct iov_iter to;
328 436468 : struct bio_vec *bv;
329 436468 : struct kiocb kiocb;
330 436468 : struct page **pages;
331 436468 : ssize_t ret;
332 436468 : size_t used, npages, chunk, remain, keep = 0;
333 436468 : int i;
334 :
335 : /* Work out how much data we can actually add into the pipe */
336 436468 : used = pipe_occupancy(pipe->head, pipe->tail);
337 436468 : npages = max_t(ssize_t, pipe->max_usage - used, 0);
338 436468 : len = min_t(size_t, len, npages * PAGE_SIZE);
339 436468 : npages = DIV_ROUND_UP(len, PAGE_SIZE);
340 :
341 436468 : bv = kzalloc(array_size(npages, sizeof(bv[0])) +
342 : array_size(npages, sizeof(struct page *)), GFP_KERNEL);
343 436468 : if (!bv)
344 : return -ENOMEM;
345 :
346 436468 : pages = (struct page **)(bv + npages);
347 436468 : npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
348 436468 : if (!npages) {
349 0 : kfree(bv);
350 0 : return -ENOMEM;
351 : }
352 :
353 436468 : remain = len = min_t(size_t, len, npages * PAGE_SIZE);
354 :
355 4554204 : for (i = 0; i < npages; i++) {
356 4117736 : chunk = min_t(size_t, PAGE_SIZE, remain);
357 4117736 : bv[i].bv_page = pages[i];
358 4117736 : bv[i].bv_offset = 0;
359 4117736 : bv[i].bv_len = chunk;
360 4117736 : remain -= chunk;
361 : }
362 :
363 : /* Do the I/O */
364 436468 : iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
365 436468 : init_sync_kiocb(&kiocb, in);
366 436468 : kiocb.ki_pos = *ppos;
367 436468 : ret = call_read_iter(in, &kiocb, &to);
368 :
369 436468 : if (ret > 0) {
370 436468 : keep = DIV_ROUND_UP(ret, PAGE_SIZE);
371 436468 : *ppos = kiocb.ki_pos;
372 : }
373 :
374 : /*
375 : * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
376 : * there", rather than -EFAULT.
377 : */
378 436468 : if (ret == -EFAULT)
379 0 : ret = -EAGAIN;
380 :
381 : /* Free any pages that didn't get touched at all. */
382 436468 : if (keep < npages)
383 0 : release_pages(pages + keep, npages - keep);
384 :
385 : /* Push the remaining pages into the pipe. */
386 436468 : remain = ret;
387 4554204 : for (i = 0; i < keep; i++) {
388 4117736 : struct pipe_buffer *buf = pipe_head_buf(pipe);
389 :
390 4117736 : chunk = min_t(size_t, remain, PAGE_SIZE);
391 4117736 : *buf = (struct pipe_buffer) {
392 : .ops = &default_pipe_buf_ops,
393 4117736 : .page = bv[i].bv_page,
394 : .offset = 0,
395 : .len = chunk,
396 : };
397 4117736 : pipe->head++;
398 4117736 : remain -= chunk;
399 : }
400 :
401 436468 : kfree(bv);
402 436468 : return ret;
403 : }
404 : EXPORT_SYMBOL(copy_splice_read);
405 :
406 : const struct pipe_buf_operations default_pipe_buf_ops = {
407 : .release = generic_pipe_buf_release,
408 : .try_steal = generic_pipe_buf_try_steal,
409 : .get = generic_pipe_buf_get,
410 : };
411 :
412 : /* Pipe buffer operations for a socket and similar. */
413 : const struct pipe_buf_operations nosteal_pipe_buf_ops = {
414 : .release = generic_pipe_buf_release,
415 : .get = generic_pipe_buf_get,
416 : };
417 : EXPORT_SYMBOL(nosteal_pipe_buf_ops);
418 :
419 5420619 : static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
420 : {
421 5420619 : smp_mb();
422 5421158 : if (waitqueue_active(&pipe->wr_wait))
423 512744 : wake_up_interruptible(&pipe->wr_wait);
424 5420688 : kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
425 5420349 : }
426 :
427 : /**
428 : * splice_from_pipe_feed - feed available data from a pipe to a file
429 : * @pipe: pipe to splice from
430 : * @sd: information to @actor
431 : * @actor: handler that splices the data
432 : *
433 : * Description:
434 : * This function loops over the pipe and calls @actor to do the
435 : * actual moving of a single struct pipe_buffer to the desired
436 : * destination. It returns when there's no more buffers left in
437 : * the pipe or if the requested number of bytes (@sd->total_len)
438 : * have been copied. It returns a positive number (one) if the
439 : * pipe needs to be filled with more data, zero if the required
440 : * number of bytes have been copied and -errno on error.
441 : *
442 : * This, together with splice_from_pipe_{begin,end,next}, may be
443 : * used to implement the functionality of __splice_from_pipe() when
444 : * locking is required around copying the pipe buffers to the
445 : * destination.
446 : */
447 2536865 : static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
448 : splice_actor *actor)
449 : {
450 2536865 : unsigned int head = pipe->head;
451 2536865 : unsigned int tail = pipe->tail;
452 2536865 : unsigned int mask = pipe->ring_size - 1;
453 2536865 : int ret;
454 :
455 14700355 : while (!pipe_empty(head, tail)) {
456 12162922 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
457 :
458 12162922 : sd->len = buf->len;
459 12162922 : if (sd->len > sd->total_len)
460 0 : sd->len = sd->total_len;
461 :
462 12162922 : ret = pipe_buf_confirm(pipe, buf);
463 12162475 : if (unlikely(ret)) {
464 0 : if (ret == -ENODATA)
465 0 : ret = 0;
466 0 : return ret;
467 : }
468 :
469 12162475 : ret = actor(pipe, buf, sd);
470 12162298 : if (ret <= 0)
471 0 : return ret;
472 :
473 12162298 : buf->offset += ret;
474 12162298 : buf->len -= ret;
475 :
476 12162298 : sd->num_spliced += ret;
477 12162298 : sd->len -= ret;
478 12162298 : sd->pos += ret;
479 12162298 : sd->total_len -= ret;
480 :
481 12162298 : if (!buf->len) {
482 12161843 : pipe_buf_release(pipe, buf);
483 12163035 : tail++;
484 12163035 : pipe->tail = tail;
485 12163035 : if (pipe->files)
486 12162914 : sd->need_wakeup = true;
487 : }
488 :
489 12163490 : if (!sd->total_len)
490 : return 0;
491 : }
492 :
493 : return 1;
494 : }
495 :
496 : /* We know we have a pipe buffer, but maybe it's empty? */
497 18704710 : static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
498 : {
499 18704710 : unsigned int tail = pipe->tail;
500 18704710 : unsigned int mask = pipe->ring_size - 1;
501 18704710 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
502 :
503 18704710 : if (unlikely(!buf->len)) {
504 0 : pipe_buf_release(pipe, buf);
505 0 : pipe->tail = tail+1;
506 0 : return true;
507 : }
508 :
509 : return false;
510 : }
511 :
512 : /**
513 : * splice_from_pipe_next - wait for some data to splice from
514 : * @pipe: pipe to splice from
515 : * @sd: information about the splice operation
516 : *
517 : * Description:
518 : * This function will wait for some data and return a positive
519 : * value (one) if pipe buffers are available. It will return zero
520 : * or -errno if no more data needs to be spliced.
521 : */
522 21739663 : static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
523 : {
524 : /*
525 : * Check for signal early to make process killable when there are
526 : * always buffers available
527 : */
528 21739663 : if (signal_pending(current))
529 : return -ERESTARTSYS;
530 :
531 21738946 : repeat:
532 24232356 : while (pipe_empty(pipe->head, pipe->tail)) {
533 5526963 : if (!pipe->writers)
534 : return 0;
535 :
536 5047747 : if (sd->num_spliced)
537 : return 0;
538 :
539 2493303 : if (sd->flags & SPLICE_F_NONBLOCK)
540 : return -EAGAIN;
541 :
542 2493303 : if (signal_pending(current))
543 : return -ERESTARTSYS;
544 :
545 2493302 : if (sd->need_wakeup) {
546 0 : wakeup_pipe_writers(pipe);
547 0 : sd->need_wakeup = false;
548 : }
549 :
550 2493302 : pipe_wait_readable(pipe);
551 : }
552 :
553 18705393 : if (eat_empty_buffer(pipe))
554 0 : goto repeat;
555 :
556 : return 1;
557 : }
558 :
559 : /**
560 : * splice_from_pipe_begin - start splicing from pipe
561 : * @sd: information about the splice operation
562 : *
563 : * Description:
564 : * This function should be called before a loop containing
565 : * splice_from_pipe_next() and splice_from_pipe_feed() to
566 : * initialize the necessary fields of @sd.
567 : */
568 : static void splice_from_pipe_begin(struct splice_desc *sd)
569 : {
570 19148297 : sd->num_spliced = 0;
571 19148297 : sd->need_wakeup = false;
572 19148297 : }
573 :
574 : /**
575 : * splice_from_pipe_end - finish splicing from pipe
576 : * @pipe: pipe to splice from
577 : * @sd: information about the splice operation
578 : *
579 : * Description:
580 : * This function will wake up pipe writers if necessary. It should
581 : * be called after a loop containing splice_from_pipe_next() and
582 : * splice_from_pipe_feed().
583 : */
584 19148109 : static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
585 : {
586 19148109 : if (sd->need_wakeup)
587 5167437 : wakeup_pipe_writers(pipe);
588 19148355 : }
589 :
590 : /**
591 : * __splice_from_pipe - splice data from a pipe to given actor
592 : * @pipe: pipe to splice from
593 : * @sd: information to @actor
594 : * @actor: handler that splices the data
595 : *
596 : * Description:
597 : * This function does little more than loop over the pipe and call
598 : * @actor to do the actual moving of a single struct pipe_buffer to
599 : * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
600 : * pipe_to_user.
601 : *
602 : */
603 3015963 : ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
604 : splice_actor *actor)
605 : {
606 3015963 : int ret;
607 :
608 3015963 : splice_from_pipe_begin(sd);
609 5553399 : do {
610 5553399 : cond_resched();
611 5553294 : ret = splice_from_pipe_next(pipe, sd);
612 5552876 : if (ret > 0)
613 2537071 : ret = splice_from_pipe_feed(pipe, sd, actor);
614 5553228 : } while (ret > 0);
615 3015792 : splice_from_pipe_end(pipe, sd);
616 :
617 3016053 : return sd->num_spliced ? sd->num_spliced : ret;
618 : }
619 : EXPORT_SYMBOL(__splice_from_pipe);
620 :
621 : /**
622 : * splice_from_pipe - splice data from a pipe to a file
623 : * @pipe: pipe to splice from
624 : * @out: file to splice to
625 : * @ppos: position in @out
626 : * @len: how many bytes to splice
627 : * @flags: splice modifier flags
628 : * @actor: handler that splices the data
629 : *
630 : * Description:
631 : * See __splice_from_pipe. This function locks the pipe inode,
632 : * otherwise it's identical to __splice_from_pipe().
633 : *
634 : */
635 3015790 : ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
636 : loff_t *ppos, size_t len, unsigned int flags,
637 : splice_actor *actor)
638 : {
639 3015790 : ssize_t ret;
640 3015790 : struct splice_desc sd = {
641 : .total_len = len,
642 : .flags = flags,
643 3015790 : .pos = *ppos,
644 : .u.file = out,
645 : };
646 :
647 3015790 : pipe_lock(pipe);
648 3016068 : ret = __splice_from_pipe(pipe, &sd, actor);
649 3016180 : pipe_unlock(pipe);
650 :
651 3016254 : return ret;
652 : }
653 :
654 : /**
655 : * iter_file_splice_write - splice data from a pipe to a file
656 : * @pipe: pipe info
657 : * @out: file to write to
658 : * @ppos: position in @out
659 : * @len: number of bytes to splice
660 : * @flags: splice modifier flags
661 : *
662 : * Description:
663 : * Will either move or copy pages (determined by @flags options) from
664 : * the given pipe inode to the given file.
665 : * This one is ->write_iter-based.
666 : *
667 : */
668 : ssize_t
669 16132315 : iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
670 : loff_t *ppos, size_t len, unsigned int flags)
671 : {
672 16132315 : struct splice_desc sd = {
673 : .total_len = len,
674 : .flags = flags,
675 16132315 : .pos = *ppos,
676 : .u.file = out,
677 : };
678 16132315 : int nbufs = pipe->max_usage;
679 16132315 : struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
680 : GFP_KERNEL);
681 16132317 : ssize_t ret;
682 :
683 16132317 : if (unlikely(!array))
684 : return -ENOMEM;
685 :
686 16132317 : pipe_lock(pipe);
687 :
688 16132334 : splice_from_pipe_begin(&sd);
689 31898280 : while (sd.total_len) {
690 16186410 : struct iov_iter from;
691 16186410 : unsigned int head, tail, mask;
692 16186410 : size_t left;
693 16186410 : int n;
694 :
695 16186410 : ret = splice_from_pipe_next(pipe, &sd);
696 16186080 : if (ret <= 0)
697 : break;
698 :
699 16168481 : if (unlikely(nbufs < pipe->max_usage)) {
700 0 : kfree(array);
701 0 : nbufs = pipe->max_usage;
702 0 : array = kcalloc(nbufs, sizeof(struct bio_vec),
703 : GFP_KERNEL);
704 0 : if (!array) {
705 : ret = -ENOMEM;
706 : break;
707 : }
708 : }
709 :
710 16168481 : head = pipe->head;
711 16168481 : tail = pipe->tail;
712 16168481 : mask = pipe->ring_size - 1;
713 :
714 : /* build the vector */
715 16168481 : left = sd.total_len;
716 192158228 : for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
717 175989301 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
718 175989301 : size_t this_len = buf->len;
719 :
720 : /* zero-length bvecs are not supported, skip them */
721 175989301 : if (!this_len)
722 0 : continue;
723 175989301 : this_len = min(this_len, left);
724 :
725 175989301 : ret = pipe_buf_confirm(pipe, buf);
726 175989747 : if (unlikely(ret)) {
727 0 : if (ret == -ENODATA)
728 0 : ret = 0;
729 0 : goto done;
730 : }
731 :
732 175989747 : bvec_set_page(&array[n], buf->page, this_len,
733 : buf->offset);
734 175989747 : left -= this_len;
735 175989747 : n++;
736 : }
737 :
738 16168927 : iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
739 16168801 : ret = vfs_iter_write(out, &from, &sd.pos, 0);
740 16168696 : if (ret <= 0)
741 : break;
742 :
743 15765815 : sd.num_spliced += ret;
744 15765815 : sd.total_len -= ret;
745 15765815 : *ppos = sd.pos;
746 :
747 : /* dismiss the fully eaten buffers, adjust the partial one */
748 15765815 : tail = pipe->tail;
749 186298931 : while (ret) {
750 170569526 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
751 170569526 : if (ret >= buf->len) {
752 170532985 : ret -= buf->len;
753 170532985 : buf->len = 0;
754 170532985 : pipe_buf_release(pipe, buf);
755 170533116 : tail++;
756 170533116 : pipe->tail = tail;
757 170533116 : if (pipe->files)
758 28827421 : sd.need_wakeup = true;
759 : } else {
760 36541 : buf->offset += ret;
761 36541 : buf->len -= ret;
762 36541 : ret = 0;
763 : }
764 : }
765 : }
766 15711870 : done:
767 16132350 : kfree(array);
768 16132503 : splice_from_pipe_end(pipe, &sd);
769 :
770 16132339 : pipe_unlock(pipe);
771 :
772 16132551 : if (sd.num_spliced)
773 15753982 : ret = sd.num_spliced;
774 :
775 : return ret;
776 : }
777 :
778 : EXPORT_SYMBOL(iter_file_splice_write);
779 :
780 : #ifdef CONFIG_NET
781 : /**
782 : * splice_to_socket - splice data from a pipe to a socket
783 : * @pipe: pipe to splice from
784 : * @out: socket to write to
785 : * @ppos: position in @out
786 : * @len: number of bytes to splice
787 : * @flags: splice modifier flags
788 : *
789 : * Description:
790 : * Will send @len bytes from the pipe to a network socket. No data copying
791 : * is involved.
792 : *
793 : */
794 197097 : ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
795 : loff_t *ppos, size_t len, unsigned int flags)
796 : {
797 197097 : struct socket *sock = sock_from_file(out);
798 197097 : struct bio_vec bvec[16];
799 197097 : struct msghdr msg = {};
800 197097 : ssize_t ret = 0;
801 197097 : size_t spliced = 0;
802 197097 : bool need_wakeup = false;
803 :
804 197097 : pipe_lock(pipe);
805 :
806 394195 : while (len > 0) {
807 203164 : unsigned int head, tail, mask, bc = 0;
808 203164 : size_t remain = len;
809 :
810 : /*
811 : * Check for signal early to make process killable when there
812 : * are always buffers available
813 : */
814 203164 : ret = -ERESTARTSYS;
815 203164 : if (signal_pending(current))
816 : break;
817 :
818 203164 : while (pipe_empty(pipe->head, pipe->tail)) {
819 0 : ret = 0;
820 0 : if (!pipe->writers)
821 0 : goto out;
822 :
823 0 : if (spliced)
824 0 : goto out;
825 :
826 0 : ret = -EAGAIN;
827 0 : if (flags & SPLICE_F_NONBLOCK)
828 0 : goto out;
829 :
830 0 : ret = -ERESTARTSYS;
831 0 : if (signal_pending(current))
832 0 : goto out;
833 :
834 0 : if (need_wakeup) {
835 0 : wakeup_pipe_writers(pipe);
836 0 : need_wakeup = false;
837 : }
838 :
839 0 : pipe_wait_readable(pipe);
840 : }
841 :
842 203164 : head = pipe->head;
843 203164 : tail = pipe->tail;
844 203164 : mask = pipe->ring_size - 1;
845 :
846 231621 : while (!pipe_empty(head, tail)) {
847 231621 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
848 231621 : size_t seg;
849 :
850 231621 : if (!buf->len) {
851 0 : tail++;
852 0 : continue;
853 : }
854 :
855 231621 : seg = min_t(size_t, remain, buf->len);
856 :
857 231621 : ret = pipe_buf_confirm(pipe, buf);
858 231621 : if (unlikely(ret)) {
859 0 : if (ret == -ENODATA)
860 0 : ret = 0;
861 : break;
862 : }
863 :
864 231621 : bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
865 231621 : remain -= seg;
866 231621 : if (remain == 0 || bc >= ARRAY_SIZE(bvec))
867 : break;
868 28457 : tail++;
869 : }
870 :
871 203164 : if (!bc)
872 : break;
873 :
874 203164 : msg.msg_flags = MSG_SPLICE_PAGES;
875 203164 : if (flags & SPLICE_F_MORE)
876 0 : msg.msg_flags |= MSG_MORE;
877 203164 : if (remain && pipe_occupancy(pipe->head, tail) > 0)
878 0 : msg.msg_flags |= MSG_MORE;
879 203164 : if (out->f_flags & O_NONBLOCK)
880 203164 : msg.msg_flags |= MSG_DONTWAIT;
881 :
882 203164 : iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
883 : len - remain);
884 203164 : ret = sock_sendmsg(sock, &msg);
885 203164 : if (ret <= 0)
886 : break;
887 :
888 197098 : spliced += ret;
889 197098 : len -= ret;
890 197098 : tail = pipe->tail;
891 417819 : while (ret > 0) {
892 220721 : struct pipe_buffer *buf = &pipe->bufs[tail & mask];
893 220721 : size_t seg = min_t(size_t, ret, buf->len);
894 :
895 220721 : buf->offset += seg;
896 220721 : buf->len -= seg;
897 220721 : ret -= seg;
898 :
899 220721 : if (!buf->len) {
900 214654 : pipe_buf_release(pipe, buf);
901 214654 : tail++;
902 : }
903 : }
904 :
905 197098 : if (tail != pipe->tail) {
906 194903 : pipe->tail = tail;
907 194903 : if (pipe->files)
908 194903 : need_wakeup = true;
909 : }
910 : }
911 :
912 197097 : out:
913 197097 : pipe_unlock(pipe);
914 197097 : if (need_wakeup)
915 194903 : wakeup_pipe_writers(pipe);
916 197097 : return spliced ?: ret;
917 : }
918 : #endif
919 :
920 : static int warn_unsupported(struct file *file, const char *op)
921 : {
922 : pr_debug_ratelimited(
923 : "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
924 : op, file, current->pid, current->comm);
925 : return -EINVAL;
926 : }
927 :
928 : /*
929 : * Attempt to initiate a splice from pipe to file.
930 : */
931 : static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
932 : loff_t *ppos, size_t len, unsigned int flags)
933 : {
934 19345110 : if (unlikely(!out->f_op->splice_write))
935 : return warn_unsupported(out, "write");
936 19345110 : return out->f_op->splice_write(pipe, out, ppos, len, flags);
937 : }
938 :
939 : /*
940 : * Indicate to the caller that there was a premature EOF when reading from the
941 : * source and the caller didn't indicate they would be sending more data after
942 : * this.
943 : */
944 : static void do_splice_eof(struct splice_desc *sd)
945 : {
946 0 : if (sd->splice_eof)
947 0 : sd->splice_eof(sd);
948 : }
949 :
950 : /**
951 : * vfs_splice_read - Read data from a file and splice it into a pipe
952 : * @in: File to splice from
953 : * @ppos: Input file offset
954 : * @pipe: Pipe to splice to
955 : * @len: Number of bytes to splice
956 : * @flags: Splice modifier flags (SPLICE_F_*)
957 : *
958 : * Splice the requested amount of data from the input file to the pipe. This
959 : * is synchronous as the caller must hold the pipe lock across the entire
960 : * operation.
961 : *
962 : * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
963 : * a hole and a negative error code otherwise.
964 : */
965 16318445 : long vfs_splice_read(struct file *in, loff_t *ppos,
966 : struct pipe_inode_info *pipe, size_t len,
967 : unsigned int flags)
968 : {
969 16318445 : unsigned int p_space;
970 16318445 : int ret;
971 :
972 16318445 : if (unlikely(!(in->f_mode & FMODE_READ)))
973 : return -EBADF;
974 16318445 : if (!len)
975 : return 0;
976 :
977 : /* Don't try to read more the pipe has space for. */
978 16318418 : p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
979 16318418 : len = min_t(size_t, len, p_space << PAGE_SHIFT);
980 :
981 16318418 : ret = rw_verify_area(READ, in, ppos, len);
982 16318412 : if (unlikely(ret < 0))
983 0 : return ret;
984 :
985 16318412 : if (unlikely(len > MAX_RW_COUNT))
986 0 : len = MAX_RW_COUNT;
987 :
988 16318412 : if (unlikely(!in->f_op->splice_read))
989 : return warn_unsupported(in, "read");
990 : /*
991 : * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
992 : * buffer, copy into it and splice that into the pipe.
993 : */
994 16318412 : if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
995 436468 : return copy_splice_read(in, ppos, pipe, len, flags);
996 15881944 : return in->f_op->splice_read(in, ppos, pipe, len, flags);
997 : }
998 : EXPORT_SYMBOL_GPL(vfs_splice_read);
999 :
1000 : /**
1001 : * splice_direct_to_actor - splices data directly between two non-pipes
1002 : * @in: file to splice from
1003 : * @sd: actor information on where to splice to
1004 : * @actor: handles the data splicing
1005 : *
1006 : * Description:
1007 : * This is a special case helper to splice directly between two
1008 : * points, without requiring an explicit pipe. Internally an allocated
1009 : * pipe is cached in the process, and reused during the lifetime of
1010 : * that process.
1011 : *
1012 : */
1013 9190306 : ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1014 : splice_direct_actor *actor)
1015 : {
1016 9190306 : struct pipe_inode_info *pipe;
1017 9190306 : long ret, bytes;
1018 9190306 : size_t len;
1019 9190306 : int i, flags, more;
1020 :
1021 : /*
1022 : * We require the input to be seekable, as we don't want to randomly
1023 : * drop data for eg socket -> socket splicing. Use the piped splicing
1024 : * for that!
1025 : */
1026 9190306 : if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1027 : return -EINVAL;
1028 :
1029 : /*
1030 : * neither in nor out is a pipe, setup an internal pipe attached to
1031 : * 'out' and transfer the wanted data from 'in' to 'out' through that
1032 : */
1033 9190306 : pipe = current->splice_pipe;
1034 9190306 : if (unlikely(!pipe)) {
1035 50142 : pipe = alloc_pipe_info();
1036 50147 : if (!pipe)
1037 : return -ENOMEM;
1038 :
1039 : /*
1040 : * We don't have an immediate reader, but we'll read the stuff
1041 : * out of the pipe right after the splice_to_pipe(). So set
1042 : * PIPE_READERS appropriately.
1043 : */
1044 50147 : pipe->readers = 1;
1045 :
1046 50147 : current->splice_pipe = pipe;
1047 : }
1048 :
1049 : /*
1050 : * Do the splice.
1051 : */
1052 9190311 : bytes = 0;
1053 9190311 : len = sd->total_len;
1054 :
1055 : /* Don't block on output, we have to drain the direct pipe. */
1056 9190311 : flags = sd->flags;
1057 9190311 : sd->flags &= ~SPLICE_F_NONBLOCK;
1058 :
1059 : /*
1060 : * We signal MORE until we've read sufficient data to fulfill the
1061 : * request and we keep signalling it if the caller set it.
1062 : */
1063 9190311 : more = sd->flags & SPLICE_F_MORE;
1064 9190311 : sd->flags |= SPLICE_F_MORE;
1065 :
1066 9190311 : WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1067 :
1068 22291011 : while (len) {
1069 13423137 : size_t read_len;
1070 13423137 : loff_t pos = sd->pos, prev_pos = pos;
1071 :
1072 13423137 : ret = vfs_splice_read(in, &pos, pipe, len, flags);
1073 13423455 : if (unlikely(ret <= 0))
1074 171 : goto read_failure;
1075 :
1076 13423284 : read_len = ret;
1077 13423284 : sd->total_len = read_len;
1078 :
1079 : /*
1080 : * If we now have sufficient data to fulfill the request then
1081 : * we clear SPLICE_F_MORE if it was not set initially.
1082 : */
1083 13423284 : if (read_len >= len && !more)
1084 9023131 : sd->flags &= ~SPLICE_F_MORE;
1085 :
1086 : /*
1087 : * NOTE: nonblocking mode only applies to the input. We
1088 : * must not do the output in nonblocking mode as then we
1089 : * could get stuck data in the internal pipe:
1090 : */
1091 13423284 : ret = actor(pipe, sd);
1092 13423244 : if (unlikely(ret <= 0)) {
1093 302792 : sd->pos = prev_pos;
1094 322544 : goto out_release;
1095 : }
1096 :
1097 13120452 : bytes += ret;
1098 13120452 : len -= ret;
1099 13120452 : sd->pos = pos;
1100 :
1101 13120452 : if (ret < read_len) {
1102 19752 : sd->pos = prev_pos + ret;
1103 19752 : goto out_release;
1104 : }
1105 : }
1106 :
1107 8867874 : done:
1108 9190674 : pipe->tail = pipe->head = 0;
1109 9190674 : file_accessed(in);
1110 9190674 : return bytes;
1111 :
1112 : read_failure:
1113 : /*
1114 : * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1115 : * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1116 : * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1117 : * least 1 byte *then* we will also do the ->splice_eof() call.
1118 : */
1119 171 : if (ret == 0 && !more && len > 0 && bytes)
1120 0 : do_splice_eof(sd);
1121 171 : out_release:
1122 : /*
1123 : * If we did an incomplete transfer we must release
1124 : * the pipe buffers in question:
1125 : */
1126 5485475 : for (i = 0; i < pipe->ring_size; i++) {
1127 5162675 : struct pipe_buffer *buf = &pipe->bufs[i];
1128 :
1129 5162675 : if (buf->ops)
1130 4108372 : pipe_buf_release(pipe, buf);
1131 : }
1132 :
1133 322800 : if (!bytes)
1134 302803 : bytes = ret;
1135 :
1136 322800 : goto done;
1137 : }
1138 : EXPORT_SYMBOL(splice_direct_to_actor);
1139 :
1140 13423171 : static int direct_splice_actor(struct pipe_inode_info *pipe,
1141 : struct splice_desc *sd)
1142 : {
1143 13423171 : struct file *file = sd->u.file;
1144 :
1145 13423171 : return do_splice_from(pipe, file, sd->opos, sd->total_len,
1146 : sd->flags);
1147 : }
1148 :
1149 0 : static void direct_file_splice_eof(struct splice_desc *sd)
1150 : {
1151 0 : struct file *file = sd->u.file;
1152 :
1153 0 : if (file->f_op->splice_eof)
1154 0 : file->f_op->splice_eof(file);
1155 0 : }
1156 :
1157 : /**
1158 : * do_splice_direct - splices data directly between two files
1159 : * @in: file to splice from
1160 : * @ppos: input file offset
1161 : * @out: file to splice to
1162 : * @opos: output file offset
1163 : * @len: number of bytes to splice
1164 : * @flags: splice modifier flags
1165 : *
1166 : * Description:
1167 : * For use by do_sendfile(). splice can easily emulate sendfile, but
1168 : * doing it in the application would incur an extra system call
1169 : * (splice in + splice out, as compared to just sendfile()). So this helper
1170 : * can splice directly through a process-private pipe.
1171 : *
1172 : */
1173 9190356 : long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1174 : loff_t *opos, size_t len, unsigned int flags)
1175 : {
1176 9190356 : struct splice_desc sd = {
1177 : .len = len,
1178 : .total_len = len,
1179 : .flags = flags,
1180 9190356 : .pos = *ppos,
1181 : .u.file = out,
1182 : .splice_eof = direct_file_splice_eof,
1183 : .opos = opos,
1184 : };
1185 9190356 : long ret;
1186 :
1187 9190356 : if (unlikely(!(out->f_mode & FMODE_WRITE)))
1188 : return -EBADF;
1189 :
1190 9190356 : if (unlikely(out->f_flags & O_APPEND))
1191 : return -EINVAL;
1192 :
1193 9190356 : ret = rw_verify_area(WRITE, out, opos, len);
1194 9190004 : if (unlikely(ret < 0))
1195 : return ret;
1196 :
1197 9190004 : ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1198 9190540 : if (ret > 0)
1199 8887847 : *ppos = sd.pos;
1200 :
1201 : return ret;
1202 : }
1203 : EXPORT_SYMBOL(do_splice_direct);
1204 :
1205 2911697 : static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1206 : {
1207 2911749 : for (;;) {
1208 2911723 : if (unlikely(!pipe->readers)) {
1209 24 : send_sig(SIGPIPE, current, 0);
1210 24 : return -EPIPE;
1211 : }
1212 2911699 : if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1213 : return 0;
1214 97 : if (flags & SPLICE_F_NONBLOCK)
1215 : return -EAGAIN;
1216 97 : if (signal_pending(current))
1217 : return -ERESTARTSYS;
1218 26 : pipe_wait_writable(pipe);
1219 : }
1220 : }
1221 :
1222 : static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1223 : struct pipe_inode_info *opipe,
1224 : size_t len, unsigned int flags);
1225 :
1226 2895138 : long splice_file_to_pipe(struct file *in,
1227 : struct pipe_inode_info *opipe,
1228 : loff_t *offset,
1229 : size_t len, unsigned int flags)
1230 : {
1231 2895138 : long ret;
1232 :
1233 2895138 : pipe_lock(opipe);
1234 2895142 : ret = wait_for_space(opipe, flags);
1235 2895081 : if (!ret)
1236 2895058 : ret = vfs_splice_read(in, offset, opipe, len, flags);
1237 2895184 : pipe_unlock(opipe);
1238 2895187 : if (ret > 0)
1239 2861136 : wakeup_pipe_readers(opipe);
1240 2895145 : return ret;
1241 : }
1242 :
1243 : /*
1244 : * Determine where to splice to/from.
1245 : */
1246 8875309 : long do_splice(struct file *in, loff_t *off_in, struct file *out,
1247 : loff_t *off_out, size_t len, unsigned int flags)
1248 : {
1249 8875309 : struct pipe_inode_info *ipipe;
1250 8875309 : struct pipe_inode_info *opipe;
1251 8875309 : loff_t offset;
1252 8875309 : long ret;
1253 :
1254 8875309 : if (unlikely(!(in->f_mode & FMODE_READ) ||
1255 : !(out->f_mode & FMODE_WRITE)))
1256 : return -EBADF;
1257 :
1258 8875309 : ipipe = get_pipe_info(in, true);
1259 8874795 : opipe = get_pipe_info(out, true);
1260 :
1261 8874904 : if (ipipe && opipe) {
1262 58155 : if (off_in || off_out)
1263 : return -ESPIPE;
1264 :
1265 : /* Splicing to self would be fun, but... */
1266 58155 : if (ipipe == opipe)
1267 : return -EINVAL;
1268 :
1269 58155 : if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1270 0 : flags |= SPLICE_F_NONBLOCK;
1271 :
1272 58155 : return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1273 : }
1274 :
1275 8816749 : if (ipipe) {
1276 5921627 : if (off_in)
1277 : return -ESPIPE;
1278 5921627 : if (off_out) {
1279 2691464 : if (!(out->f_mode & FMODE_PWRITE))
1280 : return -EINVAL;
1281 2691464 : offset = *off_out;
1282 : } else {
1283 3230163 : offset = out->f_pos;
1284 : }
1285 :
1286 5921627 : if (unlikely(out->f_flags & O_APPEND))
1287 : return -EINVAL;
1288 :
1289 5921627 : ret = rw_verify_area(WRITE, out, &offset, len);
1290 5921638 : if (unlikely(ret < 0))
1291 : return ret;
1292 :
1293 5921638 : if (in->f_flags & O_NONBLOCK)
1294 197097 : flags |= SPLICE_F_NONBLOCK;
1295 :
1296 5921638 : file_start_write(out);
1297 5921939 : ret = do_splice_from(ipipe, out, &offset, len, flags);
1298 5922651 : file_end_write(out);
1299 :
1300 5922636 : if (ret > 0)
1301 5367921 : fsnotify_modify(out);
1302 :
1303 5922192 : if (!off_out)
1304 3230671 : out->f_pos = offset;
1305 : else
1306 2691521 : *off_out = offset;
1307 :
1308 5922192 : return ret;
1309 : }
1310 :
1311 2895122 : if (opipe) {
1312 2895122 : if (off_out)
1313 : return -ESPIPE;
1314 2895122 : if (off_in) {
1315 2669991 : if (!(in->f_mode & FMODE_PREAD))
1316 : return -EINVAL;
1317 2669991 : offset = *off_in;
1318 : } else {
1319 225131 : offset = in->f_pos;
1320 : }
1321 :
1322 2895122 : if (out->f_flags & O_NONBLOCK)
1323 225027 : flags |= SPLICE_F_NONBLOCK;
1324 :
1325 2895122 : ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1326 :
1327 2895155 : if (ret > 0)
1328 2861102 : fsnotify_access(in);
1329 :
1330 2895151 : if (!off_in)
1331 225131 : in->f_pos = offset;
1332 : else
1333 2670020 : *off_in = offset;
1334 :
1335 2895151 : return ret;
1336 : }
1337 :
1338 : return -EINVAL;
1339 : }
1340 :
1341 8876073 : static long __do_splice(struct file *in, loff_t __user *off_in,
1342 : struct file *out, loff_t __user *off_out,
1343 : size_t len, unsigned int flags)
1344 : {
1345 8876073 : struct pipe_inode_info *ipipe;
1346 8876073 : struct pipe_inode_info *opipe;
1347 8876073 : loff_t offset, *__off_in = NULL, *__off_out = NULL;
1348 8876073 : long ret;
1349 :
1350 8876073 : ipipe = get_pipe_info(in, true);
1351 8875807 : opipe = get_pipe_info(out, true);
1352 :
1353 8875876 : if (ipipe) {
1354 5980752 : if (off_in)
1355 : return -ESPIPE;
1356 5980752 : pipe_clear_nowait(in);
1357 : }
1358 8875353 : if (opipe) {
1359 2953273 : if (off_out)
1360 : return -ESPIPE;
1361 2953273 : pipe_clear_nowait(out);
1362 : }
1363 :
1364 8875587 : if (off_out) {
1365 2691523 : if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1366 : return -EFAULT;
1367 : __off_out = &offset;
1368 : }
1369 8875571 : if (off_in) {
1370 2670050 : if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1371 : return -EFAULT;
1372 : __off_in = &offset;
1373 : }
1374 :
1375 8875562 : ret = do_splice(in, __off_in, out, __off_out, len, flags);
1376 8875601 : if (ret < 0)
1377 : return ret;
1378 :
1379 11415398 : if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1380 0 : return -EFAULT;
1381 11469524 : if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1382 0 : return -EFAULT;
1383 :
1384 : return ret;
1385 : }
1386 :
1387 16557 : static int iter_to_pipe(struct iov_iter *from,
1388 : struct pipe_inode_info *pipe,
1389 : unsigned flags)
1390 : {
1391 16557 : struct pipe_buffer buf = {
1392 : .ops = &user_page_pipe_buf_ops,
1393 : .flags = flags
1394 : };
1395 16557 : size_t total = 0;
1396 16557 : int ret = 0;
1397 :
1398 33133 : while (iov_iter_count(from)) {
1399 16559 : struct page *pages[16];
1400 16559 : ssize_t left;
1401 16559 : size_t start;
1402 16559 : int i, n;
1403 :
1404 16559 : left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1405 16569 : if (left <= 0) {
1406 0 : ret = left;
1407 0 : break;
1408 : }
1409 :
1410 16569 : n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1411 49621 : for (i = 0; i < n; i++) {
1412 33045 : int size = min_t(int, left, PAGE_SIZE - start);
1413 :
1414 33045 : buf.page = pages[i];
1415 33050 : buf.offset = start;
1416 33050 : buf.len = size;
1417 33050 : ret = add_to_pipe(pipe, &buf);
1418 33052 : if (unlikely(ret < 0)) {
1419 0 : iov_iter_revert(from, left);
1420 : // this one got dropped by add_to_pipe()
1421 0 : while (++i < n)
1422 0 : put_page(pages[i]);
1423 0 : goto out;
1424 : }
1425 33052 : total += ret;
1426 33052 : left -= size;
1427 33052 : start = 0;
1428 : }
1429 : }
1430 16574 : out:
1431 16574 : return total ? total : ret;
1432 : }
1433 :
1434 0 : static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1435 : struct splice_desc *sd)
1436 : {
1437 0 : int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1438 0 : return n == sd->len ? n : -EFAULT;
1439 : }
1440 :
1441 : /*
1442 : * For lack of a better implementation, implement vmsplice() to userspace
1443 : * as a simple copy of the pipes pages to the user iov.
1444 : */
1445 0 : static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1446 : unsigned int flags)
1447 : {
1448 0 : struct pipe_inode_info *pipe = get_pipe_info(file, true);
1449 0 : struct splice_desc sd = {
1450 : .total_len = iov_iter_count(iter),
1451 : .flags = flags,
1452 : .u.data = iter
1453 : };
1454 0 : long ret = 0;
1455 :
1456 0 : if (!pipe)
1457 : return -EBADF;
1458 :
1459 0 : pipe_clear_nowait(file);
1460 :
1461 0 : if (sd.total_len) {
1462 0 : pipe_lock(pipe);
1463 0 : ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1464 0 : pipe_unlock(pipe);
1465 : }
1466 :
1467 : return ret;
1468 : }
1469 :
1470 : /*
1471 : * vmsplice splices a user address range into a pipe. It can be thought of
1472 : * as splice-from-memory, where the regular splice is splice-from-file (or
1473 : * to file). In both cases the output is a pipe, naturally.
1474 : */
1475 16573 : static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1476 : unsigned int flags)
1477 : {
1478 16573 : struct pipe_inode_info *pipe;
1479 16573 : long ret = 0;
1480 16573 : unsigned buf_flag = 0;
1481 :
1482 16573 : if (flags & SPLICE_F_GIFT)
1483 0 : buf_flag = PIPE_BUF_FLAG_GIFT;
1484 :
1485 16573 : pipe = get_pipe_info(file, true);
1486 16568 : if (!pipe)
1487 : return -EBADF;
1488 :
1489 16568 : pipe_clear_nowait(file);
1490 :
1491 16540 : pipe_lock(pipe);
1492 16573 : ret = wait_for_space(pipe, flags);
1493 16547 : if (!ret)
1494 16557 : ret = iter_to_pipe(iter, pipe, buf_flag);
1495 16559 : pipe_unlock(pipe);
1496 16629 : if (ret > 0)
1497 16629 : wakeup_pipe_readers(pipe);
1498 : return ret;
1499 : }
1500 :
1501 16578 : static int vmsplice_type(struct fd f, int *type)
1502 : {
1503 16578 : if (!f.file)
1504 : return -EBADF;
1505 16578 : if (f.file->f_mode & FMODE_WRITE) {
1506 16578 : *type = ITER_SOURCE;
1507 0 : } else if (f.file->f_mode & FMODE_READ) {
1508 0 : *type = ITER_DEST;
1509 : } else {
1510 0 : fdput(f);
1511 0 : return -EBADF;
1512 : }
1513 : return 0;
1514 : }
1515 :
1516 : /*
1517 : * Note that vmsplice only really supports true splicing _from_ user memory
1518 : * to a pipe, not the other way around. Splicing from user memory is a simple
1519 : * operation that can be supported without any funky alignment restrictions
1520 : * or nasty vm tricks. We simply map in the user memory and fill them into
1521 : * a pipe. The reverse isn't quite as easy, though. There are two possible
1522 : * solutions for that:
1523 : *
1524 : * - memcpy() the data internally, at which point we might as well just
1525 : * do a regular read() on the buffer anyway.
1526 : * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1527 : * has restriction limitations on both ends of the pipe).
1528 : *
1529 : * Currently we punt and implement it as a normal copy, see pipe_to_user().
1530 : *
1531 : */
1532 33158 : SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1533 : unsigned long, nr_segs, unsigned int, flags)
1534 : {
1535 16569 : struct iovec iovstack[UIO_FASTIOV];
1536 16569 : struct iovec *iov = iovstack;
1537 16569 : struct iov_iter iter;
1538 16569 : ssize_t error;
1539 16569 : struct fd f;
1540 16569 : int type;
1541 :
1542 16569 : if (unlikely(flags & ~SPLICE_F_ALL))
1543 : return -EINVAL;
1544 :
1545 16569 : f = fdget(fd);
1546 16578 : error = vmsplice_type(f, &type);
1547 16579 : if (error)
1548 : return error;
1549 :
1550 16579 : error = import_iovec(type, uiov, nr_segs,
1551 : ARRAY_SIZE(iovstack), &iov, &iter);
1552 16539 : if (error < 0)
1553 0 : goto out_fdput;
1554 :
1555 16539 : if (!iov_iter_count(&iter))
1556 : error = 0;
1557 16576 : else if (type == ITER_SOURCE)
1558 16576 : error = vmsplice_to_pipe(f.file, &iter, flags);
1559 : else
1560 0 : error = vmsplice_to_user(f.file, &iter, flags);
1561 :
1562 16549 : kfree(iov);
1563 16552 : out_fdput:
1564 16552 : fdput(f);
1565 16552 : return error;
1566 : }
1567 :
1568 17750958 : SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1569 : int, fd_out, loff_t __user *, off_out,
1570 : size_t, len, unsigned int, flags)
1571 : {
1572 8875305 : struct fd in, out;
1573 8875305 : long error;
1574 :
1575 8875305 : if (unlikely(!len))
1576 : return 0;
1577 :
1578 8875305 : if (unlikely(flags & ~SPLICE_F_ALL))
1579 : return -EINVAL;
1580 :
1581 8875305 : error = -EBADF;
1582 8875305 : in = fdget(fd_in);
1583 8876090 : if (in.file) {
1584 8876090 : out = fdget(fd_out);
1585 8876121 : if (out.file) {
1586 8876121 : error = __do_splice(in.file, off_in, out.file, off_out,
1587 : len, flags);
1588 8875541 : fdput(out);
1589 : }
1590 8876286 : fdput(in);
1591 : }
1592 : return error;
1593 : }
1594 :
1595 : /*
1596 : * Make sure there's data to read. Wait for input if we can, otherwise
1597 : * return an appropriate error.
1598 : */
1599 58155 : static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1600 : {
1601 58155 : int ret;
1602 :
1603 : /*
1604 : * Check the pipe occupancy without the inode lock first. This function
1605 : * is speculative anyways, so missing one is ok.
1606 : */
1607 58155 : if (!pipe_empty(pipe->head, pipe->tail))
1608 : return 0;
1609 :
1610 28961 : ret = 0;
1611 28961 : pipe_lock(pipe);
1612 :
1613 57612 : while (pipe_empty(pipe->head, pipe->tail)) {
1614 28721 : if (signal_pending(current)) {
1615 : ret = -ERESTARTSYS;
1616 : break;
1617 : }
1618 28721 : if (!pipe->writers)
1619 : break;
1620 28651 : if (flags & SPLICE_F_NONBLOCK) {
1621 : ret = -EAGAIN;
1622 : break;
1623 : }
1624 28651 : pipe_wait_readable(pipe);
1625 : }
1626 :
1627 28961 : pipe_unlock(pipe);
1628 28961 : return ret;
1629 : }
1630 :
1631 : /*
1632 : * Make sure there's writeable room. Wait for room if we can, otherwise
1633 : * return an appropriate error.
1634 : */
1635 58155 : static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1636 : {
1637 58155 : int ret;
1638 :
1639 : /*
1640 : * Check pipe occupancy without the inode lock first. This function
1641 : * is speculative anyways, so missing one is ok.
1642 : */
1643 58155 : if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1644 : return 0;
1645 :
1646 29028 : ret = 0;
1647 29028 : pipe_lock(pipe);
1648 :
1649 58008 : while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1650 28980 : if (!pipe->readers) {
1651 0 : send_sig(SIGPIPE, current, 0);
1652 0 : ret = -EPIPE;
1653 0 : break;
1654 : }
1655 28980 : if (flags & SPLICE_F_NONBLOCK) {
1656 : ret = -EAGAIN;
1657 : break;
1658 : }
1659 28980 : if (signal_pending(current)) {
1660 : ret = -ERESTARTSYS;
1661 : break;
1662 : }
1663 28980 : pipe_wait_writable(pipe);
1664 : }
1665 :
1666 29028 : pipe_unlock(pipe);
1667 29028 : return ret;
1668 : }
1669 :
1670 : /*
1671 : * Splice contents of ipipe to opipe.
1672 : */
1673 58155 : static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1674 : struct pipe_inode_info *opipe,
1675 : size_t len, unsigned int flags)
1676 : {
1677 58155 : struct pipe_buffer *ibuf, *obuf;
1678 58155 : unsigned int i_head, o_head;
1679 58155 : unsigned int i_tail, o_tail;
1680 58155 : unsigned int i_mask, o_mask;
1681 58155 : int ret = 0;
1682 58155 : bool input_wakeup = false;
1683 :
1684 :
1685 58155 : retry:
1686 58155 : ret = ipipe_prep(ipipe, flags);
1687 58155 : if (ret)
1688 0 : return ret;
1689 :
1690 58155 : ret = opipe_prep(opipe, flags);
1691 58155 : if (ret)
1692 0 : return ret;
1693 :
1694 : /*
1695 : * Potential ABBA deadlock, work around it by ordering lock
1696 : * grabbing by pipe info address. Otherwise two different processes
1697 : * could deadlock (one doing tee from A -> B, the other from B -> A).
1698 : */
1699 58155 : pipe_double_lock(ipipe, opipe);
1700 :
1701 58155 : i_tail = ipipe->tail;
1702 58155 : i_mask = ipipe->ring_size - 1;
1703 58155 : o_head = opipe->head;
1704 58155 : o_mask = opipe->ring_size - 1;
1705 :
1706 481097 : do {
1707 481097 : size_t o_len;
1708 :
1709 481097 : if (!opipe->readers) {
1710 0 : send_sig(SIGPIPE, current, 0);
1711 0 : if (!ret)
1712 0 : ret = -EPIPE;
1713 : break;
1714 : }
1715 :
1716 481097 : i_head = ipipe->head;
1717 481097 : o_tail = opipe->tail;
1718 :
1719 481097 : if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1720 : break;
1721 :
1722 : /*
1723 : * Cannot make any progress, because either the input
1724 : * pipe is empty or the output pipe is full.
1725 : */
1726 480969 : if (pipe_empty(i_head, i_tail) ||
1727 451424 : pipe_full(o_head, o_tail, opipe->max_usage)) {
1728 : /* Already processed some buffers, break */
1729 57954 : if (ret)
1730 : break;
1731 :
1732 0 : if (flags & SPLICE_F_NONBLOCK) {
1733 : ret = -EAGAIN;
1734 : break;
1735 : }
1736 :
1737 : /*
1738 : * We raced with another reader/writer and haven't
1739 : * managed to process any buffers. A zero return
1740 : * value means EOF, so retry instead.
1741 : */
1742 0 : pipe_unlock(ipipe);
1743 0 : pipe_unlock(opipe);
1744 0 : goto retry;
1745 : }
1746 :
1747 423015 : ibuf = &ipipe->bufs[i_tail & i_mask];
1748 423015 : obuf = &opipe->bufs[o_head & o_mask];
1749 :
1750 423015 : if (len >= ibuf->len) {
1751 : /*
1752 : * Simply move the whole buffer from ipipe to opipe
1753 : */
1754 423014 : *obuf = *ibuf;
1755 423014 : ibuf->ops = NULL;
1756 423014 : i_tail++;
1757 423014 : ipipe->tail = i_tail;
1758 423014 : input_wakeup = true;
1759 423014 : o_len = obuf->len;
1760 423014 : o_head++;
1761 423014 : opipe->head = o_head;
1762 : } else {
1763 : /*
1764 : * Get a reference to this pipe buffer,
1765 : * so we can copy the contents over.
1766 : */
1767 1 : if (!pipe_buf_get(ipipe, ibuf)) {
1768 0 : if (ret == 0)
1769 0 : ret = -EFAULT;
1770 : break;
1771 : }
1772 0 : *obuf = *ibuf;
1773 :
1774 : /*
1775 : * Don't inherit the gift and merge flags, we need to
1776 : * prevent multiple steals of this page.
1777 : */
1778 0 : obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1779 0 : obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1780 :
1781 0 : obuf->len = len;
1782 0 : ibuf->offset += len;
1783 0 : ibuf->len -= len;
1784 0 : o_len = len;
1785 0 : o_head++;
1786 0 : opipe->head = o_head;
1787 : }
1788 423014 : ret += o_len;
1789 423014 : len -= o_len;
1790 423014 : } while (len);
1791 :
1792 58154 : pipe_unlock(ipipe);
1793 58155 : pipe_unlock(opipe);
1794 :
1795 : /*
1796 : * If we put data in the output pipe, wakeup any potential readers.
1797 : */
1798 58155 : if (ret > 0)
1799 58085 : wakeup_pipe_readers(opipe);
1800 :
1801 58155 : if (input_wakeup)
1802 58085 : wakeup_pipe_writers(ipipe);
1803 :
1804 : return ret;
1805 : }
1806 :
1807 : /*
1808 : * Link contents of ipipe to opipe.
1809 : */
1810 0 : static int link_pipe(struct pipe_inode_info *ipipe,
1811 : struct pipe_inode_info *opipe,
1812 : size_t len, unsigned int flags)
1813 : {
1814 0 : struct pipe_buffer *ibuf, *obuf;
1815 0 : unsigned int i_head, o_head;
1816 0 : unsigned int i_tail, o_tail;
1817 0 : unsigned int i_mask, o_mask;
1818 0 : int ret = 0;
1819 :
1820 : /*
1821 : * Potential ABBA deadlock, work around it by ordering lock
1822 : * grabbing by pipe info address. Otherwise two different processes
1823 : * could deadlock (one doing tee from A -> B, the other from B -> A).
1824 : */
1825 0 : pipe_double_lock(ipipe, opipe);
1826 :
1827 0 : i_tail = ipipe->tail;
1828 0 : i_mask = ipipe->ring_size - 1;
1829 0 : o_head = opipe->head;
1830 0 : o_mask = opipe->ring_size - 1;
1831 :
1832 0 : do {
1833 0 : if (!opipe->readers) {
1834 0 : send_sig(SIGPIPE, current, 0);
1835 0 : if (!ret)
1836 0 : ret = -EPIPE;
1837 : break;
1838 : }
1839 :
1840 0 : i_head = ipipe->head;
1841 0 : o_tail = opipe->tail;
1842 :
1843 : /*
1844 : * If we have iterated all input buffers or run out of
1845 : * output room, break.
1846 : */
1847 0 : if (pipe_empty(i_head, i_tail) ||
1848 0 : pipe_full(o_head, o_tail, opipe->max_usage))
1849 : break;
1850 :
1851 0 : ibuf = &ipipe->bufs[i_tail & i_mask];
1852 0 : obuf = &opipe->bufs[o_head & o_mask];
1853 :
1854 : /*
1855 : * Get a reference to this pipe buffer,
1856 : * so we can copy the contents over.
1857 : */
1858 0 : if (!pipe_buf_get(ipipe, ibuf)) {
1859 0 : if (ret == 0)
1860 0 : ret = -EFAULT;
1861 : break;
1862 : }
1863 :
1864 0 : *obuf = *ibuf;
1865 :
1866 : /*
1867 : * Don't inherit the gift and merge flag, we need to prevent
1868 : * multiple steals of this page.
1869 : */
1870 0 : obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1871 0 : obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1872 :
1873 0 : if (obuf->len > len)
1874 0 : obuf->len = len;
1875 0 : ret += obuf->len;
1876 0 : len -= obuf->len;
1877 :
1878 0 : o_head++;
1879 0 : opipe->head = o_head;
1880 0 : i_tail++;
1881 0 : } while (len);
1882 :
1883 0 : pipe_unlock(ipipe);
1884 0 : pipe_unlock(opipe);
1885 :
1886 : /*
1887 : * If we put data in the output pipe, wakeup any potential readers.
1888 : */
1889 0 : if (ret > 0)
1890 0 : wakeup_pipe_readers(opipe);
1891 :
1892 0 : return ret;
1893 : }
1894 :
1895 : /*
1896 : * This is a tee(1) implementation that works on pipes. It doesn't copy
1897 : * any data, it simply references the 'in' pages on the 'out' pipe.
1898 : * The 'flags' used are the SPLICE_F_* variants, currently the only
1899 : * applicable one is SPLICE_F_NONBLOCK.
1900 : */
1901 0 : long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1902 : {
1903 0 : struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1904 0 : struct pipe_inode_info *opipe = get_pipe_info(out, true);
1905 0 : int ret = -EINVAL;
1906 :
1907 0 : if (unlikely(!(in->f_mode & FMODE_READ) ||
1908 : !(out->f_mode & FMODE_WRITE)))
1909 : return -EBADF;
1910 :
1911 : /*
1912 : * Duplicate the contents of ipipe to opipe without actually
1913 : * copying the data.
1914 : */
1915 0 : if (ipipe && opipe && ipipe != opipe) {
1916 0 : if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1917 0 : flags |= SPLICE_F_NONBLOCK;
1918 :
1919 : /*
1920 : * Keep going, unless we encounter an error. The ipipe/opipe
1921 : * ordering doesn't really matter.
1922 : */
1923 0 : ret = ipipe_prep(ipipe, flags);
1924 0 : if (!ret) {
1925 0 : ret = opipe_prep(opipe, flags);
1926 0 : if (!ret)
1927 0 : ret = link_pipe(ipipe, opipe, len, flags);
1928 : }
1929 : }
1930 :
1931 0 : return ret;
1932 : }
1933 :
1934 0 : SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1935 : {
1936 0 : struct fd in, out;
1937 0 : int error;
1938 :
1939 0 : if (unlikely(flags & ~SPLICE_F_ALL))
1940 : return -EINVAL;
1941 :
1942 0 : if (unlikely(!len))
1943 : return 0;
1944 :
1945 0 : error = -EBADF;
1946 0 : in = fdget(fdin);
1947 0 : if (in.file) {
1948 0 : out = fdget(fdout);
1949 0 : if (out.file) {
1950 0 : error = do_tee(in.file, out.file, len, flags);
1951 0 : fdput(out);
1952 : }
1953 0 : fdput(in);
1954 : }
1955 :
1956 0 : return error;
1957 : }
|