LCOV - code coverage report
Current view: top level - fs - pipe.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023 Lines: 427 657 65.0 %
Date: 2023-07-31 20:08:12 Functions: 23 44 52.3 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  linux/fs/pipe.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
       6             :  */
       7             : 
       8             : #include <linux/mm.h>
       9             : #include <linux/file.h>
      10             : #include <linux/poll.h>
      11             : #include <linux/slab.h>
      12             : #include <linux/module.h>
      13             : #include <linux/init.h>
      14             : #include <linux/fs.h>
      15             : #include <linux/log2.h>
      16             : #include <linux/mount.h>
      17             : #include <linux/pseudo_fs.h>
      18             : #include <linux/magic.h>
      19             : #include <linux/pipe_fs_i.h>
      20             : #include <linux/uio.h>
      21             : #include <linux/highmem.h>
      22             : #include <linux/pagemap.h>
      23             : #include <linux/audit.h>
      24             : #include <linux/syscalls.h>
      25             : #include <linux/fcntl.h>
      26             : #include <linux/memcontrol.h>
      27             : #include <linux/watch_queue.h>
      28             : #include <linux/sysctl.h>
      29             : 
      30             : #include <linux/uaccess.h>
      31             : #include <asm/ioctls.h>
      32             : 
      33             : #include "internal.h"
      34             : 
      35             : /*
      36             :  * New pipe buffers will be restricted to this size while the user is exceeding
      37             :  * their pipe buffer quota. The general pipe use case needs at least two
      38             :  * buffers: one for data yet to be read, and one for new data. If this is less
      39             :  * than two, then a write to a non-empty pipe may block even if the pipe is not
      40             :  * full. This can occur with GNU make jobserver or similar uses of pipes as
      41             :  * semaphores: multiple processes may be waiting to write tokens back to the
      42             :  * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
      43             :  *
      44             :  * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
      45             :  * own risk, namely: pipe writes to non-full pipes may block until the pipe is
      46             :  * emptied.
      47             :  */
      48             : #define PIPE_MIN_DEF_BUFFERS 2
      49             : 
      50             : /*
      51             :  * The max size that a non-root user is allowed to grow the pipe. Can
      52             :  * be set by root in /proc/sys/fs/pipe-max-size
      53             :  */
      54             : static unsigned int pipe_max_size = 1048576;
      55             : 
      56             : /* Maximum allocatable pages per user. Hard limit is unset by default, soft
      57             :  * matches default values.
      58             :  */
      59             : static unsigned long pipe_user_pages_hard;
      60             : static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
      61             : 
      62             : /*
      63             :  * We use head and tail indices that aren't masked off, except at the point of
      64             :  * dereference, but rather they're allowed to wrap naturally.  This means there
      65             :  * isn't a dead spot in the buffer, but the ring has to be a power of two and
      66             :  * <= 2^31.
      67             :  * -- David Howells 2019-09-23.
      68             :  *
      69             :  * Reads with count = 0 should always return 0.
      70             :  * -- Julian Bradfield 1999-06-07.
      71             :  *
      72             :  * FIFOs and Pipes now generate SIGIO for both readers and writers.
      73             :  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
      74             :  *
      75             :  * pipe_read & write cleanup
      76             :  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
      77             :  */
      78             : 
      79             : static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
      80             : {
      81    20678167 :         if (pipe->files)
      82     9718882 :                 mutex_lock_nested(&pipe->mutex, subclass);
      83             : }
      84             : 
      85    18210667 : void pipe_lock(struct pipe_inode_info *pipe)
      86             : {
      87             :         /*
      88             :          * pipe_lock() nests non-pipe inode locks (for writing to a file)
      89             :          */
      90    18210667 :         pipe_lock_nested(pipe, I_MUTEX_PARENT);
      91    18211191 : }
      92             : EXPORT_SYMBOL(pipe_lock);
      93             : 
      94    18336230 : void pipe_unlock(struct pipe_inode_info *pipe)
      95             : {
      96    18336230 :         if (pipe->files)
      97     9719114 :                 mutex_unlock(&pipe->mutex);
      98    18336685 : }
      99             : EXPORT_SYMBOL(pipe_unlock);
     100             : 
     101             : static inline void __pipe_lock(struct pipe_inode_info *pipe)
     102             : {
     103  2187388090 :         mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
     104             : }
     105             : 
     106             : static inline void __pipe_unlock(struct pipe_inode_info *pipe)
     107             : {
     108  2187445221 :         mutex_unlock(&pipe->mutex);
     109             : }
     110             : 
     111       62661 : void pipe_double_lock(struct pipe_inode_info *pipe1,
     112             :                       struct pipe_inode_info *pipe2)
     113             : {
     114       62661 :         BUG_ON(pipe1 == pipe2);
     115             : 
     116       62661 :         if (pipe1 < pipe2) {
     117       26047 :                 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
     118       26047 :                 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
     119             :         } else {
     120       36614 :                 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
     121       36614 :                 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
     122             :         }
     123       62661 : }
     124             : 
     125   107265310 : static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
     126             :                                   struct pipe_buffer *buf)
     127             : {
     128   107265310 :         struct page *page = buf->page;
     129             : 
     130             :         /*
     131             :          * If nobody else uses this page, and we don't already have a
     132             :          * temporary page, let's keep track of it as a one-deep
     133             :          * allocation cache. (Otherwise just release our reference to it)
     134             :          */
     135   107265310 :         if (page_count(page) == 1 && !pipe->tmp_page)
     136    95196532 :                 pipe->tmp_page = page;
     137             :         else
     138    12064581 :                 put_page(page);
     139   107268217 : }
     140             : 
     141           0 : static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
     142             :                 struct pipe_buffer *buf)
     143             : {
     144           0 :         struct page *page = buf->page;
     145             : 
     146           0 :         if (page_count(page) != 1)
     147             :                 return false;
     148           0 :         memcg_kmem_uncharge_page(page, 0);
     149           0 :         __SetPageLocked(page);
     150           0 :         return true;
     151             : }
     152             : 
     153             : /**
     154             :  * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
     155             :  * @pipe:       the pipe that the buffer belongs to
     156             :  * @buf:        the buffer to attempt to steal
     157             :  *
     158             :  * Description:
     159             :  *      This function attempts to steal the &struct page attached to
     160             :  *      @buf. If successful, this function returns true with the page
     161             :  *      locked; otherwise it returns false. The caller may then reuse the
     162             :  *      page for whatever purpose it wishes; the typical use is insertion
     163             :  *      into a different file page cache.
     164             :  */
     165           0 : bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
     166             :                 struct pipe_buffer *buf)
     167             : {
     168           0 :         struct page *page = buf->page;
     169             : 
     170             :         /*
     171             :          * A reference count of one is golden: it means that the owner of
     172             :          * this page is the only one holding a reference to it. Lock the
     173             :          * page and return true.
     174             :          */
     175           0 :         if (page_count(page) == 1) {
     176           0 :                 lock_page(page);
     177           0 :                 return true;
     178             :         }
     179             :         return false;
     180             : }
     181             : EXPORT_SYMBOL(generic_pipe_buf_try_steal);
     182             : 
     183             : /**
     184             :  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
     185             :  * @pipe:       the pipe that the buffer belongs to
     186             :  * @buf:        the buffer to get a reference to
     187             :  *
     188             :  * Description:
     189             :  *      This function grabs an extra reference to @buf. It's used in
     190             :  *      the tee() system call, when we duplicate the buffers in one
     191             :  *      pipe into another.
     192             :  */
     193           0 : bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
     194             : {
     195           0 :         return try_get_page(buf->page);
     196             : }
     197             : EXPORT_SYMBOL(generic_pipe_buf_get);
     198             : 
     199             : /**
     200             :  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
     201             :  * @pipe:       the pipe that the buffer belongs to
     202             :  * @buf:        the buffer to put a reference to
     203             :  *
     204             :  * Description:
     205             :  *      This function releases a reference to @buf.
     206             :  */
     207     6198888 : void generic_pipe_buf_release(struct pipe_inode_info *pipe,
     208             :                               struct pipe_buffer *buf)
     209             : {
     210     6198888 :         put_page(buf->page);
     211     6198888 : }
     212             : EXPORT_SYMBOL(generic_pipe_buf_release);
     213             : 
     214             : static const struct pipe_buf_operations anon_pipe_buf_ops = {
     215             :         .release        = anon_pipe_buf_release,
     216             :         .try_steal      = anon_pipe_buf_try_steal,
     217             :         .get            = generic_pipe_buf_get,
     218             : };
     219             : 
     220             : /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
     221             : static inline bool pipe_readable(const struct pipe_inode_info *pipe)
     222             : {
     223   300390551 :         unsigned int head = READ_ONCE(pipe->head);
     224   300390551 :         unsigned int tail = READ_ONCE(pipe->tail);
     225   300390551 :         unsigned int writers = READ_ONCE(pipe->writers);
     226             : 
     227   210845942 :         return !pipe_empty(head, tail) || !writers;
     228             : }
     229             : 
     230             : static ssize_t
     231  1929543203 : pipe_read(struct kiocb *iocb, struct iov_iter *to)
     232             : {
     233  1929543203 :         size_t total_len = iov_iter_count(to);
     234  1929543203 :         struct file *filp = iocb->ki_filp;
     235  1929543203 :         struct pipe_inode_info *pipe = filp->private_data;
     236  1929543203 :         bool was_full, wake_next_reader = false;
     237  1929543203 :         ssize_t ret;
     238             : 
     239             :         /* Null read succeeds. */
     240  1929543203 :         if (unlikely(total_len == 0))
     241             :                 return 0;
     242             : 
     243  1929543203 :         ret = 0;
     244  1929543203 :         __pipe_lock(pipe);
     245             : 
     246             :         /*
     247             :          * We only wake up writers if the pipe was full when we started
     248             :          * reading in order to avoid unnecessary wakeups.
     249             :          *
     250             :          * But when we do wake up writers, we do so using a sync wakeup
     251             :          * (WF_SYNC), because we want them to get going and generate more
     252             :          * data for us.
     253             :          */
     254  1929555525 :         was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
     255  2028511223 :         for (;;) {
     256             :                 /* Read ->head with a barrier vs post_one_notification() */
     257  2028511223 :                 unsigned int head = smp_load_acquire(&pipe->head);
     258  2028493260 :                 unsigned int tail = pipe->tail;
     259  2028493260 :                 unsigned int mask = pipe->ring_size - 1;
     260             : 
     261             : #ifdef CONFIG_WATCH_QUEUE
     262             :                 if (pipe->note_loss) {
     263             :                         struct watch_notification n;
     264             : 
     265             :                         if (total_len < 8) {
     266             :                                 if (ret == 0)
     267             :                                         ret = -ENOBUFS;
     268             :                                 break;
     269             :                         }
     270             : 
     271             :                         n.type = WATCH_TYPE_META;
     272             :                         n.subtype = WATCH_META_LOSS_NOTIFICATION;
     273             :                         n.info = watch_sizeof(n);
     274             :                         if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
     275             :                                 if (ret == 0)
     276             :                                         ret = -EFAULT;
     277             :                                 break;
     278             :                         }
     279             :                         ret += sizeof(n);
     280             :                         total_len -= sizeof(n);
     281             :                         pipe->note_loss = false;
     282             :                 }
     283             : #endif
     284             : 
     285  2028493260 :                 if (!pipe_empty(head, tail)) {
     286  1913479679 :                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
     287  1913479679 :                         size_t chars = buf->len;
     288  1913479679 :                         size_t written;
     289  1913479679 :                         int error;
     290             : 
     291  1913479679 :                         if (chars > total_len) {
     292  1818250604 :                                 if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
     293           0 :                                         if (ret == 0)
     294           0 :                                                 ret = -ENOBUFS;
     295             :                                         break;
     296             :                                 }
     297             :                                 chars = total_len;
     298             :                         }
     299             : 
     300  1913479679 :                         error = pipe_buf_confirm(pipe, buf);
     301         455 :                         if (error) {
     302           0 :                                 if (!ret)
     303           0 :                                         ret = error;
     304             :                                 break;
     305             :                         }
     306             : 
     307  1913479679 :                         written = copy_page_to_iter(buf->page, buf->offset, chars, to);
     308  1913500235 :                         if (unlikely(written < chars)) {
     309           0 :                                 if (!ret)
     310           0 :                                         ret = -EFAULT;
     311             :                                 break;
     312             :                         }
     313  1913500235 :                         ret += chars;
     314  1913500235 :                         buf->offset += chars;
     315  1913500235 :                         buf->len -= chars;
     316             : 
     317             :                         /* Was it a packet buffer? Clean up and exit */
     318  1913500235 :                         if (buf->flags & PIPE_BUF_FLAG_PACKET) {
     319           0 :                                 total_len = chars;
     320           0 :                                 buf->len = 0;
     321             :                         }
     322             : 
     323  1913500235 :                         if (!buf->len) {
     324    95249825 :                                 pipe_buf_release(pipe, buf);
     325    95241797 :                                 spin_lock_irq(&pipe->rd_wait.lock);
     326             : #ifdef CONFIG_WATCH_QUEUE
     327             :                                 if (buf->flags & PIPE_BUF_FLAG_LOSS)
     328             :                                         pipe->note_loss = true;
     329             : #endif
     330    95258751 :                                 tail++;
     331    95258751 :                                 pipe->tail = tail;
     332    95258751 :                                 spin_unlock_irq(&pipe->rd_wait.lock);
     333             :                         }
     334  1913520695 :                         total_len -= chars;
     335  1913520695 :                         if (!total_len)
     336             :                                 break;  /* common path: read succeeded */
     337    78225479 :                         if (!pipe_empty(head, tail))    /* More to do? */
     338     1142730 :                                 continue;
     339             :                 }
     340             : 
     341   192096330 :                 if (!pipe->writers)
     342             :                         break;
     343   168835545 :                 if (ret)
     344             :                         break;
     345    97877724 :                 if ((filp->f_flags & O_NONBLOCK) ||
     346    97853355 :                     (iocb->ki_flags & IOCB_NOWAIT)) {
     347             :                         ret = -EAGAIN;
     348             :                         break;
     349             :                 }
     350    97853355 :                 __pipe_unlock(pipe);
     351             : 
     352             :                 /*
     353             :                  * We only get here if we didn't actually read anything.
     354             :                  *
     355             :                  * However, we could have seen (and removed) a zero-sized
     356             :                  * pipe buffer, and might have made space in the buffers
     357             :                  * that way.
     358             :                  *
     359             :                  * You can't make zero-sized pipe buffers by doing an empty
     360             :                  * write (not even in packet mode), but they can happen if
     361             :                  * the writer gets an EFAULT when trying to fill a buffer
     362             :                  * that already got allocated and inserted in the buffer
     363             :                  * array.
     364             :                  *
     365             :                  * So we still need to wake up any pending writers in the
     366             :                  * _very_ unlikely case that the pipe was full, but we got
     367             :                  * no data.
     368             :                  */
     369    97858666 :                 if (unlikely(was_full))
     370           0 :                         wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     371    97858666 :                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     372             : 
     373             :                 /*
     374             :                  * But because we didn't read anything, at this point we can
     375             :                  * just return directly with -ERESTARTSYS if we're interrupted,
     376             :                  * since we've done any required wakeups and there's no need
     377             :                  * to mark anything accessed. And we've dropped the lock.
     378             :                  */
     379   391307813 :                 if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
     380             :                         return -ERESTARTSYS;
     381             : 
     382    97779635 :                 __pipe_lock(pipe);
     383    97812968 :                 was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
     384    97812968 :                 wake_next_reader = true;
     385             :         }
     386  1929538191 :         if (pipe_empty(pipe->head, pipe->tail))
     387   109318023 :                 wake_next_reader = false;
     388  1929538191 :         __pipe_unlock(pipe);
     389             : 
     390  1929545106 :         if (was_full)
     391  1060204188 :                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     392  1929545107 :         if (wake_next_reader)
     393     1099136 :                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     394  1929545106 :         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     395  1929528828 :         if (ret > 0)
     396  1912367883 :                 file_accessed(filp);
     397             :         return ret;
     398             : }
     399             : 
     400             : static inline int is_packetized(struct file *file)
     401             : {
     402   107284305 :         return (file->f_flags & O_DIRECT) != 0;
     403             : }
     404             : 
     405             : /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
     406             : static inline bool pipe_writable(const struct pipe_inode_info *pipe)
     407             : {
     408   516340508 :         unsigned int head = READ_ONCE(pipe->head);
     409   516340508 :         unsigned int tail = READ_ONCE(pipe->tail);
     410   516340508 :         unsigned int max_usage = READ_ONCE(pipe->max_usage);
     411             : 
     412  1547975195 :         return !pipe_full(head, tail, max_usage) ||
     413   515294179 :                 !READ_ONCE(pipe->readers);
     414             : }
     415             : 
     416             : static ssize_t
     417   114621125 : pipe_write(struct kiocb *iocb, struct iov_iter *from)
     418             : {
     419   114621125 :         struct file *filp = iocb->ki_filp;
     420   114621125 :         struct pipe_inode_info *pipe = filp->private_data;
     421   114621125 :         unsigned int head;
     422   114621125 :         ssize_t ret = 0;
     423   114621125 :         size_t total_len = iov_iter_count(from);
     424   114621125 :         ssize_t chars;
     425   114621125 :         bool was_empty = false;
     426   114621125 :         bool wake_next_writer = false;
     427             : 
     428             :         /* Null write succeeds. */
     429   114621125 :         if (unlikely(total_len == 0))
     430             :                 return 0;
     431             : 
     432   114621125 :         __pipe_lock(pipe);
     433             : 
     434   114619029 :         if (!pipe->readers) {
     435        2146 :                 send_sig(SIGPIPE, current, 0);
     436        2146 :                 ret = -EPIPE;
     437        2146 :                 goto out;
     438             :         }
     439             : 
     440             : #ifdef CONFIG_WATCH_QUEUE
     441             :         if (pipe->watch_queue) {
     442             :                 ret = -EXDEV;
     443             :                 goto out;
     444             :         }
     445             : #endif
     446             : 
     447             :         /*
     448             :          * If it wasn't empty we try to merge new data into
     449             :          * the last buffer.
     450             :          *
     451             :          * That naturally merges small writes, but it also
     452             :          * page-aligns the rest of the writes for large writes
     453             :          * spanning multiple pages.
     454             :          */
     455   114616883 :         head = pipe->head;
     456   114616883 :         was_empty = pipe_empty(head, pipe->tail);
     457   114616883 :         chars = total_len & (PAGE_SIZE-1);
     458   114616883 :         if (chars && !was_empty) {
     459    18431102 :                 unsigned int mask = pipe->ring_size - 1;
     460    18431102 :                 struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
     461    18431102 :                 int offset = buf->offset + buf->len;
     462             : 
     463    18431102 :                 if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
     464    18431038 :                     offset + chars <= PAGE_SIZE) {
     465    17724940 :                         ret = pipe_buf_confirm(pipe, buf);
     466    17724941 :                         if (ret)
     467           0 :                                 goto out;
     468             : 
     469    17724941 :                         ret = copy_page_from_iter(buf->page, offset, chars, from);
     470    17724666 :                         if (unlikely(ret < chars)) {
     471           0 :                                 ret = -EFAULT;
     472           0 :                                 goto out;
     473             :                         }
     474             : 
     475    17724666 :                         buf->len += ret;
     476    17724666 :                         if (!iov_iter_count(from))
     477    17137492 :                                 goto out;
     478             :                 }
     479             :         }
     480             : 
     481   108317352 :         for (;;) {
     482   108317352 :                 if (!pipe->readers) {
     483         456 :                         send_sig(SIGPIPE, current, 0);
     484         456 :                         if (!ret)
     485         456 :                                 ret = -EPIPE;
     486             :                         break;
     487             :                 }
     488             : 
     489   108316896 :                 head = pipe->head;
     490   108316896 :                 if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
     491   107297082 :                         unsigned int mask = pipe->ring_size - 1;
     492   107297082 :                         struct pipe_buffer *buf = &pipe->bufs[head & mask];
     493   107297082 :                         struct page *page = pipe->tmp_page;
     494   107297082 :                         int copied;
     495             : 
     496   107297082 :                         if (!page) {
     497    26292301 :                                 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
     498    26291085 :                                 if (unlikely(!page)) {
     499           0 :                                         ret = ret ? : -ENOMEM;
     500             :                                         break;
     501             :                                 }
     502    26291085 :                                 pipe->tmp_page = page;
     503             :                         }
     504             : 
     505             :                         /* Allocate a slot in the ring in advance and attach an
     506             :                          * empty buffer.  If we fault or otherwise fail to use
     507             :                          * it, either the reader will consume it or it'll still
     508             :                          * be there for the next write.
     509             :                          */
     510   107295866 :                         spin_lock_irq(&pipe->rd_wait.lock);
     511             : 
     512   107298400 :                         head = pipe->head;
     513   107298400 :                         if (pipe_full(head, pipe->tail, pipe->max_usage)) {
     514           0 :                                 spin_unlock_irq(&pipe->rd_wait.lock);
     515           0 :                                 continue;
     516             :                         }
     517             : 
     518   107298400 :                         pipe->head = head + 1;
     519   107298400 :                         spin_unlock_irq(&pipe->rd_wait.lock);
     520             : 
     521             :                         /* Insert it into the buffer array */
     522   107284305 :                         buf = &pipe->bufs[head & mask];
     523   107284305 :                         buf->page = page;
     524   107284305 :                         buf->ops = &anon_pipe_buf_ops;
     525   107284305 :                         buf->offset = 0;
     526   107284305 :                         buf->len = 0;
     527   107284305 :                         if (is_packetized(filp))
     528           0 :                                 buf->flags = PIPE_BUF_FLAG_PACKET;
     529             :                         else
     530   107284305 :                                 buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
     531   107284305 :                         pipe->tmp_page = NULL;
     532             : 
     533   107284305 :                         copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
     534   107298890 :                         if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
     535           0 :                                 if (!ret)
     536           0 :                                         ret = -EFAULT;
     537             :                                 break;
     538             :                         }
     539   107298890 :                         ret += copied;
     540   107298890 :                         buf->offset = 0;
     541   107298890 :                         buf->len = copied;
     542             : 
     543   107298890 :                         if (!iov_iter_count(from))
     544             :                                 break;
     545             :                 }
     546             : 
     547    10849483 :                 if (!pipe_full(head, pipe->tail, pipe->max_usage))
     548     9821671 :                         continue;
     549             : 
     550             :                 /* Wait for buffer space to become available. */
     551     1027812 :                 if ((filp->f_flags & O_NONBLOCK) ||
     552     1027812 :                     (iocb->ki_flags & IOCB_NOWAIT)) {
     553           0 :                         if (!ret)
     554           0 :                                 ret = -EAGAIN;
     555             :                         break;
     556             :                 }
     557     1027812 :                 if (signal_pending(current)) {
     558           0 :                         if (!ret)
     559           0 :                                 ret = -ERESTARTSYS;
     560             :                         break;
     561             :                 }
     562             : 
     563             :                 /*
     564             :                  * We're going to release the pipe lock and wait for more
     565             :                  * space. We wake up any readers if necessary, and then
     566             :                  * after waiting we need to re-check whether the pipe
     567             :                  * became empty while we dropped the lock.
     568             :                  */
     569     1016715 :                 __pipe_unlock(pipe);
     570     1016780 :                 if (was_empty)
     571        7976 :                         wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     572     1016780 :                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     573   517252151 :                 wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
     574     1016312 :                 __pipe_lock(pipe);
     575     1016564 :                 was_empty = pipe_empty(pipe->head, pipe->tail);
     576     1016564 :                 wake_next_writer = true;
     577             :         }
     578    97469221 : out:
     579   114609315 :         if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
     580      455689 :                 wake_next_writer = false;
     581   114609315 :         __pipe_unlock(pipe);
     582             : 
     583             :         /*
     584             :          * If we do issue a wakeup event, we do a 'sync' wakeup, because we
     585             :          * want the reader to start processing things asap, rather than
     586             :          * leave the data pending.
     587             :          *
     588             :          * This is particularly important for small writes, because of
     589             :          * how (for example) the GNU make jobserver uses small writes to
     590             :          * wake up pending jobs.
     591             :          *
     592             :          * Epoll nonsensically wants a wakeup whether the pipe
     593             :          * was already empty or not.
     594             :          */
     595   114608960 :         if (was_empty || pipe->poll_usage)
     596    95814383 :                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     597   114589945 :         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     598   114575155 :         if (wake_next_writer)
     599      661232 :                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     600   114575262 :         if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
     601   114601380 :                 int err = file_update_time(filp);
     602   114577798 :                 if (err)
     603           0 :                         ret = err;
     604   114577798 :                 sb_end_write(file_inode(filp)->i_sb);
     605             :         }
     606             :         return ret;
     607             : }
     608             : 
     609     4964219 : static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
     610             : {
     611     4964219 :         struct pipe_inode_info *pipe = filp->private_data;
     612     4964219 :         unsigned int count, head, tail, mask;
     613             : 
     614     4964219 :         switch (cmd) {
     615             :         case FIONREAD:
     616     1739347 :                 __pipe_lock(pipe);
     617     1739347 :                 count = 0;
     618     1739347 :                 head = pipe->head;
     619     1739347 :                 tail = pipe->tail;
     620     1739347 :                 mask = pipe->ring_size - 1;
     621             : 
     622     2608876 :                 while (tail != head) {
     623      869529 :                         count += pipe->bufs[tail & mask].len;
     624      869529 :                         tail++;
     625             :                 }
     626     1739347 :                 __pipe_unlock(pipe);
     627             : 
     628     1739347 :                 return put_user(count, (int __user *)arg);
     629             : 
     630             : #ifdef CONFIG_WATCH_QUEUE
     631             :         case IOC_WATCH_QUEUE_SET_SIZE: {
     632             :                 int ret;
     633             :                 __pipe_lock(pipe);
     634             :                 ret = watch_queue_set_size(pipe, arg);
     635             :                 __pipe_unlock(pipe);
     636             :                 return ret;
     637             :         }
     638             : 
     639             :         case IOC_WATCH_QUEUE_SET_FILTER:
     640             :                 return watch_queue_set_filter(
     641             :                         pipe, (struct watch_notification_filter __user *)arg);
     642             : #endif
     643             : 
     644             :         default:
     645             :                 return -ENOIOCTLCMD;
     646             :         }
     647             : }
     648             : 
     649             : /* No kernel lock held - fine */
     650             : static __poll_t
     651     2025939 : pipe_poll(struct file *filp, poll_table *wait)
     652             : {
     653     2025939 :         __poll_t mask;
     654     2025939 :         struct pipe_inode_info *pipe = filp->private_data;
     655     2025939 :         unsigned int head, tail;
     656             : 
     657             :         /* Epoll has some historical nasty semantics, this enables them */
     658     2025939 :         WRITE_ONCE(pipe->poll_usage, true);
     659             : 
     660             :         /*
     661             :          * Reading pipe state only -- no need for acquiring the pipe mutex.
     662             :          *
     663             :          * But because this is racy, the code has to add the
     664             :          * entry to the poll table _first_ ..
     665             :          */
     666     2025939 :         if (filp->f_mode & FMODE_READ)
     667     2005709 :                 poll_wait(filp, &pipe->rd_wait, wait);
     668     2025938 :         if (filp->f_mode & FMODE_WRITE)
     669      203582 :                 poll_wait(filp, &pipe->wr_wait, wait);
     670             : 
     671             :         /*
     672             :          * .. and only then can you do the racy tests. That way,
     673             :          * if something changes and you got it wrong, the poll
     674             :          * table entry will wake you up and fix it.
     675             :          */
     676     2025924 :         head = READ_ONCE(pipe->head);
     677     2025924 :         tail = READ_ONCE(pipe->tail);
     678             : 
     679     2025924 :         mask = 0;
     680     2025924 :         if (filp->f_mode & FMODE_READ) {
     681     2005708 :                 if (!pipe_empty(head, tail))
     682       56068 :                         mask |= EPOLLIN | EPOLLRDNORM;
     683     2005708 :                 if (!pipe->writers && filp->f_version != pipe->w_counter)
     684      180524 :                         mask |= EPOLLHUP;
     685             :         }
     686             : 
     687     2025924 :         if (filp->f_mode & FMODE_WRITE) {
     688      203579 :                 if (!pipe_full(head, tail, pipe->max_usage))
     689      203575 :                         mask |= EPOLLOUT | EPOLLWRNORM;
     690             :                 /*
     691             :                  * Most Unices do not set EPOLLERR for FIFOs but on Linux they
     692             :                  * behave exactly like pipes for poll().
     693             :                  */
     694      203579 :                 if (!pipe->readers)
     695           0 :                         mask |= EPOLLERR;
     696             :         }
     697             : 
     698     2025924 :         return mask;
     699             : }
     700             : 
     701    41845864 : static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
     702             : {
     703    41845864 :         int kill = 0;
     704             : 
     705    41845864 :         spin_lock(&inode->i_lock);
     706    41848047 :         if (!--pipe->files) {
     707    20553336 :                 inode->i_pipe = NULL;
     708    20553336 :                 kill = 1;
     709             :         }
     710    41848047 :         spin_unlock(&inode->i_lock);
     711             : 
     712    41850700 :         if (kill)
     713    20556880 :                 free_pipe_info(pipe);
     714    41851658 : }
     715             : 
     716             : static int
     717    41848016 : pipe_release(struct inode *inode, struct file *file)
     718             : {
     719    41848016 :         struct pipe_inode_info *pipe = file->private_data;
     720             : 
     721    41848016 :         __pipe_lock(pipe);
     722    41851665 :         if (file->f_mode & FMODE_READ)
     723    21294237 :                 pipe->readers--;
     724    41851665 :         if (file->f_mode & FMODE_WRITE)
     725    20558672 :                 pipe->writers--;
     726             : 
     727             :         /* Was that the last reader or writer, but not the other side? */
     728    41851665 :         if (!pipe->readers != !pipe->writers) {
     729    21291950 :                 wake_up_interruptible_all(&pipe->rd_wait);
     730    21290966 :                 wake_up_interruptible_all(&pipe->wr_wait);
     731    21288039 :                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     732    21287260 :                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     733             :         }
     734    41847846 :         __pipe_unlock(pipe);
     735             : 
     736    41847361 :         put_pipe_info(inode, pipe);
     737    41851746 :         return 0;
     738             : }
     739             : 
     740             : static int
     741           0 : pipe_fasync(int fd, struct file *filp, int on)
     742             : {
     743           0 :         struct pipe_inode_info *pipe = filp->private_data;
     744           0 :         int retval = 0;
     745             : 
     746           0 :         __pipe_lock(pipe);
     747           0 :         if (filp->f_mode & FMODE_READ)
     748           0 :                 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
     749           0 :         if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
     750           0 :                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
     751           0 :                 if (retval < 0 && (filp->f_mode & FMODE_READ))
     752             :                         /* this can happen only if on is true (nonzero) */
     753           0 :                         fasync_helper(-1, filp, 0, &pipe->fasync_readers);
     754             :         }
     755           0 :         __pipe_unlock(pipe);
     756           0 :         return retval;
     757             : }
     758             : 
     759           0 : unsigned long account_pipe_buffers(struct user_struct *user,
     760             :                                    unsigned long old, unsigned long new)
     761             : {
     762           0 :         return atomic_long_add_return(new - old, &user->pipe_bufs);
     763             : }
     764             : 
     765           0 : bool too_many_pipe_buffers_soft(unsigned long user_bufs)
     766             : {
     767    20605192 :         unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
     768             : 
     769    20605192 :         return soft_limit && user_bufs > soft_limit;
     770             : }
     771             : 
     772           0 : bool too_many_pipe_buffers_hard(unsigned long user_bufs)
     773             : {
     774    20605192 :         unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
     775             : 
     776    20605192 :         return hard_limit && user_bufs > hard_limit;
     777             : }
     778             : 
     779           0 : bool pipe_is_unprivileged_user(void)
     780             : {
     781           0 :         return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
     782             : }
     783             : 
     784    20601625 : struct pipe_inode_info *alloc_pipe_info(void)
     785             : {
     786    20601625 :         struct pipe_inode_info *pipe;
     787    20601625 :         unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
     788    20601625 :         struct user_struct *user = get_current_user();
     789    20601500 :         unsigned long user_bufs;
     790    20601500 :         unsigned int max_size = READ_ONCE(pipe_max_size);
     791             : 
     792    20601500 :         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
     793    20605392 :         if (pipe == NULL)
     794           0 :                 goto out_free_uid;
     795             : 
     796    20605392 :         if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
     797           0 :                 pipe_bufs = max_size >> PAGE_SHIFT;
     798             : 
     799    20605392 :         user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
     800             : 
     801    20605192 :         if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
     802           0 :                 user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
     803           0 :                 pipe_bufs = PIPE_MIN_DEF_BUFFERS;
     804             :         }
     805             : 
     806    20605192 :         if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
     807           0 :                 goto out_revert_acct;
     808             : 
     809    20605192 :         pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
     810             :                              GFP_KERNEL_ACCOUNT);
     811             : 
     812    20605841 :         if (pipe->bufs) {
     813    20605841 :                 init_waitqueue_head(&pipe->rd_wait);
     814    20604175 :                 init_waitqueue_head(&pipe->wr_wait);
     815    20603733 :                 pipe->r_counter = pipe->w_counter = 1;
     816    20603733 :                 pipe->max_usage = pipe_bufs;
     817    20603733 :                 pipe->ring_size = pipe_bufs;
     818    20603733 :                 pipe->nr_accounted = pipe_bufs;
     819    20603733 :                 pipe->user = user;
     820    20603733 :                 mutex_init(&pipe->mutex);
     821    20603733 :                 return pipe;
     822             :         }
     823             : 
     824           0 : out_revert_acct:
     825           0 :         (void) account_pipe_buffers(user, pipe_bufs, 0);
     826           0 :         kfree(pipe);
     827           0 : out_free_uid:
     828           0 :         free_uid(user);
     829           0 :         return NULL;
     830             : }
     831             : 
     832    20603704 : void free_pipe_info(struct pipe_inode_info *pipe)
     833             : {
     834    20603704 :         unsigned int i;
     835             : 
     836             : #ifdef CONFIG_WATCH_QUEUE
     837             :         if (pipe->watch_queue)
     838             :                 watch_queue_clear(pipe->watch_queue);
     839             : #endif
     840             : 
     841    20603704 :         (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
     842    20604119 :         free_uid(pipe->user);
     843   370866444 :         for (i = 0; i < pipe->ring_size; i++) {
     844   329655713 :                 struct pipe_buffer *buf = pipe->bufs + i;
     845   329655713 :                 if (buf->ops)
     846      564869 :                         pipe_buf_release(pipe, buf);
     847             :         }
     848             : #ifdef CONFIG_WATCH_QUEUE
     849             :         if (pipe->watch_queue)
     850             :                 put_watch_queue(pipe->watch_queue);
     851             : #endif
     852    20606612 :         if (pipe->tmp_page)
     853    14221290 :                 __free_page(pipe->tmp_page);
     854    20605242 :         kfree(pipe->bufs);
     855    20605572 :         kfree(pipe);
     856    20606531 : }
     857             : 
     858             : static struct vfsmount *pipe_mnt __read_mostly;
     859             : 
     860             : /*
     861             :  * pipefs_dname() is called from d_path().
     862             :  */
     863           0 : static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
     864             : {
     865           0 :         return dynamic_dname(buffer, buflen, "pipe:[%lu]",
     866             :                                 d_inode(dentry)->i_ino);
     867             : }
     868             : 
     869             : static const struct dentry_operations pipefs_dentry_operations = {
     870             :         .d_dname        = pipefs_dname,
     871             : };
     872             : 
     873    20511605 : static struct inode * get_pipe_inode(void)
     874             : {
     875    20511605 :         struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
     876    20520338 :         struct pipe_inode_info *pipe;
     877             : 
     878    20520338 :         if (!inode)
     879           0 :                 goto fail_inode;
     880             : 
     881    20520338 :         inode->i_ino = get_next_ino();
     882             : 
     883    20515021 :         pipe = alloc_pipe_info();
     884    20520249 :         if (!pipe)
     885           0 :                 goto fail_iput;
     886             : 
     887    20520249 :         inode->i_pipe = pipe;
     888    20520249 :         pipe->files = 2;
     889    20520249 :         pipe->readers = pipe->writers = 1;
     890    20520249 :         inode->i_fop = &pipefifo_fops;
     891             : 
     892             :         /*
     893             :          * Mark the inode dirty from the very beginning,
     894             :          * that way it will never be moved to the dirty
     895             :          * list because "mark_inode_dirty()" will think
     896             :          * that it already _is_ on the dirty list.
     897             :          */
     898    20520249 :         inode->i_state = I_DIRTY;
     899    20520249 :         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
     900    20520249 :         inode->i_uid = current_fsuid();
     901    20520249 :         inode->i_gid = current_fsgid();
     902    20520249 :         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
     903             : 
     904    20516827 :         return inode;
     905             : 
     906             : fail_iput:
     907           0 :         iput(inode);
     908             : 
     909             : fail_inode:
     910             :         return NULL;
     911             : }
     912             : 
     913    20509747 : int create_pipe_files(struct file **res, int flags)
     914             : {
     915    20509747 :         struct inode *inode = get_pipe_inode();
     916    20517404 :         struct file *f;
     917    20517404 :         int error;
     918             : 
     919    20517404 :         if (!inode)
     920             :                 return -ENFILE;
     921             : 
     922    20517404 :         if (flags & O_NOTIFICATION_PIPE) {
     923           0 :                 error = watch_queue_init(inode->i_pipe);
     924           0 :                 if (error) {
     925           0 :                         free_pipe_info(inode->i_pipe);
     926           0 :                         iput(inode);
     927           0 :                         return error;
     928             :                 }
     929             :         }
     930             : 
     931    20517404 :         f = alloc_file_pseudo(inode, pipe_mnt, "",
     932    20517404 :                                 O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
     933             :                                 &pipefifo_fops);
     934    20516808 :         if (IS_ERR(f)) {
     935           0 :                 free_pipe_info(inode->i_pipe);
     936           0 :                 iput(inode);
     937           0 :                 return PTR_ERR(f);
     938             :         }
     939             : 
     940    20516808 :         f->private_data = inode->i_pipe;
     941             : 
     942    20516808 :         res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
     943             :                                   &pipefifo_fops);
     944    20519451 :         if (IS_ERR(res[0])) {
     945           0 :                 put_pipe_info(inode, inode->i_pipe);
     946           0 :                 fput(f);
     947           0 :                 return PTR_ERR(res[0]);
     948             :         }
     949    20519451 :         res[0]->private_data = inode->i_pipe;
     950    20519451 :         res[1] = f;
     951    20519451 :         stream_open(inode, res[0]);
     952    20514667 :         stream_open(inode, res[1]);
     953    20514667 :         return 0;
     954             : }
     955             : 
     956    20516691 : static int __do_pipe_flags(int *fd, struct file **files, int flags)
     957             : {
     958    20516691 :         int error;
     959    20516691 :         int fdw, fdr;
     960             : 
     961    20516691 :         if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
     962             :                 return -EINVAL;
     963             : 
     964    20516691 :         error = create_pipe_files(files, flags);
     965    20519729 :         if (error)
     966             :                 return error;
     967             : 
     968    20519151 :         error = get_unused_fd_flags(flags);
     969    20518920 :         if (error < 0)
     970           0 :                 goto err_read_pipe;
     971    20518920 :         fdr = error;
     972             : 
     973    20518920 :         error = get_unused_fd_flags(flags);
     974    20523227 :         if (error < 0)
     975           0 :                 goto err_fdr;
     976    20523227 :         fdw = error;
     977             : 
     978    20523227 :         audit_fd_pair(fdr, fdw);
     979    20523227 :         fd[0] = fdr;
     980    20523227 :         fd[1] = fdw;
     981             :         /* pipe groks IOCB_NOWAIT */
     982    20523227 :         files[0]->f_mode |= FMODE_NOWAIT;
     983    20523227 :         files[1]->f_mode |= FMODE_NOWAIT;
     984    20523227 :         return 0;
     985             : 
     986             :  err_fdr:
     987           0 :         put_unused_fd(fdr);
     988           0 :  err_read_pipe:
     989           0 :         fput(files[0]);
     990           0 :         fput(files[1]);
     991           0 :         return error;
     992             : }
     993             : 
     994           0 : int do_pipe_flags(int *fd, int flags)
     995             : {
     996           0 :         struct file *files[2];
     997           0 :         int error = __do_pipe_flags(fd, files, flags);
     998           0 :         if (!error) {
     999           0 :                 fd_install(fd[0], files[0]);
    1000           0 :                 fd_install(fd[1], files[1]);
    1001             :         }
    1002           0 :         return error;
    1003             : }
    1004             : 
    1005             : /*
    1006             :  * sys_pipe() is the normal C calling standard for creating
    1007             :  * a pipe. It's not the way Unix traditionally does this, though.
    1008             :  */
    1009    20515435 : static int do_pipe2(int __user *fildes, int flags)
    1010             : {
    1011    20515435 :         struct file *files[2];
    1012    20515435 :         int fd[2];
    1013    20515435 :         int error;
    1014             : 
    1015    20515435 :         error = __do_pipe_flags(fd, files, flags);
    1016    20522198 :         if (!error) {
    1017    20522218 :                 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
    1018           0 :                         fput(files[0]);
    1019           0 :                         fput(files[1]);
    1020           0 :                         put_unused_fd(fd[0]);
    1021           0 :                         put_unused_fd(fd[1]);
    1022           0 :                         error = -EFAULT;
    1023             :                 } else {
    1024    20519023 :                         fd_install(fd[0], files[0]);
    1025    20516120 :                         fd_install(fd[1], files[1]);
    1026             :                 }
    1027             :         }
    1028    20519827 :         return error;
    1029             : }
    1030             : 
    1031    41037365 : SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
    1032             : {
    1033    20515915 :         return do_pipe2(fildes, flags);
    1034             : }
    1035             : 
    1036           0 : SYSCALL_DEFINE1(pipe, int __user *, fildes)
    1037             : {
    1038           0 :         return do_pipe2(fildes, 0);
    1039             : }
    1040             : 
    1041             : /*
    1042             :  * This is the stupid "wait for pipe to be readable or writable"
    1043             :  * model.
    1044             :  *
    1045             :  * See pipe_read/write() for the proper kind of exclusive wait,
    1046             :  * but that requires that we wake up any other readers/writers
    1047             :  * if we then do not end up reading everything (i.e. the whole
    1048             :  * "wake_next_reader/writer" logic in pipe_read/write()).
    1049             :  */
    1050     2311891 : void pipe_wait_readable(struct pipe_inode_info *pipe)
    1051             : {
    1052     2311891 :         pipe_unlock(pipe);
    1053     9247512 :         wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
    1054     2311883 :         pipe_lock(pipe);
    1055     2312150 : }
    1056             : 
    1057       30295 : void pipe_wait_writable(struct pipe_inode_info *pipe)
    1058             : {
    1059       30295 :         pipe_unlock(pipe);
    1060      135160 :         wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
    1061       30295 :         pipe_lock(pipe);
    1062       30295 : }
    1063             : 
    1064             : /*
    1065             :  * This depends on both the wait (here) and the wakeup (wake_up_partner)
    1066             :  * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
    1067             :  * race with the count check and waitqueue prep.
    1068             :  *
    1069             :  * Normally in order to avoid races, you'd do the prepare_to_wait() first,
    1070             :  * then check the condition you're waiting for, and only then sleep. But
    1071             :  * because of the pipe lock, we can check the condition before being on
    1072             :  * the wait queue.
    1073             :  *
    1074             :  * We use the 'rd_wait' waitqueue for pipe partner waiting.
    1075             :  */
    1076           0 : static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
    1077             : {
    1078           0 :         DEFINE_WAIT(rdwait);
    1079           0 :         int cur = *cnt;
    1080             : 
    1081           0 :         while (cur == *cnt) {
    1082           0 :                 prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
    1083           0 :                 pipe_unlock(pipe);
    1084           0 :                 schedule();
    1085           0 :                 finish_wait(&pipe->rd_wait, &rdwait);
    1086           0 :                 pipe_lock(pipe);
    1087           0 :                 if (signal_pending(current))
    1088             :                         break;
    1089             :         }
    1090           0 :         return cur == *cnt ? -ERESTARTSYS : 0;
    1091             : }
    1092             : 
    1093             : static void wake_up_partner(struct pipe_inode_info *pipe)
    1094             : {
    1095       68309 :         wake_up_interruptible_all(&pipe->rd_wait);
    1096       68309 : }
    1097             : /*
                     :  * Open handler wired up as .open in pipefifo_fops.
                     :  *
                     :  * Finds the pipe_inode_info already attached to the inode, or allocates
                     :  * and installs a new one, and takes a reference on it via pipe->files.
                     :  * The reader/writer counts are then updated according to the open mode.
                     :  * For inodes whose superblock is not pipefs (is_pipe false) the open may
                     :  * wait for a partner, or fail, as described in the per-mode comments.
                     :  *
                     :  * Returns 0 on success, or:
                     :  *   -ENOMEM       alloc_pipe_info() failed
                     :  *   -ENXIO        write-only O_NONBLOCK open with no readers (non-pipefs)
                     :  *   -ERESTARTSYS  wait_for_partner() returned nonzero
                     :  *   -EINVAL       neither FMODE_READ nor FMODE_WRITE is set
                     :  */
    1098      805853 : static int fifo_open(struct inode *inode, struct file *filp)
    1099             : {
    1100      805853 :         struct pipe_inode_info *pipe;
    1101      805853 :         bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
    1102      805853 :         int ret;
    1103             :         /* Reset; the O_RDONLY + O_NONBLOCK case below may store w_counter here. */
    1104      805853 :         filp->f_version = 0;
    1105             :         /*
                     :          * Pin the pipe by bumping ->files under i_lock. If the inode has
                     :          * no pipe yet, allocate one with the lock dropped, then retake
                     :          * the lock and recheck in case another opener installed one first
                     :          * (in which case ours is freed and theirs is used).
                     :          */
    1106      805853 :         spin_lock(&inode->i_lock);
    1107      805853 :         if (inode->i_pipe) {
    1108      771032 :                 pipe = inode->i_pipe;
    1109      771032 :                 pipe->files++;
    1110      771032 :                 spin_unlock(&inode->i_lock);
    1111             :         } else {
    1112       34821 :                 spin_unlock(&inode->i_lock);
    1113       34821 :                 pipe = alloc_pipe_info();
    1114       34821 :                 if (!pipe)
    1115             :                         return -ENOMEM;
    1116       34821 :                 pipe->files = 1;
    1117       34821 :                 spin_lock(&inode->i_lock);
    1118       34821 :                 if (unlikely(inode->i_pipe)) {
    1119           0 :                         inode->i_pipe->files++;
    1120           0 :                         spin_unlock(&inode->i_lock);
    1121           0 :                         free_pipe_info(pipe);
    1122           0 :                         pipe = inode->i_pipe;
    1123             :                 } else {
    1124       34821 :                         inode->i_pipe = pipe;
    1125       34821 :                         spin_unlock(&inode->i_lock);
    1126             :                 }
    1127             :         }
    1128      805853 :         filp->private_data = pipe;
    1129             :         /* OK, we have a pipe and it's pinned down */
    1130             : 
    1131      805853 :         __pipe_lock(pipe);
    1132             : 
    1133             :         /* We can only do regular read/write on fifos */
    1134      805853 :         stream_open(inode, filp);
    1135             : 
    1136      805853 :         switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
    1137      770399 :         case FMODE_READ:
    1138             :         /*
    1139             :          *  O_RDONLY
    1140             :          *  POSIX.1 says that O_NONBLOCK means return with the FIFO
    1141             :          *  opened, even when there is no process writing the FIFO.
    1142             :          */
    1143      770399 :                 pipe->r_counter++;
    1144      770399 :                 if (pipe->readers++ == 0)
    1145       34742 :                         wake_up_partner(pipe);
    1146             : 
    1147      770399 :                 if (!is_pipe && !pipe->writers) {
    1148       34742 :                         if ((filp->f_flags & O_NONBLOCK)) {
    1149             :                                 /* suppress EPOLLHUP until we have
    1150             :                                  * seen a writer */
    1151       34742 :                                 filp->f_version = pipe->w_counter;
    1152             :                         } else {
    1153           0 :                                 if (wait_for_partner(pipe, &pipe->w_counter))
    1154           0 :                                         goto err_rd;
    1155             :                         }
    1156             :                 }
    1157             :                 break;
    1158             : 
    1159       34137 :         case FMODE_WRITE:
    1160             :         /*
    1161             :          *  O_WRONLY
    1162             :          *  POSIX.1 says that O_NONBLOCK means return -1 with
    1163             :          *  errno=ENXIO when there is no process reading the FIFO.
    1164             :          */
    1165       34137 :                 ret = -ENXIO;
    1166       34137 :                 if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
    1167           0 :                         goto err;
    1168             : 
    1169       34137 :                 pipe->w_counter++;
    1170       34137 :                 if (!pipe->writers++)
    1171       33488 :                         wake_up_partner(pipe);
    1172             : 
    1173       34137 :                 if (!is_pipe && !pipe->readers) {
    1174           0 :                         if (wait_for_partner(pipe, &pipe->r_counter))
    1175           0 :                                 goto err_wr;
    1176             :                 }
    1177             :                 break;
    1178             : 
    1179        1317 :         case FMODE_READ | FMODE_WRITE:
    1180             :         /*
    1181             :          *  O_RDWR
    1182             :          *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
    1183             :          *  This implementation will NEVER block on a O_RDWR open, since
    1184             :          *  the process can at least talk to itself.
    1185             :          */
    1186             : 
    1187        1317 :                 pipe->readers++;
    1188        1317 :                 pipe->writers++;
    1189        1317 :                 pipe->r_counter++;
    1190        1317 :                 pipe->w_counter++;
    1191        1317 :                 if (pipe->readers == 1 || pipe->writers == 1)
    1192          79 :                         wake_up_partner(pipe);
    1193             :                 break;
    1194             : 
    1195           0 :         default:
    1196           0 :                 ret = -EINVAL;
    1197           0 :                 goto err;
    1198             :         }
    1199             : 
    1200             :         /* Ok! */
    1201      805853 :         __pipe_unlock(pipe);
    1202      805853 :         return 0;
    1203             : /* Back out the reader count; wake wr_wait if that was the last reader. */
    1204             : err_rd:
    1205           0 :         if (!--pipe->readers)
    1206           0 :                 wake_up_interruptible(&pipe->wr_wait);
    1207           0 :         ret = -ERESTARTSYS;
    1208           0 :         goto err;
    1209             : /* Back out the writer count; wake rd_wait if that was the last writer. */
    1210             : err_wr:
    1211           0 :         if (!--pipe->writers)
    1212           0 :                 wake_up_interruptible_all(&pipe->rd_wait);
    1213           0 :         ret = -ERESTARTSYS;
    1214           0 :         goto err;
    1215             : /*
                     :  * Common failure exit: unlock the pipe and hand it to put_pipe_info().
                     :  * NOTE(review): put_pipe_info() presumably drops the ->files reference
                     :  * taken at the top of this function - confirm against its definition.
                     :  */
    1216           0 : err:
    1217           0 :         __pipe_unlock(pipe);
    1218             : 
    1219           0 :         put_pipe_info(inode, pipe);
    1220           0 :         return ret;
    1221             : }
    1222             : /*
                     :  * File operations table whose .open is fifo_open(). get_pipe_info()
                     :  * compares file->f_op against the address of this table to decide
                     :  * whether a file is a pipe.
                     :  */
    1223             : const struct file_operations pipefifo_fops = {
    1224             :         .open           = fifo_open,
    1225             :         .llseek         = no_llseek,
    1226             :         .read_iter      = pipe_read,
    1227             :         .write_iter     = pipe_write,
    1228             :         .poll           = pipe_poll,
    1229             :         .unlocked_ioctl = pipe_ioctl,
    1230             :         .release        = pipe_release,
    1231             :         .fasync         = pipe_fasync,
    1232             :         .splice_write   = iter_file_splice_write,
    1233             : };
    1234             : 
    1235             : /*
    1236             :  * Currently we rely on the pipe array holding a power-of-2 number
    1237             :  * of pages. Returns 0 on error.
                     :  *
                     :  * @size: requested pipe size in bytes.
                     :  *
                     :  * Requests larger than 2^31 are rejected (return 0), requests smaller
                     :  * than PAGE_SIZE yield PAGE_SIZE, and everything else is rounded up to
                     :  * the next power of two with roundup_pow_of_two().
    1238             :  */
    1239           0 : unsigned int round_pipe_size(unsigned long size)
    1240             : {
    1241           0 :         if (size > (1U << 31))
    1242             :                 return 0;
    1243             : 
    1244             :         /* Minimum pipe size, as required by POSIX */
    1245           0 :         if (size < PAGE_SIZE)
    1246             :                 return PAGE_SIZE;
    1247             : 
    1248           0 :         return roundup_pow_of_two(size);
    1249             : }
    1250             : 
    1251             : /*
    1252             :  * Resize the pipe ring to a number of slots.
    1253             :  *
    1254             :  * Note the pipe can be reduced in capacity, but only if the current
    1255             :  * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
    1256             :  * returned instead.
                     :  *
                     :  * The occupied buffers are copied to the start of a freshly allocated
                     :  * array, so on success pipe->tail is 0 and pipe->head is the occupancy.
                     :  * pipe->max_usage is lowered to nr_slots if it was larger, the old
                     :  * array is freed, and waiters on pipe->wr_wait are woken.
                     :  *
                     :  * Returns 0 on success, -ENOMEM if the new array cannot be allocated,
                     :  * or -EBUSY as described above.
    1257             :  */
    1258           0 : int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
    1259             : {
    1260           0 :         struct pipe_buffer *bufs;
    1261           0 :         unsigned int head, tail, mask, n;
    1262             :         /* The new array is allocated before rd_wait.lock is taken. */
    1263           0 :         bufs = kcalloc(nr_slots, sizeof(*bufs),
    1264             :                        GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
    1265           0 :         if (unlikely(!bufs))
    1266             :                 return -ENOMEM;
    1267             : 
    1268           0 :         spin_lock_irq(&pipe->rd_wait.lock);
    1269           0 :         mask = pipe->ring_size - 1;
    1270           0 :         head = pipe->head;
    1271           0 :         tail = pipe->tail;
    1272             : 
    1273           0 :         n = pipe_occupancy(head, tail);
    1274           0 :         if (nr_slots < n) {
    1275           0 :                 spin_unlock_irq(&pipe->rd_wait.lock);
    1276           0 :                 kfree(bufs);
    1277           0 :                 return -EBUSY;
    1278             :         }
    1279             : 
    1280             :         /*
    1281             :          * The pipe array wraps around, so just start the new one at zero
    1282             :          * and adjust the indices.
                     :          *
                     :          * h and t are head and tail reduced to indices into the old
                     :          * array. If h > t the n occupied slots are contiguous starting
                     :          * at t. Otherwise they run from t to the end of the old array
                     :          * (tsize slots) and continue with the first h slots, which are
                     :          * placed right after them in the new array.
    1283             :          */
    1284           0 :         if (n > 0) {
    1285           0 :                 unsigned int h = head & mask;
    1286           0 :                 unsigned int t = tail & mask;
    1287           0 :                 if (h > t) {
    1288           0 :                         memcpy(bufs, pipe->bufs + t,
    1289             :                                n * sizeof(struct pipe_buffer));
    1290             :                 } else {
    1291           0 :                         unsigned int tsize = pipe->ring_size - t;
    1292           0 :                         if (h > 0)
    1293           0 :                                 memcpy(bufs + tsize, pipe->bufs,
    1294             :                                        h * sizeof(struct pipe_buffer));
    1295           0 :                         memcpy(bufs, pipe->bufs + t,
    1296             :                                tsize * sizeof(struct pipe_buffer));
    1297             :                 }
    1298             :         }
    1299             :         /* New indices: occupied slots now live in bufs[0..n-1]. */
    1300           0 :         head = n;
    1301           0 :         tail = 0;
    1302             : 
    1303           0 :         kfree(pipe->bufs);
    1304           0 :         pipe->bufs = bufs;
    1305           0 :         pipe->ring_size = nr_slots;
    1306           0 :         if (pipe->max_usage > nr_slots)
    1307           0 :                 pipe->max_usage = nr_slots;
    1308           0 :         pipe->tail = tail;
    1309           0 :         pipe->head = head;
    1310             : 
    1311           0 :         spin_unlock_irq(&pipe->rd_wait.lock);
    1312             : 
    1313             :         /* This might have made more room for writers */
    1314           0 :         wake_up_interruptible(&pipe->wr_wait);
    1315           0 :         return 0;
    1316             : }
    1317             : 
    1318             : /*
    1319             :  * Set the capacity of a pipe to @arg bytes, as rounded by
    1320             :  * round_pipe_size(). The ring itself is reallocated and copied by
                     :  * pipe_resize_ring().
                     :  *
                     :  * Returns the new pipe size in bytes (max_usage * PAGE_SIZE) if
                     :  * successful, or a negative errno:
                     :  *   -EBUSY   the pipe has a watch queue (CONFIG_WATCH_QUEUE only)
                     :  *   -EINVAL  the rounded size yields zero slots
                     :  *   -EPERM   one of the limit checks below refused the increase
                     :  *   other    whatever pipe_resize_ring() returned
    1321             :  */
    1322           0 : static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
    1323             : {
    1324           0 :         unsigned long user_bufs;
    1325           0 :         unsigned int nr_slots, size;
    1326           0 :         long ret = 0;
    1327             : 
    1328             : #ifdef CONFIG_WATCH_QUEUE
    1329             :         if (pipe->watch_queue)
    1330             :                 return -EBUSY;
    1331             : #endif
    1332             : 
    1333           0 :         size = round_pipe_size(arg);
    1334           0 :         nr_slots = size >> PAGE_SHIFT;
    1335             : 
    1336           0 :         if (!nr_slots)
    1337             :                 return -EINVAL;
    1338             : 
    1339             :         /*
    1340             :          * If trying to increase the pipe capacity, check that an
    1341             :          * unprivileged user is not trying to exceed various limits
    1342             :          * (pipe_max_size check here, bypassed with CAP_SYS_RESOURCE;
                     :          * per-user hard/soft buffer limit checks just below).
    1343             :          * Decreasing the pipe capacity is always permitted, even
    1344             :          * if the user is currently over a limit.
    1345             :          */
    1346           0 :         if (nr_slots > pipe->max_usage &&
    1347           0 :                         size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
    1348             :                 return -EPERM;
    1349             :         /*
                     :          * Switch the accounted slot count from nr_accounted to nr_slots
                     :          * before checking limits; out_revert_acct undoes this by calling
                     :          * account_pipe_buffers() with the two counts swapped.
                     :          */
    1350           0 :         user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
    1351             : 
    1352           0 :         if (nr_slots > pipe->max_usage &&
    1353           0 :                         (too_many_pipe_buffers_hard(user_bufs) ||
    1354           0 :                          too_many_pipe_buffers_soft(user_bufs)) &&
    1355           0 :                         pipe_is_unprivileged_user()) {
    1356           0 :                 ret = -EPERM;
    1357           0 :                 goto out_revert_acct;
    1358             :         }
    1359             : 
    1360           0 :         ret = pipe_resize_ring(pipe, nr_slots);
    1361           0 :         if (ret < 0)
    1362           0 :                 goto out_revert_acct;
    1363             : 
    1364           0 :         pipe->max_usage = nr_slots;
    1365           0 :         pipe->nr_accounted = nr_slots;
    1366           0 :         return pipe->max_usage * PAGE_SIZE;
    1367             : 
    1368           0 : out_revert_acct:
    1369           0 :         (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
    1370           0 :         return ret;
    1371             : }
    1372             : 
    1373             : /*
    1374             :  * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
    1375             :  * not enough to verify that this is a pipe.
                     :  *
                     :  * Instead the file's f_op is compared against pipefifo_fops. Returns the
                     :  * pipe stored in file->private_data, or NULL if f_op does not match or
                     :  * private_data is NULL. With CONFIG_WATCH_QUEUE, NULL is also returned
                     :  * when @for_splice is true and the pipe has a watch queue.
    1376             :  */
    1377    28958082 : struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
    1378             : {
    1379    28992681 :         struct pipe_inode_info *pipe = file->private_data;
    1380             : 
    1381    28992681 :         if (file->f_op != &pipefifo_fops || !pipe)
    1382    14345469 :                 return NULL;
    1383             : #ifdef CONFIG_WATCH_QUEUE
    1384             :         if (for_splice && pipe->watch_queue)
    1385             :                 return NULL;
    1386             : #endif
    1387             :         return pipe;
    1388             : }
    1389             : /*
                     :  * Handle the pipe-size fcntl commands for @file.
                     :  *
                     :  * Returns -EBADF if get_pipe_info() does not recognise @file as a pipe.
                     :  * Otherwise, with the pipe locked:
                     :  *   F_SETPIPE_SZ   returns the result of pipe_set_size(pipe, arg)
                     :  *   F_GETPIPE_SZ   returns pipe->max_usage * PAGE_SIZE
                     :  *   anything else  returns -EINVAL
                     :  */
    1390       34599 : long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
    1391             : {
    1392       34599 :         struct pipe_inode_info *pipe;
    1393       34599 :         long ret;
    1394             : 
    1395       34599 :         pipe = get_pipe_info(file, false);
    1396       34599 :         if (!pipe)
    1397             :                 return -EBADF;
    1398             : 
    1399       34599 :         __pipe_lock(pipe);
    1400             : 
    1401       34599 :         switch (cmd) {
    1402           0 :         case F_SETPIPE_SZ:
    1403           0 :                 ret = pipe_set_size(pipe, arg);
    1404           0 :                 break;
    1405       34599 :         case F_GETPIPE_SZ:
    1406       34599 :                 ret = pipe->max_usage * PAGE_SIZE;
    1407       34599 :                 break;
    1408             :         default:
    1409             :                 ret = -EINVAL;
    1410             :                 break;
    1411             :         }
    1412             : 
    1413       34599 :         __pipe_unlock(pipe);
    1414       34599 :         return ret;
    1415             : }
    1416             : /*
                     :  * Superblock operations for pipefs; installed as ctx->ops by
                     :  * pipefs_init_fs_context().
                     :  */
    1417             : static const struct super_operations pipefs_ops = {
    1418             :         .destroy_inode = free_inode_nonrcu,
    1419             :         .statfs = simple_statfs,
    1420             : };
    1421             : 
    1422             : /*
    1423             :  * pipefs should _never_ be mounted by userland - too much of security hassle,
    1424             :  * no real gain from having the whole file system mounted. So we don't need
    1425             :  * any operations on the root directory. However, we need a non-trivial
    1426             :  * d_name - pipe: will go nicely and kill the special-casing in procfs.
                     :  *
                     :  * The fs_context initialiser below creates a pseudo filesystem context
                     :  * with PIPEFS_MAGIC and points it at pipefs_ops and
                     :  * pipefs_dentry_operations. Returns 0, or -ENOMEM if init_pseudo()
                     :  * returns NULL.
    1427             :  */
    1428             : 
    1429           0 : static int pipefs_init_fs_context(struct fs_context *fc)
    1430             : {
    1431           0 :         struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
    1432           0 :         if (!ctx)
    1433             :                 return -ENOMEM;
    1434           0 :         ctx->ops = &pipefs_ops;
    1435           0 :         ctx->dops = &pipefs_dentry_operations;
    1436           0 :         return 0;
    1437             : }
    1438             : /*
                     :  * Filesystem type for "pipefs"; registered and kern_mount()ed by
                     :  * init_pipe_fs().
                     :  */
    1439             : static struct file_system_type pipe_fs_type = {
    1440             :         .name           = "pipefs",
    1441             :         .init_fs_context = pipefs_init_fs_context,
    1442             :         .kill_sb        = kill_anon_super,
    1443             : };
    1444             : /*
                     :  * do_proc_dopipe_max_size_conv(): conversion callback handed to
                     :  * do_proc_douintvec() by proc_dopipe_max_size(). Only built with
                     :  * CONFIG_SYSCTL.
                     :  *
                     :  * On write, *lvalp is passed through round_pipe_size() and the result
                     :  * is stored in *valp; -EINVAL is returned if round_pipe_size() rejects
                     :  * the value (returns 0). On read, *valp is copied into *lvalp. Returns 0
                     :  * in all other cases. @data is not used.
                     :  */
    1445             : #ifdef CONFIG_SYSCTL
    1446           0 : static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
    1447             :                                         unsigned int *valp,
    1448             :                                         int write, void *data)
    1449             : {
    1450           0 :         if (write) {
    1451           0 :                 unsigned int val;
    1452             : 
    1453           0 :                 val = round_pipe_size(*lvalp);
    1454           0 :                 if (val == 0)
    1455             :                         return -EINVAL;
    1456             : 
    1457           0 :                 *valp = val;
    1458             :         } else {
    1459           0 :                 unsigned int val = *valp;
    1460           0 :                 *lvalp = (unsigned long) val;
    1461             :         }
    1462             : 
    1463             :         return 0;
    1464             : }
    1465             : /*
                     :  * proc_handler used by the "pipe-max-size" entry of fs_pipe_sysctls.
                     :  * Defers to do_proc_douintvec() with do_proc_dopipe_max_size_conv() as
                     :  * the conversion callback (no private data) and returns its result.
                     :  */
    1466           0 : static int proc_dopipe_max_size(struct ctl_table *table, int write,
    1467             :                                 void *buffer, size_t *lenp, loff_t *ppos)
    1468             : {
    1469           0 :         return do_proc_douintvec(table, write, buffer, lenp, ppos,
    1470             :                                  do_proc_dopipe_max_size_conv, NULL);
    1471             : }
    1472             : /*
                     :  * sysctl entries registered under "fs" by init_pipe_fs(), all mode 0644.
                     :  * pipe-max-size goes through proc_dopipe_max_size(); the two
                     :  * pipe-user-pages-* entries use proc_doulongvec_minmax. The table ends
                     :  * with an empty entry.
                     :  */
    1473             : static struct ctl_table fs_pipe_sysctls[] = {
    1474             :         {
    1475             :                 .procname       = "pipe-max-size",
    1476             :                 .data           = &pipe_max_size,
    1477             :                 .maxlen         = sizeof(pipe_max_size),
    1478             :                 .mode           = 0644,
    1479             :                 .proc_handler   = proc_dopipe_max_size,
    1480             :         },
    1481             :         {
    1482             :                 .procname       = "pipe-user-pages-hard",
    1483             :                 .data           = &pipe_user_pages_hard,
    1484             :                 .maxlen         = sizeof(pipe_user_pages_hard),
    1485             :                 .mode           = 0644,
    1486             :                 .proc_handler   = proc_doulongvec_minmax,
    1487             :         },
    1488             :         {
    1489             :                 .procname       = "pipe-user-pages-soft",
    1490             :                 .data           = &pipe_user_pages_soft,
    1491             :                 .maxlen         = sizeof(pipe_user_pages_soft),
    1492             :                 .mode           = 0644,
    1493             :                 .proc_handler   = proc_doulongvec_minmax,
    1494             :         },
    1495             :         { }
    1496             : };
    1497             : #endif
    1498             : /*
                     :  * Initialisation routine, hooked up through fs_initcall() below.
                     :  * Registers pipe_fs_type and stores the result of kern_mount() in
                     :  * pipe_mnt. If kern_mount() fails, the filesystem is unregistered again
                     :  * and the error is returned; a register_filesystem() error is returned
                     :  * as-is.
                     :  *
                     :  * NOTE(review): with CONFIG_SYSCTL the sysctl table is registered even
                     :  * when err is nonzero - confirm this is intended.
                     :  */
    1499           0 : static int __init init_pipe_fs(void)
    1500             : {
    1501           0 :         int err = register_filesystem(&pipe_fs_type);
    1502             : 
    1503           0 :         if (!err) {
    1504           0 :                 pipe_mnt = kern_mount(&pipe_fs_type);
    1505           0 :                 if (IS_ERR(pipe_mnt)) {
    1506           0 :                         err = PTR_ERR(pipe_mnt);
    1507           0 :                         unregister_filesystem(&pipe_fs_type);
    1508             :                 }
    1509             :         }
    1510             : #ifdef CONFIG_SYSCTL
    1511           0 :         register_sysctl_init("fs", fs_pipe_sysctls);
    1512             : #endif
    1513           0 :         return err;
    1514             : }
    1515             : 
    1516             : fs_initcall(init_pipe_fs);

Generated by: LCOV version 1.14