LCOV - code coverage report
Current view: top level - fs - pipe.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023 Lines: 415 657 63.2 %
Date: 2023-07-31 20:08:07 Functions: 22 42 52.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  linux/fs/pipe.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
       6             :  */
       7             : 
       8             : #include <linux/mm.h>
       9             : #include <linux/file.h>
      10             : #include <linux/poll.h>
      11             : #include <linux/slab.h>
      12             : #include <linux/module.h>
      13             : #include <linux/init.h>
      14             : #include <linux/fs.h>
      15             : #include <linux/log2.h>
      16             : #include <linux/mount.h>
      17             : #include <linux/pseudo_fs.h>
      18             : #include <linux/magic.h>
      19             : #include <linux/pipe_fs_i.h>
      20             : #include <linux/uio.h>
      21             : #include <linux/highmem.h>
      22             : #include <linux/pagemap.h>
      23             : #include <linux/audit.h>
      24             : #include <linux/syscalls.h>
      25             : #include <linux/fcntl.h>
      26             : #include <linux/memcontrol.h>
      27             : #include <linux/watch_queue.h>
      28             : #include <linux/sysctl.h>
      29             : 
      30             : #include <linux/uaccess.h>
      31             : #include <asm/ioctls.h>
      32             : 
      33             : #include "internal.h"
      34             : 
      35             : /*
      36             :  * New pipe buffers will be restricted to this size while the user is exceeding
      37             :  * their pipe buffer quota. The general pipe use case needs at least two
      38             :  * buffers: one for data yet to be read, and one for new data. If this is less
      39             :  * than two, then a write to a non-empty pipe may block even if the pipe is not
      40             :  * full. This can occur with GNU make jobserver or similar uses of pipes as
      41             :  * semaphores: multiple processes may be waiting to write tokens back to the
      42             :  * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
      43             :  *
      44             :  * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
      45             :  * own risk, namely: pipe writes to non-full pipes may block until the pipe is
      46             :  * emptied.
      47             :  */
      48             : #define PIPE_MIN_DEF_BUFFERS 2
      49             : 
      50             : /*
      51             :  * The max size that a non-root user is allowed to grow the pipe. Can
      52             :  * be set by root in /proc/sys/fs/pipe-max-size. The built-in default is 1048576.
      53             :  */
      54             : static unsigned int pipe_max_size = 1048576;
      55             : 
      56             : /* Maximum allocatable pages per user. The hard limit has no initializer, so it
      57             :  * starts at zero (unset); the soft limit starts at PIPE_DEF_BUFFERS * INR_OPEN_CUR.
      58             :  */
      59             : static unsigned long pipe_user_pages_hard;
      60             : static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
      61             : 
      62             : /*
      63             :  * We use head and tail indices that aren't masked off, except at the point of
      64             :  * dereference, but rather they're allowed to wrap naturally.  This means there
      65             :  * isn't a dead spot in the buffer, but the ring has to be a power of two and
      66             :  * <= 2^31.
      67             :  * -- David Howells 2019-09-23.
      68             :  *
      69             :  * Reads with count = 0 should always return 0.
      70             :  * -- Julian Bradfield 1999-06-07.
      71             :  *
      72             :  * FIFOs and Pipes now generate SIGIO for both readers and writers.
      73             :  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
      74             :  *
      75             :  * pipe_read & write cleanup
      76             :  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
      77             :  */
      78             : 
      79             : static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) /* take pipe->mutex, passing @subclass to mutex_lock_nested(); does nothing when pipe->files is zero */
      80             : {
      81     6060511 :         if (pipe->files)
      82     2027173 :                 mutex_lock_nested(&pipe->mutex, subclass);
      83             : }
      84             : 
      85     6020522 : void pipe_lock(struct pipe_inode_info *pipe)
      86             : {
      87             :         /*
      88             :          * pipe_lock() nests non-pipe inode locks (for writing to a file), so the mutex is taken with subclass I_MUTEX_PARENT; nothing is locked when pipe->files is zero.
      89             :          */
      90     6020522 :         pipe_lock_nested(pipe, I_MUTEX_PARENT);
      91     6020529 : }
      92             : EXPORT_SYMBOL(pipe_lock);
      93             : 
      94     6020537 : void pipe_unlock(struct pipe_inode_info *pipe) /* release pipe->mutex; does nothing when pipe->files is zero, matching the check made when locking */
      95             : {
      96     6020537 :         if (pipe->files)
      97     2027202 :                 mutex_unlock(&pipe->mutex);
      98     6020511 : }
      99             : EXPORT_SYMBOL(pipe_unlock);
     100             : 
     101             : static inline void __pipe_lock(struct pipe_inode_info *pipe) /* take pipe->mutex with subclass I_MUTEX_PARENT unconditionally (no pipe->files check) */
     102             : {
     103   759019860 :         mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
     104             : }
     105             : 
     106             : static inline void __pipe_unlock(struct pipe_inode_info *pipe) /* release pipe->mutex unconditionally (no pipe->files check) */
     107             : {
     108   759021144 :         mutex_unlock(&pipe->mutex);
     109             : }
     110             : 
     111           0 : void pipe_double_lock(struct pipe_inode_info *pipe1,
     112             :                       struct pipe_inode_info *pipe2) /* lock two distinct pipes; the argument order does not affect the locking order */
     113             : {
     114           0 :         BUG_ON(pipe1 == pipe2); /* the two pipes must be different objects */
     115             : 
     116           0 :         if (pipe1 < pipe2) { /* the lower-addressed pipe is always locked first (as PARENT), the other second (as CHILD) */
     117           0 :                 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
     118           0 :                 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
     119             :         } else {
     120           0 :                 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
     121           0 :                 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
     122             :         }
     123           0 : }
     124             : 
     125     8361667 : static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
     126             :                                   struct pipe_buffer *buf) /* drop the buffer's page, caching it in pipe->tmp_page when possible */
     127             : {
     128     8361667 :         struct page *page = buf->page;
     129             : 
     130             :         /*
     131             :          * If nobody else uses this page, and we don't already have a
     132             :          * temporary page, let's keep track of it in pipe->tmp_page as a one-deep
     133             :          * allocation cache. (Otherwise just release our reference to it with put_page().)
     134             :          */
     135    16723334 :         if (page_count(page) == 1 && !pipe->tmp_page)
     136     8355020 :                 pipe->tmp_page = page;
     137             :         else
     138        6647 :                 put_page(page);
     139     8361106 : }
     140             : 
     141           0 : static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
     142             :                 struct pipe_buffer *buf) /* returns true only when page_count() of the buffer's page is exactly 1 */
     143             : {
     144           0 :         struct page *page = buf->page;
     145             : 
     146           0 :         if (page_count(page) != 1)
     147             :                 return false; /* the page is left untouched in this case */
     148           0 :         memcg_kmem_uncharge_page(page, 0);
     149           0 :         __SetPageLocked(page); /* the page is handed back with its locked flag set */
     150           0 :         return true;
     151             : }
     152             : 
     153             : /**
     154             :  * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
     155             :  * @pipe:       the pipe that the buffer belongs to
     156             :  * @buf:        the buffer to attempt to steal
     157             :  *
     158             :  * Description:
     159             :  *      This function attempts to steal the &struct page attached to
     160             :  *      @buf. If successful, this function returns true and returns with
     161             :  *      the page locked. The caller may then reuse the page for whatever
     162             :  *      it wishes; the typical use is insertion into a different file
     163             :  *      page cache. Otherwise it returns false without locking the page.
     164             :  */
     165           0 : bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
     166             :                 struct pipe_buffer *buf)
     167             : {
     168           0 :         struct page *page = buf->page;
     169             : 
     170             :         /*
     171             :          * A reference of one is golden, that means that the owner of this
     172             :          * page is the only one holding a reference to it. lock the page
     173             :          * and return true.
     174             :          */
     175           0 :         if (page_count(page) == 1) {
     176           0 :                 lock_page(page);
     177           0 :                 return true;
     178             :         }
     179             :         return false;
     180             : }
     181             : EXPORT_SYMBOL(generic_pipe_buf_try_steal);
     181             : EXPORT_SYMBOL(generic_pipe_buf_try_steal);
     182             : 
     183             : /**
     184             :  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
     185             :  * @pipe:       the pipe that the buffer belongs to
     186             :  * @buf:        the buffer to get a reference to
     187             :  *
     188             :  * Description:
     189             :  *      This function grabs an extra reference to @buf. It's used in
     190             :  *      the tee() system call, when we duplicate the buffers in one
     191             :  *      pipe into another. The return value is whatever try_get_page() returns for the buffer's page.
     192             :  */
     193           0 : bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
     194             : {
     195           0 :         return try_get_page(buf->page);
     196             : }
     197             : EXPORT_SYMBOL(generic_pipe_buf_get);
     198             : 
     199             : /**
     200             :  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
     201             :  * @pipe:       the pipe that the buffer belongs to
     202             :  * @buf:        the buffer to put a reference to
     203             :  *
     204             :  * Description:
     205             :  *      This function releases a reference to @buf by calling put_page() on the buffer's page.
     206             :  */
     207       65495 : void generic_pipe_buf_release(struct pipe_inode_info *pipe,
     208             :                               struct pipe_buffer *buf)
     209             : {
     210       65495 :         put_page(buf->page);
     211       65495 : }
     212             : EXPORT_SYMBOL(generic_pipe_buf_release);
     213             : 
     214             : static const struct pipe_buf_operations anon_pipe_buf_ops = { /* ops installed on the buffers that pipe_write() creates; only these three hooks are set */
     215             :         .release        = anon_pipe_buf_release,
     216             :         .try_steal      = anon_pipe_buf_try_steal,
     217             :         .get            = generic_pipe_buf_get,
     218             : };
     219             : 
     220             : /* Done while waiting without holding the pipe lock - thus the READ_ONCE(). True when the ring is not empty or when there are no writers. */
     221             : static inline bool pipe_readable(const struct pipe_inode_info *pipe)
     222             : {
     223    35085761 :         unsigned int head = READ_ONCE(pipe->head);
     224    35085761 :         unsigned int tail = READ_ONCE(pipe->tail);
     225    35085761 :         unsigned int writers = READ_ONCE(pipe->writers);
     226             : 
     227    28086003 :         return !pipe_empty(head, tail) || !writers;
     228             : }
     229             : 
     230             : static ssize_t
     231   719194253 : pipe_read(struct kiocb *iocb, struct iov_iter *to) /* copy buffered pipe data into @to; returns the byte count copied, 0, or a negative errno (-EAGAIN, -EFAULT, -ENOBUFS, -ERESTARTSYS, or a pipe_buf_confirm() error) */
     232             : {
     233   719194253 :         size_t total_len = iov_iter_count(to);
     234   719194253 :         struct file *filp = iocb->ki_filp;
     235   719194253 :         struct pipe_inode_info *pipe = filp->private_data;
     236   719194253 :         bool was_full, wake_next_reader = false;
     237   719194253 :         ssize_t ret;
     238             : 
     239             :         /* Null read succeeds. */
     240   719194253 :         if (unlikely(total_len == 0))
     241             :                 return 0;
     242             : 
     243   719194253 :         ret = 0;
     244   719194253 :         __pipe_lock(pipe);
     245             : 
     246             :         /*
     247             :          * We only wake up writers if the pipe was full when we started
     248             :          * reading in order to avoid unnecessary wakeups.
     249             :          *
     250             :          * But when we do wake up writers, we do so using a sync wakeup
     251             :          * (WF_SYNC), because we want them to get going and generate more
     252             :          * data for us.
     253             :          */
     254   719195423 :         was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
     255   730850622 :         for (;;) {
     256             :                 /* Read ->head with a barrier vs post_one_notification() */
     257   730850622 :                 unsigned int head = smp_load_acquire(&pipe->head);
     258   730853071 :                 unsigned int tail = pipe->tail;
     259   730853071 :                 unsigned int mask = pipe->ring_size - 1;
     260             : 
     261             : #ifdef CONFIG_WATCH_QUEUE
     262             :                 if (pipe->note_loss) {
     263             :                         struct watch_notification n;
     264             : 
     265             :                         if (total_len < 8) {
     266             :                                 if (ret == 0)
     267             :                                         ret = -ENOBUFS;
     268             :                                 break;
     269             :                         }
     270             : 
     271             :                         n.type = WATCH_TYPE_META;
     272             :                         n.subtype = WATCH_META_LOSS_NOTIFICATION;
     273             :                         n.info = watch_sizeof(n);
     274             :                         if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
     275             :                                 if (ret == 0)
     276             :                                         ret = -EFAULT;
     277             :                                 break;
     278             :                         }
     279             :                         ret += sizeof(n);
     280             :                         total_len -= sizeof(n);
     281             :                         pipe->note_loss = false;
     282             :                 }
     283             : #endif
     284             : 
     285   730853071 :                 if (!pipe_empty(head, tail)) {
     286   711563548 :                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
     287   711563548 :                         size_t chars = buf->len;
     288   711563548 :                         size_t written;
     289   711563548 :                         int error;
     290             : 
     291   711563548 :                         if (chars > total_len) {
     292   703201220 :                                 if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
     293           0 :                                         if (ret == 0)
     294           0 :                                                 ret = -ENOBUFS;
     295             :                                         break;
     296             :                                 }
     297             :                                 chars = total_len;
     298             :                         }
     299             : 
     300   711563548 :                         error = pipe_buf_confirm(pipe, buf);
     301           8 :                         if (error) {
     302           0 :                                 if (!ret)
     303           0 :                                         ret = error;
     304             :                                 break;
     305             :                         }
     306             : 
     307   711563548 :                         written = copy_page_to_iter(buf->page, buf->offset, chars, to);
     308   711562585 :                         if (unlikely(written < chars)) {
     309           0 :                                 if (!ret)
     310           0 :                                         ret = -EFAULT;
     311             :                                 break;
     312             :                         }
     313   711562585 :                         ret += chars;
     314   711562585 :                         buf->offset += chars;
     315   711562585 :                         buf->len -= chars;
     316             : 
     317             :                         /* Was it a packet buffer? Drop whatever was not read; setting total_len = chars makes the subtraction below reach zero, so the loop exits */
     318   711562585 :                         if (buf->flags & PIPE_BUF_FLAG_PACKET) {
     319           0 :                                 total_len = chars;
     320           0 :                                 buf->len = 0;
     321             :                         }
     322             : 
     323   711562585 :                         if (!buf->len) {
     324     8359023 :                                 pipe_buf_release(pipe, buf);
     325     8359178 :                                 spin_lock_irq(&pipe->rd_wait.lock);
     326             : #ifdef CONFIG_WATCH_QUEUE
     327             :                                 if (buf->flags & PIPE_BUF_FLAG_LOSS)
     328             :                                         pipe->note_loss = true;
     329             : #endif
     330     8360248 :                                 tail++;
     331     8360248 :                                 pipe->tail = tail;
     332     8360248 :                                 spin_unlock_irq(&pipe->rd_wait.lock);
     333             :                         }
     334   711562737 :                         total_len -= chars;
     335   711562737 :                         if (!total_len)
     336             :                                 break;  /* common path: read succeeded */
     337     7286587 :                         if (!pipe_empty(head, tail))    /* More to do? */
     338        2749 :                                 continue;
     339             :                 }
     340             : 
     341    26573361 :                 if (!pipe->writers) /* no writers: stop and return ret as it stands */
     342             :                         break;
     343    16807054 :                 if (ret) /* something was already read: return it rather than wait for more */
     344             :                         break;
     345    11659948 :                 if ((filp->f_flags & O_NONBLOCK) ||
     346    11655884 :                     (iocb->ki_flags & IOCB_NOWAIT)) {
     347             :                         ret = -EAGAIN;
     348             :                         break;
     349             :                 }
     350    11655884 :                 __pipe_unlock(pipe);
     351             : 
     352             :                 /*
     353             :                  * We only get here if we didn't actually read anything.
     354             :                  *
     355             :                  * However, we could have seen (and removed) a zero-sized
     356             :                  * pipe buffer, and might have made space in the buffers
     357             :                  * that way.
     358             :                  *
     359             :                  * You can't make zero-sized pipe buffers by doing an empty
     360             :                  * write (not even in packet mode), but they can happen if
     361             :                  * the writer gets an EFAULT when trying to fill a buffer
     362             :                  * that already got allocated and inserted in the buffer
     363             :                  * array.
     364             :                  *
     365             :                  * So we still need to wake up any pending writers in the
     366             :                  * _very_ unlikely case that the pipe was full, but we got
     367             :                  * no data.
     368             :                  */
     369    11654779 :                 if (unlikely(was_full))
     370           0 :                         wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     371    11654779 :                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     372             : 
     373             :                 /*
     374             :                  * But because we didn't read anything, at this point we can
     375             :                  * just return directly with -ERESTARTSYS if we're interrupted,
     376             :                  * since we've done any required wakeups and there's no need
     377             :                  * to mark anything accessed. And we've dropped the lock.
     378             :                  */
     379    46621583 :                 if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
     380             :                         return -ERESTARTSYS;
     381             : 
     382    11653809 :                 __pipe_lock(pipe);
     383    11652450 :                 was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
     384    11652450 :                 wake_next_reader = true;
     385             :         }
     386   719193627 :         if (pipe_empty(pipe->head, pipe->tail))
     387    15977146 :                 wake_next_reader = false; /* the ring is empty, so the reader wakeup below is skipped */
     388   719193627 :         __pipe_unlock(pipe);
     389             : 
     390   719186785 :         if (was_full)
     391   482863246 :                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     392   719186785 :         if (wake_next_reader)
     393      591776 :                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     394   719186785 :         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     395   719186673 :         if (ret > 0)
     396   711554722 :                 file_accessed(filp);
     397             :         return ret;
     398             : }
     399             : 
     400             : static inline int is_packetized(struct file *file) /* returns 1 when O_DIRECT is set in file->f_flags, otherwise 0 */
     401             : {
     402     8360532 :         return (file->f_flags & O_DIRECT) != 0;
     403             : }
     404             : 
     405             : /* Done while waiting without holding the pipe lock - thus the READ_ONCE(). True when the ring is not full (relative to max_usage) or when there are no readers. */
     406             : static inline bool pipe_writable(const struct pipe_inode_info *pipe)
     407             : {
     408   348151463 :         unsigned int head = READ_ONCE(pipe->head);
     409   348151463 :         unsigned int tail = READ_ONCE(pipe->tail);
     410   348151463 :         unsigned int max_usage = READ_ONCE(pipe->max_usage);
     411             : 
     412  1044446949 :         return !pipe_full(head, tail, max_usage) ||
     413   348144023 :                 !READ_ONCE(pipe->readers);
     414             : }
     415             : 
     416             : static ssize_t
     417     8762378 : pipe_write(struct kiocb *iocb, struct iov_iter *from)
     418             : {
     419     8762378 :         struct file *filp = iocb->ki_filp;
     420     8762378 :         struct pipe_inode_info *pipe = filp->private_data;
     421     8762378 :         unsigned int head;
     422     8762378 :         ssize_t ret = 0;
     423     8762378 :         size_t total_len = iov_iter_count(from);
     424     8762378 :         ssize_t chars;
     425     8762378 :         bool was_empty = false;
     426     8762378 :         bool wake_next_writer = false;
     427             : 
     428             :         /* Null write succeeds. */
     429     8762378 :         if (unlikely(total_len == 0))
     430             :                 return 0;
     431             : 
     432     8762378 :         __pipe_lock(pipe);
     433             : 
     434     8762290 :         if (!pipe->readers) {
     435         325 :                 send_sig(SIGPIPE, current, 0);
     436         325 :                 ret = -EPIPE;
     437         325 :                 goto out;
     438             :         }
     439             : 
     440             : #ifdef CONFIG_WATCH_QUEUE
     441             :         if (pipe->watch_queue) {
     442             :                 ret = -EXDEV;
     443             :                 goto out;
     444             :         }
     445             : #endif
     446             : 
     447             :         /*
     448             :          * If it wasn't empty we try to merge new data into
     449             :          * the last buffer.
     450             :          *
     451             :          * That naturally merges small writes, but it also
     452             :          * page-aligns the rest of the writes for large writes
     453             :          * spanning multiple pages.
     454             :          */
     455     8761965 :         head = pipe->head;
     456     8761965 :         was_empty = pipe_empty(head, pipe->tail);
     457     8761965 :         chars = total_len & (PAGE_SIZE-1);
     458     8761965 :         if (chars && !was_empty) {
     459      415487 :                 unsigned int mask = pipe->ring_size - 1;
     460      415487 :                 struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
     461      415487 :                 int offset = buf->offset + buf->len;
     462             : 
     463      415487 :                 if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
     464      415483 :                     offset + chars <= PAGE_SIZE) {
     465      402138 :                         ret = pipe_buf_confirm(pipe, buf);
     466      402138 :                         if (ret)
     467           0 :                                 goto out;
     468             : 
     469      402138 :                         ret = copy_page_from_iter(buf->page, offset, chars, from);
     470      402138 :                         if (unlikely(ret < chars)) {
     471           0 :                                 ret = -EFAULT;
     472           0 :                                 goto out;
     473             :                         }
     474             : 
     475      402138 :                         buf->len += ret;
     476      402138 :                         if (!iov_iter_count(from))
     477      402137 :                                 goto out;
     478             :                 }
     479             :         }
     480             : 
     481     8368735 :         for (;;) {
     482     8368735 :                 if (!pipe->readers) {
     483           4 :                         send_sig(SIGPIPE, current, 0);
     484           4 :                         if (!ret)
     485           4 :                                 ret = -EPIPE;
     486             :                         break;
     487             :                 }
     488             : 
     489     8368731 :                 head = pipe->head;
     490     8368731 :                 if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
     491     8360438 :                         unsigned int mask = pipe->ring_size - 1;
     492     8360438 :                         struct pipe_buffer *buf = &pipe->bufs[head & mask];
     493     8360438 :                         struct page *page = pipe->tmp_page;
     494     8360438 :                         int copied;
     495             : 
     496     8360438 :                         if (!page) {
     497     5838498 :                                 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
     498     5839492 :                                 if (unlikely(!page)) {
     499           0 :                                         ret = ret ? : -ENOMEM;
     500             :                                         break;
     501             :                                 }
     502     5839492 :                                 pipe->tmp_page = page;
     503             :                         }
     504             : 
     505             :                         /* Allocate a slot in the ring in advance and attach an
     506             :                          * empty buffer.  If we fault or otherwise fail to use
     507             :                          * it, either the reader will consume it or it'll still
     508             :                          * be there for the next write.
     509             :                          */
     510     8361432 :                         spin_lock_irq(&pipe->rd_wait.lock);
     511             : 
     512     8361888 :                         head = pipe->head;
     513     8361888 :                         if (pipe_full(head, pipe->tail, pipe->max_usage)) {
     514           0 :                                 spin_unlock_irq(&pipe->rd_wait.lock);
     515           0 :                                 continue;
     516             :                         }
     517             : 
     518     8361888 :                         pipe->head = head + 1;
     519     8361888 :                         spin_unlock_irq(&pipe->rd_wait.lock);
     520             : 
     521             :                         /* Insert it into the buffer array */
     522     8360532 :                         buf = &pipe->bufs[head & mask];
     523     8360532 :                         buf->page = page;
     524     8360532 :                         buf->ops = &anon_pipe_buf_ops;
     525     8360532 :                         buf->offset = 0;
     526     8360532 :                         buf->len = 0;
     527     8360532 :                         if (is_packetized(filp))
     528           0 :                                 buf->flags = PIPE_BUF_FLAG_PACKET;
     529             :                         else
     530     8360532 :                                 buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
     531     8360532 :                         pipe->tmp_page = NULL;
     532             : 
     533     8360532 :                         copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
     534     8361374 :                         if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
     535           0 :                                 if (!ret)
     536           0 :                                         ret = -EFAULT;
     537             :                                 break;
     538             :                         }
     539     8361374 :                         ret += copied;
     540     8361374 :                         buf->offset = 0;
     541     8361374 :                         buf->len = copied;
     542             : 
     543     8361374 :                         if (!iov_iter_count(from))
     544             :                                 break;
     545             :                 }
     546             : 
     547        9505 :                 if (!pipe_full(head, pipe->tail, pipe->max_usage))
     548        1463 :                         continue;
     549             : 
     550             :                 /* Wait for buffer space to become available. */
     551        8042 :                 if ((filp->f_flags & O_NONBLOCK) ||
     552        8042 :                     (iocb->ki_flags & IOCB_NOWAIT)) {
     553           0 :                         if (!ret)
     554           0 :                                 ret = -EAGAIN;
     555             :                         break;
     556             :                 }
     557        8042 :                 if (signal_pending(current)) {
     558           0 :                         if (!ret)
     559           0 :                                 ret = -ERESTARTSYS;
     560             :                         break;
     561             :                 }
     562             : 
     563             :                 /*
     564             :                  * We're going to release the pipe lock and wait for more
     565             :                  * space. We wake up any readers if necessary, and then
     566             :                  * after waiting we need to re-check whether the pipe
     567             :                  * became empty while we dropped the lock.
     568             :                  */
     569        7444 :                 __pipe_unlock(pipe);
     570        7444 :                 if (was_empty)
     571           0 :                         wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     572        7444 :                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     573   348158907 :                 wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
     574        7444 :                 __pipe_lock(pipe);
     575        7444 :                 was_empty = pipe_empty(pipe->head, pipe->tail);
     576        7444 :                 wake_next_writer = true;
     577             :         }
     578     8360162 : out:
     579     8762628 :         if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
     580       60272 :                 wake_next_writer = false;
     581     8762628 :         __pipe_unlock(pipe);
     582             : 
     583             :         /*
     584             :          * If we do do a wakeup event, we do a 'sync' wakeup, because we
     585             :          * want the reader to start processing things asap, rather than
     586             :          * leave the data pending.
     587             :          *
     588             :          * This is particularly important for small writes, because of
     589             :          * how (for example) the GNU make jobserver uses small writes to
     590             :          * wake up pending jobs.
     591             :          *
     592             :          * Epoll nonsensically wants a wakeup whether the pipe
     593             :          * was already empty or not.
     594             :          */
     595     8762784 :         if (was_empty || pipe->poll_usage)
     596     8585278 :                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
     597     8763019 :         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     598     8762974 :         if (wake_next_writer)
     599           8 :                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
     600     8762974 :         if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
     601     8762171 :                 int err = file_update_time(filp);
     602     8762117 :                 if (err)
     603           0 :                         ret = err;
     604     8762117 :                 sb_end_write(file_inode(filp)->i_sb);
     605             :         }
     606             :         return ret;
     607             : }
     608             : 
     609     1077788 : static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
     610             : {
     611     1077788 :         struct pipe_inode_info *pipe = filp->private_data;
     612     1077788 :         unsigned int count, head, tail, mask;
     613             : 
     614     1077788 :         switch (cmd) {
     615             :         case FIONREAD:
     616      279136 :                 __pipe_lock(pipe);
     617      279136 :                 count = 0;
     618      279136 :                 head = pipe->head;
     619      279136 :                 tail = pipe->tail;
     620      279136 :                 mask = pipe->ring_size - 1;
     621             : 
     622      418683 :                 while (tail != head) {
     623      139547 :                         count += pipe->bufs[tail & mask].len;
     624      139547 :                         tail++;
     625             :                 }
     626      279136 :                 __pipe_unlock(pipe);
     627             : 
     628      279136 :                 return put_user(count, (int __user *)arg);
     629             : 
     630             : #ifdef CONFIG_WATCH_QUEUE
     631             :         case IOC_WATCH_QUEUE_SET_SIZE: {
     632             :                 int ret;
     633             :                 __pipe_lock(pipe);
     634             :                 ret = watch_queue_set_size(pipe, arg);
     635             :                 __pipe_unlock(pipe);
     636             :                 return ret;
     637             :         }
     638             : 
     639             :         case IOC_WATCH_QUEUE_SET_FILTER:
     640             :                 return watch_queue_set_filter(
     641             :                         pipe, (struct watch_notification_filter __user *)arg);
     642             : #endif
     643             : 
     644             :         default:
     645             :                 return -ENOIOCTLCMD;
     646             :         }
     647             : }
     648             : 
     649             : /* No kernel lock held - fine */
     650             : static __poll_t
     651      435604 : pipe_poll(struct file *filp, poll_table *wait)
     652             : {
     653      435604 :         __poll_t mask;
     654      435604 :         struct pipe_inode_info *pipe = filp->private_data;
     655      435604 :         unsigned int head, tail;
     656             : 
     657             :         /* Epoll has some historical nasty semantics, this enables them */
     658      435604 :         WRITE_ONCE(pipe->poll_usage, true);
     659             : 
     660             :         /*
     661             :          * Reading pipe state only -- no need to acquire the pipe lock.
     662             :          *
     663             :          * But because this is racy, the code has to add the
     664             :          * entry to the poll table _first_ ..
     665             :          */
     666      435604 :         if (filp->f_mode & FMODE_READ)
     667      431818 :                 poll_wait(filp, &pipe->rd_wait, wait);
     668      435604 :         if (filp->f_mode & FMODE_WRITE)
     669       31644 :                 poll_wait(filp, &pipe->wr_wait, wait);
     670             : 
     671             :         /*
     672             :          * .. and only then can you do the racy tests. That way,
     673             :          * if something changes and you got it wrong, the poll
     674             :          * table entry will wake you up and fix it.
     675             :          */
     676      435603 :         head = READ_ONCE(pipe->head);
     677      435603 :         tail = READ_ONCE(pipe->tail);
     678             : 
     679      435603 :         mask = 0;
     680      435603 :         if (filp->f_mode & FMODE_READ) {
     681      431818 :                 if (!pipe_empty(head, tail))
     682       26645 :                         mask |= EPOLLIN | EPOLLRDNORM;
     683      431818 :                 if (!pipe->writers && filp->f_version != pipe->w_counter)
     684       85711 :                         mask |= EPOLLHUP;
     685             :         }
     686             : 
     687      435603 :         if (filp->f_mode & FMODE_WRITE) {
     688       31643 :                 if (!pipe_full(head, tail, pipe->max_usage))
     689       31643 :                         mask |= EPOLLOUT | EPOLLWRNORM;
     690             :                 /*
     691             :                  * Most Unices do not set EPOLLERR for FIFOs but on Linux they
     692             :                  * behave exactly like pipes for poll().
     693             :                  */
     694       31643 :                 if (!pipe->readers)
     695           0 :                         mask |= EPOLLERR;
     696             :         }
     697             : 
     698      435603 :         return mask;
     699             : }
     700             : 
     701    18536580 : static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
     702             : {
     703    18536580 :         int kill = 0;
     704             : 
     705    18536580 :         spin_lock(&inode->i_lock);
     706    18537028 :         if (!--pipe->files) {
     707     8984902 :                 inode->i_pipe = NULL;
     708     8984902 :                 kill = 1;
     709             :         }
     710    18537028 :         spin_unlock(&inode->i_lock);
     711             : 
     712    18535956 :         if (kill)
     713     8983790 :                 free_pipe_info(pipe);
     714    18536981 : }
     715             : 
     716             : static int
     717    18537456 : pipe_release(struct inode *inode, struct file *file)
     718             : {
     719    18537456 :         struct pipe_inode_info *pipe = file->private_data;
     720             : 
     721    18537456 :         __pipe_lock(pipe);
     722    18537644 :         if (file->f_mode & FMODE_READ)
     723     9552183 :                 pipe->readers--;
     724    18537644 :         if (file->f_mode & FMODE_WRITE)
     725     8985682 :                 pipe->writers--;
     726             : 
     727             :         /* Was that the last reader or writer, but not the other side? */
     728    18537644 :         if (!pipe->readers != !pipe->writers) {
     729     9551789 :                 wake_up_interruptible_all(&pipe->rd_wait);
     730     9551723 :                 wake_up_interruptible_all(&pipe->wr_wait);
     731     9551153 :                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
     732     9551143 :                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
     733             :         }
     734    18537041 :         __pipe_unlock(pipe);
     735             : 
     736    18536509 :         put_pipe_info(inode, pipe);
     737    18536550 :         return 0;
     738             : }
     739             : 
     740             : static int
     741           0 : pipe_fasync(int fd, struct file *filp, int on)
     742             : {
     743           0 :         struct pipe_inode_info *pipe = filp->private_data;
     744           0 :         int retval = 0;
     745             : 
     746           0 :         __pipe_lock(pipe);
     747           0 :         if (filp->f_mode & FMODE_READ)
     748           0 :                 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
     749           0 :         if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
     750           0 :                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
     751           0 :                 if (retval < 0 && (filp->f_mode & FMODE_READ))
     752             :                         /* this can happen only if on is true (non-zero) */
     753           0 :                         fasync_helper(-1, filp, 0, &pipe->fasync_readers);
     754             :         }
     755           0 :         __pipe_unlock(pipe);
     756           0 :         return retval;
     757             : }
     758             : 
     759    18033644 : unsigned long account_pipe_buffers(struct user_struct *user,
     760             :                                    unsigned long old, unsigned long new)
     761             : {
     762    18033644 :         return atomic_long_add_return(new - old, &user->pipe_bufs);
     763             : }
     764             : 
     765           0 : bool too_many_pipe_buffers_soft(unsigned long user_bufs)
     766             : {
     767     9016747 :         unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
     768             : 
     769     9016747 :         return soft_limit && user_bufs > soft_limit;
     770             : }
     771             : 
     772           0 : bool too_many_pipe_buffers_hard(unsigned long user_bufs)
     773             : {
     774     9016747 :         unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
     775             : 
     776     9016747 :         return hard_limit && user_bufs > hard_limit;
     777             : }
     778             : 
     779           0 : bool pipe_is_unprivileged_user(void)
     780             : {
     781           0 :         return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
     782             : }
     783             : 
     784     9016050 : struct pipe_inode_info *alloc_pipe_info(void)
     785             : {
     786     9016050 :         struct pipe_inode_info *pipe;
     787     9016050 :         unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
     788     9016050 :         struct user_struct *user = get_current_user();
     789     9014398 :         unsigned long user_bufs;
     790     9014398 :         unsigned int max_size = READ_ONCE(pipe_max_size);
     791             : 
     792     9014398 :         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
     793     9015951 :         if (pipe == NULL)
     794           0 :                 goto out_free_uid;
     795             : 
     796     9015951 :         if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
     797           0 :                 pipe_bufs = max_size >> PAGE_SHIFT;
     798             : 
     799     9015951 :         user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
     800             : 
     801     9016747 :         if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
     802           0 :                 user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
     803           0 :                 pipe_bufs = PIPE_MIN_DEF_BUFFERS;
     804             :         }
     805             : 
     806     9016747 :         if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
     807           0 :                 goto out_revert_acct;
     808             : 
     809     9016747 :         pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
     810             :                              GFP_KERNEL_ACCOUNT);
     811             : 
     812     9016455 :         if (pipe->bufs) {
     813     9016455 :                 init_waitqueue_head(&pipe->rd_wait);
     814     9016551 :                 init_waitqueue_head(&pipe->wr_wait);
     815     9016430 :                 pipe->r_counter = pipe->w_counter = 1;
     816     9016430 :                 pipe->max_usage = pipe_bufs;
     817     9016430 :                 pipe->ring_size = pipe_bufs;
     818     9016430 :                 pipe->nr_accounted = pipe_bufs;
     819     9016430 :                 pipe->user = user;
     820     9016430 :                 mutex_init(&pipe->mutex);
     821     9016430 :                 return pipe;
     822             :         }
     823             : 
     824           0 : out_revert_acct:
     825           0 :         (void) account_pipe_buffers(user, pipe_bufs, 0);
     826           0 :         kfree(pipe);
     827           0 : out_free_uid:
     828           0 :         free_uid(user);
     829           0 :         return NULL;
     830             : }
     831             : 
     832     9016376 : void free_pipe_info(struct pipe_inode_info *pipe)
     833             : {
     834     9016376 :         unsigned int i;
     835             : 
     836             : #ifdef CONFIG_WATCH_QUEUE
     837             :         if (pipe->watch_queue)
     838             :                 watch_queue_clear(pipe->watch_queue);
     839             : #endif
     840             : 
     841     9016376 :         (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
     842     9016465 :         free_uid(pipe->user);
     843   162276005 :         for (i = 0; i < pipe->ring_size; i++) {
     844   144242973 :                 struct pipe_buffer *buf = pipe->bufs + i;
     845   144242973 :                 if (buf->ops)
     846       32271 :                         pipe_buf_release(pipe, buf);
     847             :         }
     848             : #ifdef CONFIG_WATCH_QUEUE
     849             :         if (pipe->watch_queue)
     850             :                 put_watch_queue(pipe->watch_queue);
     851             : #endif
     852     9016567 :         if (pipe->tmp_page)
     853     5833399 :                 __free_page(pipe->tmp_page);
     854     9015632 :         kfree(pipe->bufs);
     855     9015969 :         kfree(pipe);
     856     9016253 : }
     857             : 
     858             : static struct vfsmount *pipe_mnt __read_mostly;
     859             : 
     860             : /*
     861             :  * pipefs_dname() is called from d_path().
     862             :  */
     863           0 : static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
     864             : {
     865           0 :         return dynamic_dname(buffer, buflen, "pipe:[%lu]",
     866             :                                 d_inode(dentry)->i_ino);
     867             : }
     868             : 
     869             : static const struct dentry_operations pipefs_dentry_operations = {
     870             :         .d_dname        = pipefs_dname,
     871             : };
     872             : 
     873     8976746 : static struct inode * get_pipe_inode(void)
     874             : {
     875     8976746 :         struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
     876     8978288 :         struct pipe_inode_info *pipe;
     877             : 
     878     8978288 :         if (!inode)
     879           0 :                 goto fail_inode;
     880             : 
     881     8978288 :         inode->i_ino = get_next_ino();
     882             : 
     883     8977730 :         pipe = alloc_pipe_info();
     884     8978479 :         if (!pipe)
     885           0 :                 goto fail_iput;
     886             : 
     887     8978479 :         inode->i_pipe = pipe;
     888     8978479 :         pipe->files = 2;
     889     8978479 :         pipe->readers = pipe->writers = 1;
     890     8978479 :         inode->i_fop = &pipefifo_fops;
     891             : 
     892             :         /*
     893             :          * Mark the inode dirty from the very beginning,
     894             :          * that way it will never be moved to the dirty
     895             :          * list because "mark_inode_dirty()" will think
     896             :          * that it already _is_ on the dirty list.
     897             :          */
     898     8978479 :         inode->i_state = I_DIRTY;
     899     8978479 :         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
     900     8978479 :         inode->i_uid = current_fsuid();
     901     8978479 :         inode->i_gid = current_fsgid();
     902     8978479 :         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
     903             : 
     904     8978314 :         return inode;
     905             : 
     906             : fail_iput:
     907           0 :         iput(inode);
     908             : 
     909             : fail_inode:
     910             :         return NULL;
     911             : }
     912             : 
     913     8976135 : int create_pipe_files(struct file **res, int flags)
     914             : {
     915     8976135 :         struct inode *inode = get_pipe_inode();
     916     8978423 :         struct file *f;
     917     8978423 :         int error;
     918             : 
     919     8978423 :         if (!inode)
     920             :                 return -ENFILE;
     921             : 
     922     8978423 :         if (flags & O_NOTIFICATION_PIPE) {
     923           0 :                 error = watch_queue_init(inode->i_pipe);
     924           0 :                 if (error) {
     925           0 :                         free_pipe_info(inode->i_pipe);
     926           0 :                         iput(inode);
     927           0 :                         return error;
     928             :                 }
     929             :         }
     930             : 
     931     8978423 :         f = alloc_file_pseudo(inode, pipe_mnt, "",
     932     8978423 :                                 O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
     933             :                                 &pipefifo_fops);
     934     8979311 :         if (IS_ERR(f)) {
     935           0 :                 free_pipe_info(inode->i_pipe);
     936           0 :                 iput(inode);
     937           0 :                 return PTR_ERR(f);
     938             :         }
     939             : 
     940     8979311 :         f->private_data = inode->i_pipe;
     941             : 
     942     8979311 :         res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
     943             :                                   &pipefifo_fops);
     944     8978817 :         if (IS_ERR(res[0])) {
     945           0 :                 put_pipe_info(inode, inode->i_pipe);
     946           0 :                 fput(f);
     947           0 :                 return PTR_ERR(res[0]);
     948             :         }
     949     8978817 :         res[0]->private_data = inode->i_pipe;
     950     8978817 :         res[1] = f;
     951     8978817 :         stream_open(inode, res[0]);
     952     8978946 :         stream_open(inode, res[1]);
     953     8978946 :         return 0;
     954             : }
     955             : 
     956     8976899 : static int __do_pipe_flags(int *fd, struct file **files, int flags)
     957             : {
     958     8976899 :         int error;
     959     8976899 :         int fdw, fdr;
     960             : 
     961     8976899 :         if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
     962             :                 return -EINVAL;
     963             : 
     964     8976899 :         error = create_pipe_files(files, flags);
     965     8980061 :         if (error)
     966             :                 return error;
     967             : 
     968     8980041 :         error = get_unused_fd_flags(flags);
     969     8978746 :         if (error < 0)
     970           0 :                 goto err_read_pipe;
     971     8978746 :         fdr = error;
     972             : 
     973     8978746 :         error = get_unused_fd_flags(flags);
     974     8978722 :         if (error < 0)
     975           0 :                 goto err_fdr;
     976     8978722 :         fdw = error;
     977             : 
     978     8978722 :         audit_fd_pair(fdr, fdw);
     979     8978722 :         fd[0] = fdr;
     980     8978722 :         fd[1] = fdw;
     981             :         /* pipe groks IOCB_NOWAIT */
     982     8978722 :         files[0]->f_mode |= FMODE_NOWAIT;
     983     8978722 :         files[1]->f_mode |= FMODE_NOWAIT;
     984     8978722 :         return 0;
     985             : 
     986             :  err_fdr:
     987           0 :         put_unused_fd(fdr);
     988           0 :  err_read_pipe:
     989           0 :         fput(files[0]);
     990           0 :         fput(files[1]);
     991           0 :         return error;
     992             : }
     993             : 
     994           0 : int do_pipe_flags(int *fd, int flags)
     995             : {
     996           0 :         struct file *files[2];
     997           0 :         int error = __do_pipe_flags(fd, files, flags);
     998           0 :         if (!error) {
     999           0 :                 fd_install(fd[0], files[0]);
    1000           0 :                 fd_install(fd[1], files[1]);
    1001             :         }
    1002           0 :         return error;
    1003             : }
    1004             : 
    1005             : /*
    1006             :  * sys_pipe() uses the normal C calling convention for creating
    1007             :  * a pipe. It's not the way Unix traditionally does this, though.
    1008             :  */
    1009     8976655 : static int do_pipe2(int __user *fildes, int flags)
    1010             : {
    1011     8976655 :         struct file *files[2];
    1012     8976655 :         int fd[2];
    1013     8976655 :         int error;
    1014             : 
    1015     8976655 :         error = __do_pipe_flags(fd, files, flags);
    1016     8978686 :         if (!error) {
    1017     8978277 :                 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
    1018           0 :                         fput(files[0]);
    1019           0 :                         fput(files[1]);
    1020           0 :                         put_unused_fd(fd[0]);
    1021           0 :                         put_unused_fd(fd[1]);
    1022           0 :                         error = -EFAULT;
    1023             :                 } else {
    1024     8977480 :                         fd_install(fd[0], files[0]);
    1025     8978003 :                         fd_install(fd[1], files[1]);
    1026             :                 }
    1027             :         }
    1028     8978892 :         return error;
    1029             : }
    1030             : 
    1031    17956330 : SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
    1032             : {
    1033     8977202 :         return do_pipe2(fildes, flags);
    1034             : }
    1035             : 
    1036           0 : SYSCALL_DEFINE1(pipe, int __user *, fildes)
    1037             : {
    1038           0 :         return do_pipe2(fildes, 0);
    1039             : }
    1040             : 
    1041             : /*
    1042             :  * This is the stupid "wait for pipe to be readable or writable"
    1043             :  * model.
    1044             :  *
    1045             :  * See pipe_read/write() for the proper kind of exclusive wait,
    1046             :  * but that requires that we wake up any other readers/writers
    1047             :  * if we then do not end up reading everything (ie the whole
    1048             :  * "wake_next_reader/writer" logic in pipe_read/write()).
    1049             :  */
    1050       39989 : void pipe_wait_readable(struct pipe_inode_info *pipe)
    1051             : {
    1052       39989 :         pipe_unlock(pipe);
    1053      159959 :         wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
    1054       39989 :         pipe_lock(pipe);
    1055       39989 : }
    1056             : 
    1057           0 : void pipe_wait_writable(struct pipe_inode_info *pipe)
    1058             : {
    1059           0 :         pipe_unlock(pipe);
    1060           0 :         wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
    1061           0 :         pipe_lock(pipe);
    1062           0 : }
    1063             : 
    1064             : /*
    1065             :  * This depends on both the wait (here) and the wakeup (wake_up_partner)
    1066             :  * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
    1067             :  * race with the count check and waitqueue prep.
    1068             :  *
    1069             :  * Normally in order to avoid races, you'd do the prepare_to_wait() first,
    1070             :  * then check the condition you're waiting for, and only then sleep. But
    1071             :  * because of the pipe lock, we can check the condition before being on
    1072             :  * the wait queue.
    1073             :  *
    1074             :  * We use the 'rd_wait' waitqueue for pipe partner waiting.
    1075             :  */
    1076           0 : static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
    1077             : {
    1078           0 :         DEFINE_WAIT(rdwait);
    1079           0 :         int cur = *cnt;
    1080             : 
    1081           0 :         while (cur == *cnt) {
    1082           0 :                 prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
    1083           0 :                 pipe_unlock(pipe);
    1084           0 :                 schedule();
    1085           0 :                 finish_wait(&pipe->rd_wait, &rdwait);
    1086           0 :                 pipe_lock(pipe);
    1087           0 :                 if (signal_pending(current))
    1088             :                         break;
    1089             :         }
    1090           0 :         return cur == *cnt ? -ERESTARTSYS : 0;
    1091             : }
    1092             : 
    1093             : static void wake_up_partner(struct pipe_inode_info *pipe)
    1094             : {
    1095       11160 :         wake_up_interruptible_all(&pipe->rd_wait);
    1096       11160 : }
    1097             : 
    1098      577923 : static int fifo_open(struct inode *inode, struct file *filp) /* ->open handler in pipefifo_fops: bind filp to the inode's pipe; returns 0 or a negative errno */
    1099             : {
    1100      577923 :         struct pipe_inode_info *pipe;
    1101      577923 :         bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; /* true when the inode's superblock carries the pipefs magic */
    1102      577923 :         int ret;
    1103             : 
    1104      577923 :         filp->f_version = 0;
    1105             : 
    1106      577923 :         spin_lock(&inode->i_lock);
    1107      577923 :         if (inode->i_pipe) { /* a pipe is already attached: take one more file reference */
    1108      572225 :                 pipe = inode->i_pipe;
    1109      572225 :                 pipe->files++;
    1110      572225 :                 spin_unlock(&inode->i_lock);
    1111             :         } else {
    1112        5698 :                 spin_unlock(&inode->i_lock); /* the allocation below runs with i_lock dropped */
    1113        5698 :                 pipe = alloc_pipe_info();
    1114        5698 :                 if (!pipe)
    1115             :                         return -ENOMEM;
    1116        5698 :                 pipe->files = 1;
    1117        5698 :                 spin_lock(&inode->i_lock);
    1118        5698 :                 if (unlikely(inode->i_pipe)) { /* another opener attached a pipe while i_lock was dropped */
    1119           0 :                         inode->i_pipe->files++;
    1120           0 :                         spin_unlock(&inode->i_lock);
    1121           0 :                         free_pipe_info(pipe); /* discard our unused allocation */
    1122           0 :                         pipe = inode->i_pipe;
    1123             :                 } else {
    1124        5698 :                         inode->i_pipe = pipe;
    1125        5698 :                         spin_unlock(&inode->i_lock);
    1126             :                 }
    1127             :         }
    1128      577923 :         filp->private_data = pipe;
    1129             :         /* OK, we have a pipe and it's pinned down */
    1130             : 
    1131      577923 :         __pipe_lock(pipe);
    1132             : 
    1133             :         /* We can only do regular read/write on fifos */
    1134      577923 :         stream_open(inode, filp);
    1135             : 
    1136      577923 :         switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
    1137      572170 :         case FMODE_READ:
    1138             :         /*
    1139             :          *  O_RDONLY
    1140             :          *  POSIX.1 says that O_NONBLOCK means return with the FIFO
    1141             :          *  opened, even when there is no process writing the FIFO.
    1142             :          */
    1143      572170 :                 pipe->r_counter++;
    1144      572170 :                 if (pipe->readers++ == 0) /* first reader */
    1145        5684 :                         wake_up_partner(pipe);
    1146             : 
    1147      572170 :                 if (!is_pipe && !pipe->writers) { /* not on pipefs and no writer yet */
    1148        5684 :                         if ((filp->f_flags & O_NONBLOCK)) {
    1149             :                                 /* suppress EPOLLHUP until we have
    1150             :                                  * seen a writer */
    1151        5684 :                                 filp->f_version = pipe->w_counter;
    1152             :                         } else {
    1153           0 :                                 if (wait_for_partner(pipe, &pipe->w_counter)) /* non-zero result is reported as -ERESTARTSYS at err_rd */
    1154           0 :                                         goto err_rd;
    1155             :                         }
    1156             :                 }
    1157             :                 break;
    1158             : 
    1159        5557 :         case FMODE_WRITE:
    1160             :         /*
    1161             :          *  O_WRONLY
    1162             :          *  POSIX.1 says that O_NONBLOCK means return -1 with
    1163             :          *  errno=ENXIO when there is no process reading the FIFO.
    1164             :          */
    1165        5557 :                 ret = -ENXIO; /* preset for the goto err just below */
    1166        5557 :                 if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
    1167           0 :                         goto err;
    1168             : 
    1169        5557 :                 pipe->w_counter++;
    1170        5557 :                 if (!pipe->writers++) /* first writer */
    1171        5462 :                         wake_up_partner(pipe);
    1172             : 
    1173        5557 :                 if (!is_pipe && !pipe->readers) {
    1174           0 :                         if (wait_for_partner(pipe, &pipe->r_counter)) /* non-zero result is reported as -ERESTARTSYS at err_wr */
    1175           0 :                                 goto err_wr;
    1176             :                 }
    1177             :                 break;
    1178             : 
    1179         196 :         case FMODE_READ | FMODE_WRITE:
    1180             :         /*
    1181             :          *  O_RDWR
    1182             :          *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
    1183             :          *  This implementation will NEVER block on an O_RDWR open, since
    1184             :          *  the process can at least talk to itself.
    1185             :          */
    1186             : 
    1187         196 :                 pipe->readers++;
    1188         196 :                 pipe->writers++;
    1189         196 :                 pipe->r_counter++;
    1190         196 :                 pipe->w_counter++;
    1191         196 :                 if (pipe->readers == 1 || pipe->writers == 1) /* we are the first reader or the first writer */
    1192          14 :                         wake_up_partner(pipe);
    1193             :                 break;
    1194             : 
    1195           0 :         default:
    1196           0 :                 ret = -EINVAL; /* neither FMODE_READ nor FMODE_WRITE is set */
    1197           0 :                 goto err;
    1198             :         }
    1199             : 
    1200             :         /* Ok! */
    1201      577923 :         __pipe_unlock(pipe);
    1202      577923 :         return 0;
    1203             : 
    1204             : err_rd:
    1205           0 :         if (!--pipe->readers) /* undo readers++; if no reader is left, wake wr_wait */
    1206           0 :                 wake_up_interruptible(&pipe->wr_wait);
    1207           0 :         ret = -ERESTARTSYS;
    1208           0 :         goto err;
    1209             : 
    1210             : err_wr:
    1211           0 :         if (!--pipe->writers) /* undo writers++; if no writer is left, wake rd_wait */
    1212           0 :                 wake_up_interruptible_all(&pipe->rd_wait);
    1213           0 :         ret = -ERESTARTSYS;
    1214           0 :         goto err;
    1215             : 
    1216           0 : err:
    1217           0 :         __pipe_unlock(pipe);
    1218             : 
    1219           0 :         put_pipe_info(inode, pipe); /* NOTE(review): presumably drops the files reference taken at the top - confirm against put_pipe_info() */
    1220           0 :         return ret;
    1221             : }
    1222             : 
    1223             : const struct file_operations pipefifo_fops = { /* get_pipe_info() recognises pipe files by comparing f_op against this table */
    1224             :         .open           = fifo_open,
    1225             :         .llseek         = no_llseek,
    1226             :         .read_iter      = pipe_read,
    1227             :         .write_iter     = pipe_write,
    1228             :         .poll           = pipe_poll,
    1229             :         .unlocked_ioctl = pipe_ioctl,
    1230             :         .release        = pipe_release,
    1231             :         .fasync         = pipe_fasync,
    1232             :         .splice_write   = iter_file_splice_write,
    1233             : };
    1234             : 
    1235             : /*
    1236             :  * Currently we rely on the pipe array holding a power-of-2 number
    1237             :  * of pages. Returns the rounded size in bytes, or 0 on error.
    1238             :  */
    1239           0 : unsigned int round_pipe_size(unsigned long size)
    1240             : {
    1241           0 :         if (size > (1U << 31)) /* requests above 2^31 bytes are rejected */
    1242             :                 return 0;
    1243             : 
    1244             :         /* Minimum pipe size, as required by POSIX */
    1245           0 :         if (size < PAGE_SIZE)
    1246             :                 return PAGE_SIZE;
    1247             : 
    1248           0 :         return roundup_pow_of_two(size);
    1249             : }
    1250             : 
    1251             : /*
    1252             :  * Resize the pipe ring to a number of slots.
    1253             :  *
    1254             :  * Note the pipe can be reduced in capacity, but only if the current
    1255             :  * occupancy doesn't exceed nr_slots; if it does, -EBUSY will be
    1256             :  * returned instead. Returns 0 on success, -ENOMEM if allocation fails.
    1257             :  */
    1258           0 : int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
    1259             : {
    1260           0 :         struct pipe_buffer *bufs;
    1261           0 :         unsigned int head, tail, mask, n;
    1262             : 
    1263           0 :         bufs = kcalloc(nr_slots, sizeof(*bufs), /* new array is allocated before the spinlock is taken */
    1264             :                        GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
    1265           0 :         if (unlikely(!bufs))
    1266             :                 return -ENOMEM;
    1267             : 
    1268           0 :         spin_lock_irq(&pipe->rd_wait.lock); /* head, tail, bufs and ring_size are swapped under this lock */
    1269           0 :         mask = pipe->ring_size - 1;
    1270           0 :         head = pipe->head;
    1271           0 :         tail = pipe->tail;
    1272             : 
    1273           0 :         n = pipe_occupancy(head, tail); /* number of slots currently in use */
    1274           0 :         if (nr_slots < n) { /* current contents would not fit in the new ring */
    1275           0 :                 spin_unlock_irq(&pipe->rd_wait.lock);
    1276           0 :                 kfree(bufs);
    1277           0 :                 return -EBUSY;
    1278             :         }
    1279             : 
    1280             :         /*
    1281             :          * The pipe array wraps around, so just start the new one at zero
    1282             :          * and adjust the indices.
    1283             :          */
    1284           0 :         if (n > 0) {
    1285           0 :                 unsigned int h = head & mask; /* head position inside the old array */
    1286           0 :                 unsigned int t = tail & mask; /* tail position inside the old array */
    1287           0 :                 if (h > t) { /* used slots are contiguous: one copy starting at t */
    1288           0 :                         memcpy(bufs, pipe->bufs + t,
    1289             :                                n * sizeof(struct pipe_buffer));
    1290             :                 } else { /* used slots wrap past the end of the old array */
    1291           0 :                         unsigned int tsize = pipe->ring_size - t; /* slots from t to the end of the old array */
    1292           0 :                         if (h > 0)
    1293           0 :                                 memcpy(bufs + tsize, pipe->bufs, /* wrapped part [0, h) goes after the tail part */
    1294             :                                        h * sizeof(struct pipe_buffer));
    1295           0 :                         memcpy(bufs, pipe->bufs + t, /* tail part [t, ring_size) goes to the front */
    1296             :                                tsize * sizeof(struct pipe_buffer));
    1297             :                 }
    1298             :         }
    1299             : 
    1300           0 :         head = n; /* contents now occupy slots [0, n) of the new array */
    1301           0 :         tail = 0;
    1302             : 
    1303           0 :         kfree(pipe->bufs);
    1304           0 :         pipe->bufs = bufs;
    1305           0 :         pipe->ring_size = nr_slots;
    1306           0 :         if (pipe->max_usage > nr_slots) /* clamp max_usage to the new ring size */
    1307           0 :                 pipe->max_usage = nr_slots;
    1308           0 :         pipe->tail = tail;
    1309           0 :         pipe->head = head;
    1310             : 
    1311           0 :         spin_unlock_irq(&pipe->rd_wait.lock);
    1312             : 
    1313             :         /* This might have made more room for writers */
    1314           0 :         wake_up_interruptible(&pipe->wr_wait);
    1315           0 :         return 0;
    1316             : }
    1317             : 
    1318             : /*
    1319             :  * Allocate a new array of pipe buffers and copy the info over. Returns the
    1320             :  * pipe size in bytes if successful, or a negative errno on error.
    1321             :  */
    1322           0 : static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
    1323             : {
    1324           0 :         unsigned long user_bufs;
    1325           0 :         unsigned int nr_slots, size;
    1326           0 :         long ret = 0;
    1327             : 
    1328             : #ifdef CONFIG_WATCH_QUEUE
    1329             :         if (pipe->watch_queue) /* pipes carrying a watch queue cannot be resized here */
    1330             :                 return -EBUSY;
    1331             : #endif
    1332             : 
    1333           0 :         size = round_pipe_size(arg); /* 0 when arg is too large */
    1334           0 :         nr_slots = size >> PAGE_SHIFT; /* bytes -> page-sized slots */
    1335             : 
    1336           0 :         if (!nr_slots)
    1337             :                 return -EINVAL;
    1338             : 
    1339             :         /*
    1340             :          * If trying to increase the pipe capacity, check that an
    1341             :          * unprivileged user is not trying to exceed various limits
    1342             :          * (soft limit check here, hard limit check just below).
    1343             :          * Decreasing the pipe capacity is always permitted, even
    1344             :          * if the user is currently over a limit.
    1345             :          */
    1346           0 :         if (nr_slots > pipe->max_usage &&
    1347           0 :                         size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
    1348             :                 return -EPERM;
    1349             : 
    1350           0 :         user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); /* NOTE(review): presumably swaps the user's charge from nr_accounted to nr_slots - confirm */
    1351             : 
    1352           0 :         if (nr_slots > pipe->max_usage &&
    1353           0 :                         (too_many_pipe_buffers_hard(user_bufs) ||
    1354           0 :                          too_many_pipe_buffers_soft(user_bufs)) &&
    1355           0 :                         pipe_is_unprivileged_user()) {
    1356           0 :                 ret = -EPERM;
    1357           0 :                 goto out_revert_acct;
    1358             :         }
    1359             : 
    1360           0 :         ret = pipe_resize_ring(pipe, nr_slots);
    1361           0 :         if (ret < 0)
    1362           0 :                 goto out_revert_acct;
    1363             : 
    1364           0 :         pipe->max_usage = nr_slots;
    1365           0 :         pipe->nr_accounted = nr_slots;
    1366           0 :         return pipe->max_usage * PAGE_SIZE; /* new capacity in bytes */
    1367             : 
    1368           0 : out_revert_acct:
    1369           0 :         (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); /* same call as above with old/new swapped, to undo it */
    1370           0 :         return ret;
    1371             : }
    1372             : 
    1373             : /*
    1374             :  * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
    1375             :  * not enough to verify that this is a pipe. Returns the pipe, or NULL.
    1376             :  */
    1377     7941010 : struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
    1378             : {
    1379     7948471 :         struct pipe_inode_info *pipe = file->private_data;
    1380             : 
    1381     7948471 :         if (file->f_op != &pipefifo_fops || !pipe) /* file does not use the pipe fops, or has no pipe attached */
    1382     3969224 :                 return NULL;
    1383             : #ifdef CONFIG_WATCH_QUEUE
    1384             :         if (for_splice && pipe->watch_queue) /* splice callers are refused pipes that carry a watch queue */
    1385             :                 return NULL;
    1386             : #endif
    1387             :         return pipe;
    1388             : }
    1389             : 
    1390        7461 : long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) /* F_SETPIPE_SZ / F_GETPIPE_SZ on a pipe file; returns a size in bytes or a negative errno */
    1391             : {
    1392        7461 :         struct pipe_inode_info *pipe;
    1393        7461 :         long ret;
    1394             : 
    1395        7461 :         pipe = get_pipe_info(file, false);
    1396        7461 :         if (!pipe) /* file is not a pipe */
    1397             :                 return -EBADF;
    1398             : 
    1399        7461 :         __pipe_lock(pipe);
    1400             : 
    1401        7461 :         switch (cmd) {
    1402           0 :         case F_SETPIPE_SZ:
    1403           0 :                 ret = pipe_set_size(pipe, arg);
    1404           0 :                 break;
    1405        7461 :         case F_GETPIPE_SZ:
    1406        7461 :                 ret = pipe->max_usage * PAGE_SIZE; /* max_usage is counted in slots; report bytes */
    1407        7461 :                 break;
    1408             :         default:
    1409             :                 ret = -EINVAL;
    1410             :                 break;
    1411             :         }
    1412             : 
    1413        7461 :         __pipe_unlock(pipe);
    1414        7461 :         return ret;
    1415             : }
    1416             : 
    1417             : static const struct super_operations pipefs_ops = { /* installed as ctx->ops by pipefs_init_fs_context() */
    1418             :         .destroy_inode = free_inode_nonrcu,
    1419             :         .statfs = simple_statfs,
    1420             : };
    1421             : 
    1422             : /*
    1423             :  * pipefs should _never_ be mounted by userland - too much of a security hassle,
    1424             :  * no real gain from having the whole thing mounted. So we don't need
    1425             :  * any operations on the root directory. However, we need a non-trivial
    1426             :  * d_name - pipe: will go nicely and kill the special-casing in procfs.
    1427             :  */
    1428             : 
    1429           0 : static int pipefs_init_fs_context(struct fs_context *fc)
    1430             : {
    1431           0 :         struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
    1432           0 :         if (!ctx) /* init_pseudo() failure is reported as -ENOMEM */
    1433             :                 return -ENOMEM;
    1434           0 :         ctx->ops = &pipefs_ops;
    1435           0 :         ctx->dops = &pipefs_dentry_operations;
    1436           0 :         return 0;
    1437             : }
    1438             : 
    1439             : static struct file_system_type pipe_fs_type = { /* registered and kern_mount()ed by init_pipe_fs() */
    1440             :         .name           = "pipefs",
    1441             :         .init_fs_context = pipefs_init_fs_context,
    1442             :         .kill_sb        = kill_anon_super,
    1443             : };
    1444             : 
    1445             : #ifdef CONFIG_SYSCTL
    1446           0 : static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, /* conversion callback passed to do_proc_douintvec() by proc_dopipe_max_size() */
    1447             :                                         unsigned int *valp,
    1448             :                                         int write, void *data)
    1449             : {
    1450           0 :         if (write) {
    1451           0 :                 unsigned int val;
    1452             : 
    1453           0 :                 val = round_pipe_size(*lvalp); /* written value is stored rounded, not verbatim */
    1454           0 :                 if (val == 0) /* round_pipe_size() signals failure with 0 */
    1455             :                         return -EINVAL;
    1456             : 
    1457           0 :                 *valp = val;
    1458             :         } else {
    1459           0 :                 unsigned int val = *valp;
    1460           0 :                 *lvalp = (unsigned long) val; /* read path: widen the stored value unchanged */
    1461             :         }
    1462             : 
    1463             :         return 0;
    1464             : }
    1465             : 
    1466           0 : static int proc_dopipe_max_size(struct ctl_table *table, int write, /* proc_handler of the "pipe-max-size" entry in fs_pipe_sysctls */
    1467             :                                 void *buffer, size_t *lenp, loff_t *ppos)
    1468             : {
    1469           0 :         return do_proc_douintvec(table, write, buffer, lenp, ppos,
    1470             :                                  do_proc_dopipe_max_size_conv, NULL);
    1471             : }
    1472             : 
    1473             : static struct ctl_table fs_pipe_sysctls[] = { /* registered under "fs" by init_pipe_fs() */
    1474             :         {
    1475             :                 .procname       = "pipe-max-size",
    1476             :                 .data           = &pipe_max_size,
    1477             :                 .maxlen         = sizeof(pipe_max_size),
    1478             :                 .mode           = 0644,
    1479             :                 .proc_handler   = proc_dopipe_max_size, /* custom handler: writes go through round_pipe_size() */
    1480             :         },
    1481             :         {
    1482             :                 .procname       = "pipe-user-pages-hard",
    1483             :                 .data           = &pipe_user_pages_hard,
    1484             :                 .maxlen         = sizeof(pipe_user_pages_hard),
    1485             :                 .mode           = 0644,
    1486             :                 .proc_handler   = proc_doulongvec_minmax,
    1487             :         },
    1488             :         {
    1489             :                 .procname       = "pipe-user-pages-soft",
    1490             :                 .data           = &pipe_user_pages_soft,
    1491             :                 .maxlen         = sizeof(pipe_user_pages_soft),
    1492             :                 .mode           = 0644,
    1493             :                 .proc_handler   = proc_doulongvec_minmax,
    1494             :         },
    1495             :         { } /* empty terminating entry */
    1496             : };
    1497             : #endif
    1498             : 
    1499           0 : static int __init init_pipe_fs(void) /* registers pipefs, creates its kernel mount and registers the pipe sysctls; returns 0 or a negative errno */
    1500             : {
    1501           0 :         int err = register_filesystem(&pipe_fs_type);
    1502             : 
    1503           0 :         if (!err) {
    1504           0 :                 pipe_mnt = kern_mount(&pipe_fs_type);
    1505           0 :                 if (IS_ERR(pipe_mnt)) { /* mount failed: undo the registration */
    1506           0 :                         err = PTR_ERR(pipe_mnt);
    1507           0 :                         unregister_filesystem(&pipe_fs_type);
    1508             :                 }
    1509             :         }
    1510             : #ifdef CONFIG_SYSCTL
    1511           0 :         register_sysctl_init("fs", fs_pipe_sysctls); /* runs regardless of err */
    1512             : #endif
    1513           0 :         return err;
    1514             : }
    1515             : 
    1516             : fs_initcall(init_pipe_fs);

Generated by: LCOV version 1.14