LCOV - code coverage report
Current view:  top level - fs - eventpoll.c  (source / functions)
Test:          fstests of 6.5.0-rc4-xfsa @ Mon Jul 31 20:08:27 PDT 2023
Date:          2023-07-31 20:08:27
Coverage:      Lines:     637 of 807  (78.9 %)
               Functions:  49 of  60  (81.7 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  *  fs/eventpoll.c (Efficient event retrieval implementation)
       4             :  *  Copyright (C) 2001,...,2009  Davide Libenzi
       5             :  *
       6             :  *  Davide Libenzi <davidel@xmailserver.org>
       7             :  */
       8             : 
       9             : #include <linux/init.h>
      10             : #include <linux/kernel.h>
      11             : #include <linux/sched/signal.h>
      12             : #include <linux/fs.h>
      13             : #include <linux/file.h>
      14             : #include <linux/signal.h>
      15             : #include <linux/errno.h>
      16             : #include <linux/mm.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/poll.h>
      19             : #include <linux/string.h>
      20             : #include <linux/list.h>
      21             : #include <linux/hash.h>
      22             : #include <linux/spinlock.h>
      23             : #include <linux/syscalls.h>
      24             : #include <linux/rbtree.h>
      25             : #include <linux/wait.h>
      26             : #include <linux/eventpoll.h>
      27             : #include <linux/mount.h>
      28             : #include <linux/bitops.h>
      29             : #include <linux/mutex.h>
      30             : #include <linux/anon_inodes.h>
      31             : #include <linux/device.h>
      32             : #include <linux/uaccess.h>
      33             : #include <asm/io.h>
      34             : #include <asm/mman.h>
      35             : #include <linux/atomic.h>
      36             : #include <linux/proc_fs.h>
      37             : #include <linux/seq_file.h>
      38             : #include <linux/compat.h>
      39             : #include <linux/rculist.h>
      40             : #include <net/busy_poll.h>
      41             : 
      42             : /*
      43             :  * LOCKING:
      44             :  * There are three levels of locking required by epoll:
      45             :  *
      46             :  * 1) epnested_mutex (mutex)
      47             :  * 2) ep->mtx (mutex)
      48             :  * 3) ep->lock (rwlock)
      49             :  *
      50             :  * The acquire order is the one listed above, from 1 to 3.
      51             :  * We need a rwlock (ep->lock) because we manipulate objects
      52             :  * from inside the poll callback, which might be triggered from
      53             :  * a wake_up() that in turn might be called from IRQ context.
      54             :  * So we can't sleep inside the poll callback and hence we need
      55             :  * a spinlock. During the event transfer loop (from kernel to
      56             :  * user space) we could end up sleeping due to a copy_to_user(), so
      57             :  * we need a lock that will allow us to sleep. This lock is a
      58             :  * mutex (ep->mtx). It is acquired during the event transfer loop,
      59             :  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
      60             :  * The epnested_mutex is acquired when inserting an epoll fd onto another
      61             :  * epoll fd. We do this so that we walk the epoll tree and ensure that this
      62             :  * insertion does not create a cycle of epoll file descriptors, which
      63             :  * could lead to deadlock. We need a global mutex to prevent two
      64             :  * simultaneous inserts (A into B and B into A) from racing and
      65             :  * constructing a cycle without either insert observing that it is
      66             :  * going to.
      67             :  * It is necessary to acquire multiple "ep->mtx"es at once in the
      68             :  * case when one epoll fd is added to another. In this case, we
      69             :  * always acquire the locks in the order of nesting (i.e. after
      70             :  * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
      71             :  * before e2->mtx). Since we disallow cycles of epoll file
      72             :  * descriptors, this ensures that the mutexes are well-ordered. In
      73             :  * order to communicate this nesting to lockdep, when walking a tree
      74             :  * of epoll file descriptors, we use the current recursion depth as
      75             :  * the lockdep subkey.
      76             :  * It is possible to drop the "ep->mtx" and to use the global
      77             :  * mutex "epnested_mutex" (together with "ep->lock") to have it working,
      78             :  * but having "ep->mtx" will make the interface more scalable.
      79             :  * Events that require holding "epnested_mutex" are very rare, while for
      80             :  * normal operations the epoll private "ep->mtx" will guarantee
      81             :  * better scalability.
      82             :  */
      83             : 
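A minimal user-space sketch of the nested-insert case the comment above describes (illustrative only, not part of this file; error handling omitted). Adding one epoll fd to another is the operation that runs the loop check under epnested_mutex and then takes the two "ep->mtx" locks in nesting order; attempting to close the loop in the other direction is rejected.

    /*
     * Sketch, assuming a Linux system with <sys/epoll.h>. Adding e2 to e1
     * is the EPOLL_CTL_ADD path that takes epnested_mutex and acquires
     * e1->mtx before e2->mtx, as described in the comment above.
     */
    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int e1 = epoll_create1(0);
            int e2 = epoll_create1(0);
            struct epoll_event ev = { .events = EPOLLIN, .data.fd = e2 };

            epoll_ctl(e1, EPOLL_CTL_ADD, e2, &ev);  /* e1 now watches e2 */
            /*
             * epoll_ctl(e2, EPOLL_CTL_ADD, e1, &ev) would now fail with
             * ELOOP, since cycles of epoll file descriptors are disallowed.
             */
            close(e2);
            close(e1);
            return 0;
    }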
      84             : /* Epoll private bits inside the event mask */
      85             : #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
      86             : 
      87             : #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
      88             : 
      89             : #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
      90             :                                 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
      91             : 
      92             : /* Maximum number of nesting levels allowed inside epoll sets */
      93             : #define EP_MAX_NESTS 4
      94             : 
      95             : #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
      96             : 
      97             : #define EP_UNACTIVE_PTR ((void *) -1L)
      98             : 
      99             : #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
     100             : 
     101             : struct epoll_filefd {
     102             :         struct file *file;
     103             :         int fd;
     104             : } __packed;
     105             : 
     106             : /* Wait structure used by the poll hooks */
     107             : struct eppoll_entry {
     108             :         /* List header used to link this structure to the "struct epitem" */
     109             :         struct eppoll_entry *next;
     110             : 
     111             :         /* The "base" pointer is set to the container "struct epitem" */
     112             :         struct epitem *base;
     113             : 
     114             :         /*
     115             :          * Wait queue item that will be linked to the target file wait
     116             :          * queue head.
     117             :          */
     118             :         wait_queue_entry_t wait;
     119             : 
     120             :         /* The wait queue head that linked the "wait" wait queue item */
     121             :         wait_queue_head_t *whead;
     122             : };
     123             : 
     124             : /*
     125             :  * Each file descriptor added to the eventpoll interface will
     126             :  * have an entry of this type linked to the "rbr" RB tree.
     127             :  * Avoid increasing the size of this struct; there can be many thousands
     128             :  * of these on a server and we do not want this to take another cache line.
     129             :  */
     130             : struct epitem {
     131             :         union {
     132             :                 /* RB tree node links this structure to the eventpoll RB tree */
     133             :                 struct rb_node rbn;
     134             :                 /* Used to free the struct epitem */
     135             :                 struct rcu_head rcu;
     136             :         };
     137             : 
     138             :         /* List header used to link this structure to the eventpoll ready list */
     139             :         struct list_head rdllink;
     140             : 
     141             :         /*
     142             :          * Works together with "struct eventpoll"->ovflist in keeping the
     143             :          * singly linked chain of items.
     144             :          */
     145             :         struct epitem *next;
     146             : 
     147             :         /* The file descriptor information this item refers to */
     148             :         struct epoll_filefd ffd;
     149             : 
     150             :         /*
     151             :          * Protected by file->f_lock, true for to-be-released epitem already
     152             :          * removed from the "struct file" items list; together with
     153             :          * eventpoll->refcount orchestrates "struct eventpoll" disposal
     154             :          */
     155             :         bool dying;
     156             : 
     157             :         /* List containing poll wait queues */
     158             :         struct eppoll_entry *pwqlist;
     159             : 
     160             :         /* The "container" of this item */
     161             :         struct eventpoll *ep;
     162             : 
     163             :         /* List header used to link this item to the "struct file" items list */
     164             :         struct hlist_node fllink;
     165             : 
     166             :         /* wakeup_source used when EPOLLWAKEUP is set */
     167             :         struct wakeup_source __rcu *ws;
     168             : 
     169             :         /* The structure that describes the interested events and the source fd */
     170             :         struct epoll_event event;
     171             : };
     172             : 
     173             : /*
     174             :  * This structure is stored inside the "private_data" member of the file
     175             :  * structure and represents the main data structure for the eventpoll
     176             :  * interface.
     177             :  */
     178             : struct eventpoll {
     179             :         /*
     180             :          * This mutex is used to ensure that files are not removed
     181             :          * while epoll is using them. This is held during the event
     182             :          * collection loop, the file cleanup path, the epoll file exit
     183             :          * code and the ctl operations.
     184             :          */
     185             :         struct mutex mtx;
     186             : 
     187             :         /* Wait queue used by sys_epoll_wait() */
     188             :         wait_queue_head_t wq;
     189             : 
     190             :         /* Wait queue used by file->poll() */
     191             :         wait_queue_head_t poll_wait;
     192             : 
     193             :         /* List of ready file descriptors */
     194             :         struct list_head rdllist;
     195             : 
     196             :         /* Lock which protects rdllist and ovflist */
     197             :         rwlock_t lock;
     198             : 
     199             :         /* RB tree root used to store monitored fd structs */
     200             :         struct rb_root_cached rbr;
     201             : 
     202             :         /*
     203             :          * This is a singly linked list that chains all the "struct epitem" whose
     204             :          * events fired while ready events were being transferred to userspace w/out
     205             :          * holding ->lock.
     206             :          */
     207             :         struct epitem *ovflist;
     208             : 
     209             :         /* wakeup_source used while the ready list is being scanned (ep_start_scan/ep_done_scan) */
     210             :         struct wakeup_source *ws;
     211             : 
     212             :         /* The user that created the eventpoll descriptor */
     213             :         struct user_struct *user;
     214             : 
     215             :         struct file *file;
     216             : 
     217             :         /* used to optimize loop detection check */
     218             :         u64 gen;
     219             :         struct hlist_head refs;
     220             : 
     221             :         /*
     222             :          * usage count, used together with epitem->dying to
     223             :          * orchestrate the disposal of this struct
     224             :          */
     225             :         refcount_t refcount;
     226             : 
     227             : #ifdef CONFIG_NET_RX_BUSY_POLL
     228             :         /* used to track busy poll napi_id */
     229             :         unsigned int napi_id;
     230             : #endif
     231             : 
     232             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     233             :         /* tracks wakeup nests for lockdep validation */
     234             :         u8 nests;
     235             : #endif
     236             : };
     237             : 
     238             : /* Wrapper struct used by poll queueing */
     239             : struct ep_pqueue {
     240             :         poll_table pt;
     241             :         struct epitem *epi;
     242             : };
     243             : 
     244             : /*
     245             :  * Configuration options available inside /proc/sys/fs/epoll/
     246             :  */
     247             : /* Maximum number of epoll watched descriptors, per user */
     248             : static long max_user_watches __read_mostly;
     249             : 
     250             : /* Used for cycles detection */
     251             : static DEFINE_MUTEX(epnested_mutex);
     252             : 
     253             : static u64 loop_check_gen = 0;
     254             : 
     255             : /* Used to check for epoll file descriptor inclusion loops */
     256             : static struct eventpoll *inserting_into;
     257             : 
     258             : /* Slab cache used to allocate "struct epitem" */
     259             : static struct kmem_cache *epi_cache __read_mostly;
     260             : 
     261             : /* Slab cache used to allocate "struct eppoll_entry" */
     262             : static struct kmem_cache *pwq_cache __read_mostly;
     263             : 
     264             : /*
     265             :  * List of files with newly added links, where we may need to limit the number
     266             :  * of emanating paths. Protected by the epnested_mutex.
     267             :  */
     268             : struct epitems_head {
     269             :         struct hlist_head epitems;
     270             :         struct epitems_head *next;
     271             : };
     272             : static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
     273             : 
     274             : static struct kmem_cache *ephead_cache __read_mostly;
     275             : 
     276     1250559 : static inline void free_ephead(struct epitems_head *head)
     277             : {
     278     1250559 :         if (head)
     279      609682 :                 kmem_cache_free(ephead_cache, head);
     280     1250550 : }
     281             : 
     282             : static void list_file(struct file *file)
     283             : {
     284       10052 :         struct epitems_head *head;
     285             : 
     286       10052 :         head = container_of(file->f_ep, struct epitems_head, epitems);
     287       10052 :         if (!head->next) {
     288       10052 :                 head->next = tfile_check_list;
     289       10052 :                 tfile_check_list = head;
     290             :         }
     291             : }
     292             : 
     293       10052 : static void unlist_file(struct epitems_head *head)
     294             : {
     295       10052 :         struct epitems_head *to_free = head;
     296       10052 :         struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
     297       10052 :         if (p) {
     298       10052 :                 struct epitem *epi= container_of(p, struct epitem, fllink);
     299       10052 :                 spin_lock(&epi->ffd.file->f_lock);
     300       10052 :                 if (!hlist_empty(&head->epitems))
     301       10052 :                         to_free = NULL;
     302       10052 :                 head->next = NULL;
     303       10052 :                 spin_unlock(&epi->ffd.file->f_lock);
     304             :         }
     305       10052 :         free_ephead(to_free);
     306       10052 : }
     307             : 
     308             : #ifdef CONFIG_SYSCTL
     309             : 
     310             : #include <linux/sysctl.h>
     311             : 
     312             : static long long_zero;
     313             : static long long_max = LONG_MAX;
     314             : 
     315             : static struct ctl_table epoll_table[] = {
     316             :         {
     317             :                 .procname       = "max_user_watches",
     318             :                 .data           = &max_user_watches,
     319             :                 .maxlen         = sizeof(max_user_watches),
     320             :                 .mode           = 0644,
     321             :                 .proc_handler   = proc_doulongvec_minmax,
     322             :                 .extra1         = &long_zero,
     323             :                 .extra2         = &long_max,
     324             :         },
     325             :         { }
     326             : };
     327             : 
     328             : static void __init epoll_sysctls_init(void)
     329             : {
     330           0 :         register_sysctl("fs/epoll", epoll_table);
     331             : }
     332             : #else
     333             : #define epoll_sysctls_init() do { } while (0)
     334             : #endif /* CONFIG_SYSCTL */
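The table above only exposes max_user_watches; on kernels built with CONFIG_SYSCTL it shows up as /proc/sys/fs/epoll/max_user_watches. A minimal sketch of reading the current per-user limit from user space (illustrative only, not part of this file):

    #include <stdio.h>

    int main(void)
    {
            long max_watches = 0;
            FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");

            if (f && fscanf(f, "%ld", &max_watches) == 1)
                    printf("per-user epoll watch limit: %ld\n", max_watches);
            if (f)
                    fclose(f);
            return 0;
    }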
     335             : 
     336             : static const struct file_operations eventpoll_fops;
     337             : 
     338             : static inline int is_file_epoll(struct file *f)
     339             : {
     340   126747754 :         return f->f_op == &eventpoll_fops;
     341             : }
     342             : 
     343             : /* Setup the structure that is used as key for the RB tree */
     344             : static inline void ep_set_ffd(struct epoll_filefd *ffd,
     345             :                               struct file *file, int fd)
     346             : {
     347      620475 :         ffd->file = file;
     348      620475 :         ffd->fd = fd;
     349             : }
     350             : 
     351             : /* Compare RB tree keys */
     352             : static inline int ep_cmp_ffd(struct epoll_filefd *p1,
     353             :                              struct epoll_filefd *p2)
     354             : {
     355     6691121 :         return (p1->file > p2->file ? +1:
     356     3304002 :                 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
     357             : }
     358             : 
     359             : /* Tells us if the item is currently linked */
     360             : static inline int ep_is_linked(struct epitem *epi)
     361             : {
     362      267531 :         return !list_empty(&epi->rdllink);
     363             : }
     364             : 
     365             : static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
     366             : {
     367             :         return container_of(p, struct eppoll_entry, wait);
     368             : }
     369             : 
     370             : /* Get the "struct epitem" from a wait queue pointer */
     371             : static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
     372             : {
     373   145132077 :         return container_of(p, struct eppoll_entry, wait)->base;
     374             : }
     375             : 
     376             : /**
     377             :  * ep_events_available - Checks if ready events might be available.
     378             :  *
     379             :  * @ep: Pointer to the eventpoll context.
     380             :  *
     381             :  * Return: a value different from %zero if ready events are available,
     382             :  *          or %zero otherwise.
     383             :  */
     384             : static inline int ep_events_available(struct eventpoll *ep)
     385             : {
     386   105980114 :         return !list_empty_careful(&ep->rdllist) ||
     387    30305580 :                 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
     388             : }
     389             : 
     390             : #ifdef CONFIG_NET_RX_BUSY_POLL
     391           0 : static bool ep_busy_loop_end(void *p, unsigned long start_time)
     392             : {
     393           0 :         struct eventpoll *ep = p;
     394             : 
     395           0 :         return ep_events_available(ep) || busy_loop_timeout(start_time);
     396             : }
     397             : 
     398             : /*
     399             :  * Busy poll if busy polling is globally enabled, a supporting socket was
     400             :  * found and there are no events yet; the busy loop returns when
     401             :  * need_resched() or ep_events_available() becomes true.
     402             :  * We must do our busy polling with irqs enabled.
     403             :  */
     404    28706619 : static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     405             : {
     406    28706619 :         unsigned int napi_id = READ_ONCE(ep->napi_id);
     407             : 
     408    28706619 :         if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
     409           0 :                 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
     410             :                                BUSY_POLL_BUDGET);
     411           0 :                 if (ep_events_available(ep))
     412             :                         return true;
     413             :                 /*
     414             :                  * Busy poll timed out.  Drop NAPI ID for now, we can add
     415             :                  * it back in when we have moved a socket with a valid NAPI
     416             :                  * ID onto the ready list.
     417             :                  */
     418           0 :                 ep->napi_id = 0;
     419           0 :                 return false;
     420             :         }
     421             :         return false;
     422             : }
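For this path to do anything, busy polling has to be enabled globally: net_busy_loop_on() consults the net.core.busy_poll sysctl. A hedged illustration of turning it on from user space (the 50-microsecond value is only an example; error handling is minimal):

    #include <stdio.h>

    /* Busy-poll for up to ~50 microseconds in epoll_wait() before sleeping. */
    int main(void)
    {
            FILE *f = fopen("/proc/sys/net/core/busy_poll", "w");

            if (!f)
                    return 1;       /* typically needs root privileges */
            fprintf(f, "50\n");
            fclose(f);
            return 0;
    }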
     423             : 
     424             : /*
     425             :  * Set epoll busy poll NAPI ID from sk.
     426             :  */
     427   145817424 : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     428             : {
     429   145817424 :         struct eventpoll *ep;
     430   145817424 :         unsigned int napi_id;
     431   145817424 :         struct socket *sock;
     432   145817424 :         struct sock *sk;
     433             : 
     434   145817424 :         if (!net_busy_loop_on())
     435             :                 return;
     436             : 
     437           0 :         sock = sock_from_file(epi->ffd.file);
     438           0 :         if (!sock)
     439             :                 return;
     440             : 
     441           0 :         sk = sock->sk;
     442           0 :         if (!sk)
     443             :                 return;
     444             : 
     445           0 :         napi_id = READ_ONCE(sk->sk_napi_id);
     446           0 :         ep = epi->ep;
     447             : 
     448             :         /*
     449             :          * Reject non-NAPI IDs; there is also nothing to do
     450             :          * if we already have this ID.
     451             :          */
     452           0 :         if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
     453             :                 return;
     454             : 
     455             :         /* record NAPI ID for use in next busy poll */
     456           0 :         ep->napi_id = napi_id;
     457             : }
     458             : 
     459             : #else
     460             : 
     461             : static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     462             : {
     463             :         return false;
     464             : }
     465             : 
     466             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     467             : {
     468             : }
     469             : 
     470             : #endif /* CONFIG_NET_RX_BUSY_POLL */
     471             : 
     472             : /*
     473             :  * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
     474             :  * the use of wait queues used by epoll is done in a very controlled
     475             :  * manner. Wake ups can nest inside each other, but are never done
     476             :  * with the same locking. For example:
     477             :  *
     478             :  *   dfd = socket(...);
     479             :  *   efd1 = epoll_create();
     480             :  *   efd2 = epoll_create();
     481             :  *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
     482             :  *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
     483             :  *
     484             :  * When a packet arrives at the device underneath "dfd", the net code will
     485             :  * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
     486             :  * callback wakeup entry on that queue, and the wake_up() performed by the
     487             :  * "dfd" net code will end up in ep_poll_callback(). At this point epoll
     488             :  * (efd1) notices that it may have some event ready, so it needs to wake up
     489             :  * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
     490             :  * that ends up in another wake_up(), after having checked the
     491             :  * recursion constraints. That is, no more than EP_MAX_NESTS, to avoid
     492             :  * stack blasting.
     493             :  *
     494             :  * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
     495             :  * this special case of epoll.
     496             :  */
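To make the scenario above concrete, a user-space continuation of that example might look like the sketch below (illustrative only; it assumes the socket()/epoll_create()/epoll_ctl() calls from the comment already succeeded and that dfd, efd1 and efd2 are those descriptors):

    #include <sys/epoll.h>

    /*
     * When data arrives on dfd, efd1 becomes readable via ep_poll_callback(),
     * and that readiness is in turn reported on efd2 via ep_poll_safewake().
     */
    int wait_nested(int efd1, int efd2)
    {
            struct epoll_event ev;

            if (epoll_wait(efd2, &ev, 1, -1) != 1)  /* woken through efd1 */
                    return -1;
            return epoll_wait(efd1, &ev, 1, 0);     /* reports EPOLLIN for dfd */
    }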
     497             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     498             : 
     499             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
     500             :                              unsigned pollflags)
     501             : {
     502             :         struct eventpoll *ep_src;
     503             :         unsigned long flags;
     504             :         u8 nests = 0;
     505             : 
     506             :         /*
     507             :          * To set the subclass or nesting level for spin_lock_irqsave_nested()
     508             :          * it might be natural to create a per-cpu nest count. However, since
     509             :          * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
     510             :          * schedule() in the -rt kernel, the per-cpu variables are no longer
     511             :          * protected. Thus, we are introducing a per eventpoll nest field.
     512             :          * If we are not being called from ep_poll_callback(), epi is NULL and
     513             :          * we are at the first level of nesting, 0. Otherwise, we are being
     514             :          * called from ep_poll_callback() and if a previous wakeup source is
     515             :          * not an epoll file itself, we are at depth 1 since the wakeup source
     516             :          * is depth 0. If the wakeup source is a previous epoll file in the
     517             :          * wakeup chain then we use its nests value and record ours as
     518             :          * nests + 1. The previous epoll file nests value is stable since it is
     519             :          * already holding its own poll_wait.lock.
     520             :          */
     521             :         if (epi) {
     522             :                 if ((is_file_epoll(epi->ffd.file))) {
     523             :                         ep_src = epi->ffd.file->private_data;
     524             :                         nests = ep_src->nests;
     525             :                 } else {
     526             :                         nests = 1;
     527             :                 }
     528             :         }
     529             :         spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
     530             :         ep->nests = nests + 1;
     531             :         wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
     532             :         ep->nests = 0;
     533             :         spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
     534             : }
     535             : 
     536             : #else
     537             : 
     538             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
     539             :                              __poll_t pollflags)
     540             : {
     541      138096 :         wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
     542      138096 : }
     543             : 
     544             : #endif
     545             : 
     546      463405 : static void ep_remove_wait_queue(struct eppoll_entry *pwq)
     547             : {
     548      463405 :         wait_queue_head_t *whead;
     549             : 
     550      463405 :         rcu_read_lock();
     551             :         /*
     552             :          * If it is cleared by POLLFREE, it should be rcu-safe.
     553             :          * If we read NULL we need a barrier paired with
     554             :          * smp_store_release() in ep_poll_callback(), otherwise
     555             :          * we rely on whead->lock.
     556             :          */
     557      463391 :         whead = smp_load_acquire(&pwq->whead);
     558      463704 :         if (whead)
     559      463704 :                 remove_wait_queue(whead, &pwq->wait);
     560      463691 :         rcu_read_unlock();
     561      463610 : }
     562             : 
     563             : /*
     564             :  * This function unregisters poll callbacks from the associated file
     565             :  * descriptor.  Must be called with "mtx" held.
     566             :  */
     567      687644 : static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
     568             : {
     569      687644 :         struct eppoll_entry **p = &epi->pwqlist;
     570      687644 :         struct eppoll_entry *pwq;
     571             : 
     572     1151168 :         while ((pwq = *p) != NULL) {
     573      463422 :                 *p = pwq->next;
     574      463422 :                 ep_remove_wait_queue(pwq);
     575      463186 :                 kmem_cache_free(pwq_cache, pwq);
     576             :         }
     577      687746 : }
     578             : 
     579             : /* call only when ep->mtx is held */
     580             : static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
     581             : {
     582   120710313 :         return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
     583             : }
     584             : 
     585             : /* call only when ep->mtx is held */
     586             : static inline void ep_pm_stay_awake(struct epitem *epi)
     587             : {
     588    45734407 :         struct wakeup_source *ws = ep_wakeup_source(epi);
     589             : 
     590    45734407 :         if (ws)
     591           0 :                 __pm_stay_awake(ws);
     592             : }
     593             : 
     594             : static inline bool ep_has_wakeup_source(struct epitem *epi)
     595             : {
     596      503353 :         return rcu_access_pointer(epi->ws) ? true : false;
     597             : }
     598             : 
     599             : /* call when ep->mtx cannot be held (ep_poll_callback) */
     600    29277091 : static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
     601             : {
     602    29277091 :         struct wakeup_source *ws;
     603             : 
     604    29277091 :         rcu_read_lock();
     605    29277099 :         ws = rcu_dereference(epi->ws);
     606    29277099 :         if (ws)
     607           0 :                 __pm_stay_awake(ws);
     608    29277099 :         rcu_read_unlock();
     609    29277106 : }
     610             : 
     611             : 
     612             : /*
     613             :  * ep->mtx needs to be held because we could be hit by
     614             :  * eventpoll_release_file() and epoll_ctl().
     615             :  */
     616    74229724 : static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
     617             : {
     618             :         /*
     619             :          * Steal the ready list, and re-init the original one to the
     620             :          * empty list. Also, set ep->ovflist to NULL so that events
     621             :          * happening while looping w/out locks are not lost. We cannot
     622             :          * have the poll callback queue directly on ep->rdllist,
     623             :          * because we want the "sproc" callback to be able to do it
     624             :          * in a lockless way.
     625             :          */
     626    74229724 :         lockdep_assert_irqs_enabled();
     627    74229724 :         write_lock_irq(&ep->lock);
     628    74256373 :         list_splice_init(&ep->rdllist, txlist);
     629    74256373 :         WRITE_ONCE(ep->ovflist, NULL);
     630    74256373 :         write_unlock_irq(&ep->lock);
     631    74218236 : }
     632             : 
     633    74238078 : static void ep_done_scan(struct eventpoll *ep,
     634             :                          struct list_head *txlist)
     635             : {
     636    74238078 :         struct epitem *epi, *nepi;
     637             : 
     638    74238078 :         write_lock_irq(&ep->lock);
     639             :         /*
     640             :          * During the time we spent inside the "sproc" callback, some
     641             :          * other events might have been queued by the poll callback.
     642             :          * We re-insert them inside the main ready-list here.
     643             :          */
     644    74579061 :         for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
     645      283530 :              nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
     646             :                 /*
     647             :                  * We need to check if the item is already in the list.
     648             :                  * During the "sproc" callback execution time, items are
     649             :                  * queued into ->ovflist but the "txlist" might already
     650             :                  * contain them, and the list_splice() below takes care of them.
     651             :                  */
     652      300626 :                 if (!ep_is_linked(epi)) {
     653             :                         /*
     654             :                          * ->ovflist is LIFO, so we have to reverse it in order
     655             :                          * to keep in FIFO.
     656             :                          */
     657       14640 :                         list_add(&epi->rdllink, &ep->rdllist);
     658       14640 :                         ep_pm_stay_awake(epi);
     659             :                 }
     660             :         }
     661             :         /*
     662             :          * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
     663             :          * releasing the lock, events will be queued in the normal way inside
     664             :          * ep->rdllist.
     665             :          */
     666    74278435 :         WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
     667             : 
     668             :         /*
     669             :          * Quickly re-inject items left on "txlist".
     670             :          */
     671    74278435 :         list_splice(txlist, &ep->rdllist);
     672    74278435 :         __pm_relax(ep->ws);
     673             : 
     674    74283727 :         if (!list_empty(&ep->rdllist)) {
     675    45340026 :                 if (waitqueue_active(&ep->wq))
     676           0 :                         wake_up(&ep->wq);
     677             :         }
     678             : 
     679    74283727 :         write_unlock_irq(&ep->lock);
     680    74277394 : }
     681             : 
     682      619993 : static void epi_rcu_free(struct rcu_head *head)
     683             : {
     684      619993 :         struct epitem *epi = container_of(head, struct epitem, rcu);
     685      619993 :         kmem_cache_free(epi_cache, epi);
     686      619985 : }
     687             : 
     688             : static void ep_get(struct eventpoll *ep)
     689             : {
     690      620473 :         refcount_inc(&ep->refcount);
     691             : }
     692             : 
     693             : /*
     694             :  * Returns true if the event poll can be disposed
     695             :  */
     696      706163 : static bool ep_refcount_dec_and_test(struct eventpoll *ep)
     697             : {
     698      706163 :         if (!refcount_dec_and_test(&ep->refcount))
     699             :                 return false;
     700             : 
     701       86546 :         WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
     702             :         return true;
     703             : }
     704             : 
     705       86463 : static void ep_free(struct eventpoll *ep)
     706             : {
     707       86463 :         mutex_destroy(&ep->mtx);
     708       86304 :         free_uid(ep->user);
     709       85880 :         wakeup_source_unregister(ep->ws);
     710       85892 :         kfree(ep);
     711       86308 : }
     712             : 
     713             : /*
     714             :  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
     715             :  * all the associated resources. Must be called with "mtx" held.
     716             :  * If the dying flag is set, do the removal only if force is true.
     717             :  * This prevents ep_clear_and_put() from dropping all the ep references
     718             :  * while running concurrently with eventpoll_release_file().
     719             :  * Returns true if the eventpoll can be disposed.
     720             :  */
     721      619953 : static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
     722             : {
     723      619953 :         struct file *file = epi->ffd.file;
     724      619953 :         struct epitems_head *to_free;
     725      619953 :         struct hlist_head *head;
     726             : 
     727      619953 :         lockdep_assert_irqs_enabled();
     728             : 
     729             :         /*
     730             :          * Removes poll wait queue hooks.
     731             :          */
     732      619953 :         ep_unregister_pollwait(ep, epi);
     733             : 
     734             :         /* Remove the current item from the list of epoll hooks */
     735      620104 :         spin_lock(&file->f_lock);
     736      620174 :         if (epi->dying && !force) {
     737           0 :                 spin_unlock(&file->f_lock);
     738           0 :                 return false;
     739             :         }
     740             : 
     741      620174 :         to_free = NULL;
     742      620174 :         head = file->f_ep;
     743      620174 :         if (head->first == &epi->fllink && !epi->fllink.next) {
     744      614800 :                 file->f_ep = NULL;
     745      614800 :                 if (!is_file_epoll(file)) {
     746      609800 :                         struct epitems_head *v;
     747      609800 :                         v = container_of(head, struct epitems_head, epitems);
     748      609800 :                         if (!smp_load_acquire(&v->next))
     749      609826 :                                 to_free = v;
     750             :                 }
     751             :         }
     752      620178 :         hlist_del_rcu(&epi->fllink);
     753      620178 :         spin_unlock(&file->f_lock);
     754      619669 :         free_ephead(to_free);
     755             : 
     756      619969 :         rb_erase_cached(&epi->rbn, &ep->rbr);
     757             : 
     758      620204 :         write_lock_irq(&ep->lock);
     759      620108 :         if (ep_is_linked(epi))
     760      314798 :                 list_del_init(&epi->rdllink);
     761      620097 :         write_unlock_irq(&ep->lock);
     762             : 
     763      620018 :         wakeup_source_unregister(ep_wakeup_source(epi));
     764             :         /*
     765             :          * At this point it is safe to free the eventpoll item. Use the union
     766             :          * field epi->rcu, since we are trying to minimize the size of
     767             :          * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     768             :          * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     769             :          * use of the rbn field.
     770             :          */
     771      620225 :         call_rcu(&epi->rcu, epi_rcu_free);
     772             : 
     773      619856 :         percpu_counter_dec(&ep->user->epoll_watches);
     774      619617 :         return ep_refcount_dec_and_test(ep);
     775             : }
     776             : 
     777             : /*
     778             :  * ep_remove variant for callers owning an additional reference to the ep
     779             :  */
     780      583548 : static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
     781             : {
     782      583548 :         WARN_ON_ONCE(__ep_remove(ep, epi, false));
     783      583511 : }
     784             : 
     785       86549 : static void ep_clear_and_put(struct eventpoll *ep)
     786             : {
     787       86549 :         struct rb_node *rbp, *next;
     788       86549 :         struct epitem *epi;
     789       86549 :         bool dispose;
     790             : 
     791             :         /* We need to release all tasks waiting for these file */
     792       86549 :         if (waitqueue_active(&ep->poll_wait))
     793           0 :                 ep_poll_safewake(ep, NULL, 0);
     794             : 
     795       86549 :         mutex_lock(&ep->mtx);
     796             : 
     797             :         /*
     798             :          * Walks through the whole tree, unregistering poll callbacks.
     799             :          */
     800      154252 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     801       67688 :                 epi = rb_entry(rbp, struct epitem, rbn);
     802             : 
     803       67688 :                 ep_unregister_pollwait(ep, epi);
     804       67688 :                 cond_resched();
     805             :         }
     806             : 
     807             :         /*
     808             :          * Walks through the whole tree and tries to free each "struct epitem".
     809             :          * Note that ep_remove_safe() will not remove the epitem in case of a
     810             :          * racing eventpoll_release_file(); the latter will do the removal.
     811             :          * At this point we are sure no poll callbacks will be lingering around.
     812             :          * Since we still own a reference to the eventpoll struct, the loop can't
     813             :          * dispose it.
     814             :          */
     815      154245 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
     816       67688 :                 next = rb_next(rbp);
     817       67688 :                 epi = rb_entry(rbp, struct epitem, rbn);
     818       67688 :                 ep_remove_safe(ep, epi);
     819       67688 :                 cond_resched();
     820             :         }
     821             : 
     822       86557 :         dispose = ep_refcount_dec_and_test(ep);
     823       86546 :         mutex_unlock(&ep->mtx);
     824             : 
     825       86385 :         if (dispose)
     826       86389 :                 ep_free(ep);
     827       86270 : }
     828             : 
     829       86486 : static int ep_eventpoll_release(struct inode *inode, struct file *file)
     830             : {
     831       86486 :         struct eventpoll *ep = file->private_data;
     832             : 
     833       86486 :         if (ep)
     834       86486 :                 ep_clear_and_put(ep);
     835             : 
     836       86284 :         return 0;
     837             : }
     838             : 
     839             : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
     840             : 
     841      275838 : static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
     842             : {
     843      275838 :         struct eventpoll *ep = file->private_data;
     844      275838 :         LIST_HEAD(txlist);
     845      275838 :         struct epitem *epi, *tmp;
     846      275838 :         poll_table pt;
     847      275838 :         __poll_t res = 0;
     848             : 
     849      275838 :         init_poll_funcptr(&pt, NULL);
     850             : 
     851             :         /* Insert inside our poll wait queue */
     852      275838 :         poll_wait(file, &ep->poll_wait, wait);
     853             : 
     854             :         /*
     855             :          * Proceed to find out if wanted events are really available inside
     856             :          * the ready list.
     857             :          */
     858      275808 :         mutex_lock_nested(&ep->mtx, depth);
     859      275973 :         ep_start_scan(ep, &txlist);
     860      276024 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
     861      141375 :                 if (ep_item_poll(epi, &pt, depth + 1)) {
     862             :                         res = EPOLLIN | EPOLLRDNORM;
     863             :                         break;
     864             :                 } else {
     865             :                         /*
     866             :                          * Item has been dropped into the ready list by the poll
     867             :                          * callback, but it's not actually ready, as far as
     868             :                          * the caller-requested events go. We can remove it here.
     869             :                          */
     870           0 :                         __pm_relax(ep_wakeup_source(epi));
     871           0 :                         list_del_init(&epi->rdllink);
     872             :                 }
     873             :         }
     874      276003 :         ep_done_scan(ep, &txlist);
     875      276073 :         mutex_unlock(&ep->mtx);
     876      275836 :         return res;
     877             : }
     878             : 
     879             : /*
     880             :  * Differs from ep_eventpoll_poll() in that internal callers already have
     881             :  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
     882             :  * is correctly annotated.
     883             :  */
     884    75651181 : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
     885             :                                  int depth)
     886             : {
     887    75651181 :         struct file *file = epi->ffd.file;
     888    75651181 :         __poll_t res;
     889             : 
     890    75651181 :         pt->_key = epi->event.events;
     891    75651181 :         if (!is_file_epoll(file))
     892    75375176 :                 res = vfs_poll(file, pt);
     893             :         else
     894      276005 :                 res = __ep_eventpoll_poll(file, pt, depth);
     895    75676588 :         return res & epi->event.events;
     896             : }
     897             : 
     898           0 : static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
     899             : {
     900           0 :         return __ep_eventpoll_poll(file, wait, 0);
     901             : }
     902             : 
     903             : #ifdef CONFIG_PROC_FS
     904           0 : static void ep_show_fdinfo(struct seq_file *m, struct file *f)
     905             : {
     906           0 :         struct eventpoll *ep = f->private_data;
     907           0 :         struct rb_node *rbp;
     908             : 
     909           0 :         mutex_lock(&ep->mtx);
     910           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     911           0 :                 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
     912           0 :                 struct inode *inode = file_inode(epi->ffd.file);
     913             : 
     914           0 :                 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
     915             :                            " pos:%lli ino:%lx sdev:%x\n",
     916             :                            epi->ffd.fd, epi->event.events,
     917           0 :                            (long long)epi->event.data,
     918           0 :                            (long long)epi->ffd.file->f_pos,
     919           0 :                            inode->i_ino, inode->i_sb->s_dev);
     920           0 :                 if (seq_has_overflowed(m))
     921             :                         break;
     922             :         }
     923           0 :         mutex_unlock(&ep->mtx);
     924           0 : }
     925             : #endif
     926             : 
     927             : /* File callbacks that implement the eventpoll file behaviour */
     928             : static const struct file_operations eventpoll_fops = {
     929             : #ifdef CONFIG_PROC_FS
     930             :         .show_fdinfo    = ep_show_fdinfo,
     931             : #endif
     932             :         .release        = ep_eventpoll_release,
     933             :         .poll           = ep_eventpoll_poll,
     934             :         .llseek         = noop_llseek,
     935             : };
     936             : 
     937             : /*
     938             :  * This is called from eventpoll_release() to unlink files from the eventpoll
     939             :  * interface. We need this facility to correctly clean up files that are
     940             :  * closed without being removed from the eventpoll interface.
     941             :  */
     942       36282 : void eventpoll_release_file(struct file *file)
     943             : {
     944       72078 :         struct eventpoll *ep;
     945       72078 :         struct epitem *epi;
     946       72078 :         bool dispose;
     947             : 
     948             :         /*
     949             :          * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
     950             :          * touching the epitems list before eventpoll_release_file() can access
     951             :          * the ep->mtx.
     952             :          */
     953       72078 : again:
     954       72078 :         spin_lock(&file->f_lock);
     955       72185 :         if (file->f_ep && file->f_ep->first) {
     956       35590 :                 epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
     957       35590 :                 epi->dying = true;
     958       35590 :                 spin_unlock(&file->f_lock);
     959             : 
     960             :                 /*
     961             :                  * ep access is safe as we still own a reference to the ep
     962             :                  * struct
     963             :                  */
     964       35623 :                 ep = epi->ep;
     965       35623 :                 mutex_lock(&ep->mtx);
     966       36427 :                 dispose = __ep_remove(ep, epi, true);
     967       35969 :                 mutex_unlock(&ep->mtx);
     968             : 
     969       35796 :                 if (dispose)
     970           0 :                         ep_free(ep);
     971       35796 :                 goto again;
     972             :         }
     973       36595 :         spin_unlock(&file->f_lock);
     974       36668 : }
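The situation handled above can be produced from user space simply by closing a watched descriptor without an explicit EPOLL_CTL_DEL; a minimal sketch (illustrative only, error handling omitted):

    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int epfd = epoll_create1(0);
            int pfd[2];
            struct epoll_event ev = { .events = EPOLLIN };

            pipe(pfd);
            ev.data.fd = pfd[0];
            epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);

            /*
             * No EPOLL_CTL_DEL here: when the last reference to the read end
             * goes away, eventpoll_release_file() unlinks the stale epitem.
             */
            close(pfd[0]);
            close(pfd[1]);
            close(epfd);
            return 0;
    }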
     975             : 
     976       86584 : static int ep_alloc(struct eventpoll **pep)
     977             : {
     978       86584 :         int error;
     979       86584 :         struct user_struct *user;
     980       86584 :         struct eventpoll *ep;
     981             : 
     982       86584 :         user = get_current_user();
     983       86579 :         error = -ENOMEM;
     984       86579 :         ep = kzalloc(sizeof(*ep), GFP_KERNEL);
     985       86585 :         if (unlikely(!ep))
     986           0 :                 goto free_uid;
     987             : 
     988       86585 :         mutex_init(&ep->mtx);
     989       86584 :         rwlock_init(&ep->lock);
     990       86583 :         init_waitqueue_head(&ep->wq);
     991       86581 :         init_waitqueue_head(&ep->poll_wait);
     992       86581 :         INIT_LIST_HEAD(&ep->rdllist);
     993       86581 :         ep->rbr = RB_ROOT_CACHED;
     994       86581 :         ep->ovflist = EP_UNACTIVE_PTR;
     995       86581 :         ep->user = user;
     996       86581 :         refcount_set(&ep->refcount, 1);
     997             : 
     998       86581 :         *pep = ep;
     999             : 
    1000       86581 :         return 0;
    1001             : 
    1002             : free_uid:
    1003           0 :         free_uid(user);
    1004           0 :         return error;
    1005             : }
    1006             : 
    1007             : /*
    1008             :  * Search the file inside the eventpoll tree. The RB tree operations
    1009             :  * are protected by the "mtx" mutex, and ep_find() must be called with
    1010             :  * "mtx" held.
    1011             :  */
    1012     1639650 : static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
    1013             : {
    1014     1639650 :         int kcmp;
    1015     1639650 :         struct rb_node *rbp;
    1016     1639650 :         struct epitem *epi, *epir = NULL;
    1017     1639650 :         struct epoll_filefd ffd;
    1018             : 
    1019     1639650 :         ep_set_ffd(&ffd, file, fd);
    1020     5871472 :         for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
    1021     5251014 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1022     5251014 :                 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
    1023     1019213 :                 if (kcmp > 0)
    1024     2531706 :                         rbp = rbp->rb_right;
    1025     2719308 :                 else if (kcmp < 0)
    1026     1700116 :                         rbp = rbp->rb_left;
    1027             :                 else {
    1028             :                         epir = epi;
    1029             :                         break;
    1030             :                 }
    1031             :         }
    1032             : 
    1033     1639650 :         return epir;
    1034             : }
    1035             : 
    1036             : #ifdef CONFIG_KCMP
    1037             : static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
    1038             : {
    1039             :         struct rb_node *rbp;
    1040             :         struct epitem *epi;
    1041             : 
    1042             :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1043             :                 epi = rb_entry(rbp, struct epitem, rbn);
    1044             :                 if (epi->ffd.fd == tfd) {
    1045             :                         if (toff == 0)
    1046             :                                 return epi;
    1047             :                         else
    1048             :                                 toff--;
    1049             :                 }
    1050             :                 cond_resched();
    1051             :         }
    1052             : 
    1053             :         return NULL;
    1054             : }
    1055             : 
    1056             : struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
    1057             :                                      unsigned long toff)
    1058             : {
    1059             :         struct file *file_raw;
    1060             :         struct eventpoll *ep;
    1061             :         struct epitem *epi;
    1062             : 
    1063             :         if (!is_file_epoll(file))
    1064             :                 return ERR_PTR(-EINVAL);
    1065             : 
    1066             :         ep = file->private_data;
    1067             : 
    1068             :         mutex_lock(&ep->mtx);
    1069             :         epi = ep_find_tfd(ep, tfd, toff);
    1070             :         if (epi)
    1071             :                 file_raw = epi->ffd.file;
    1072             :         else
    1073             :                 file_raw = ERR_PTR(-ENOENT);
    1074             :         mutex_unlock(&ep->mtx);
    1075             : 
    1076             :         return file_raw;
    1077             : }
    1078             : #endif /* CONFIG_KCMP */
    1079             : 
    1080             : /*
    1081             :  * Adds a new entry to the tail of the list in a lockless way, i.e.
    1082             :  * multiple CPUs are allowed to call this function concurrently.
    1083             :  *
    1084             :  * Beware: it is necessary to prevent any other modifications of the
    1085             :  *         existing list until all changes are completed, in other words
    1086             :  *         concurrent list_add_tail_lockless() calls should be protected
    1087             :  *         with a read lock, where the write lock acts as a barrier which
    1088             :  *         makes sure all list_add_tail_lockless() calls are fully
    1089             :  *         completed.
    1090             :  *
    1091             :  *        Also an element can be locklessly added to the list only in one
    1092             :  *        direction i.e. either to the tail or to the head, otherwise
    1093             :  *        concurrent access will corrupt the list.
    1094             :  *
    1095             :  * Return: %false if the element has already been added to the list, %true
    1096             :  * otherwise.
    1097             :  */
    1098    28993551 : static inline bool list_add_tail_lockless(struct list_head *new,
    1099             :                                           struct list_head *head)
    1100             : {
    1101    28993551 :         struct list_head *prev;
    1102             : 
    1103             :         /*
    1104             :          * This is a simple 'new->next = head' operation, but cmpxchg()
    1105             :          * is used in order to detect that the same element has just been
    1106             :          * added to the list from another CPU: the winner observes
    1107             :          * new->next == new.
    1108             :          */
    1109    28993551 :         if (!try_cmpxchg(&new->next, &new, head))
    1110             :                 return false;
    1111             : 
    1112             :         /*
    1113             :          * Initially ->next of a new element must be updated with the head
    1114             :          * (we are inserting to the tail) and only then pointers are atomically
    1115             :          * exchanged.  XCHG guarantees memory ordering, thus ->next should be
    1116             :          * updated before pointers are actually swapped and pointers are
    1117             :          * swapped before prev->next is updated.
    1118             :          */
    1119             : 
    1120    28993628 :         prev = xchg(&head->prev, new);
    1121             : 
    1122             :         /*
    1123             :          * It is safe to modify prev->next and new->prev, because a new element
    1124             :          * is added only to the tail and new->next is updated before XCHG.
    1125             :          */
    1126             : 
    1127    28993576 :         prev->next = new;
    1128    28993576 :         new->prev = prev;
    1129             : 
    1130    28993576 :         return true;
    1131             : }
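The cmpxchg()/xchg() pairing above is easier to see in isolation. Below is a minimal userspace sketch of the same technique using C11 atomics; lockless_node and lockless_add_tail are hypothetical names, not part of eventpoll.c, and, as the comment above notes, concurrent adders would still have to be excluded from all other list modifications (the read/write-lock pairing).

#include <stdatomic.h>
#include <stdbool.h>

struct lockless_node {
	_Atomic(struct lockless_node *) next;
	_Atomic(struct lockless_node *) prev;
};

/* Like INIT_LIST_HEAD: an unlinked node points at itself. */
static void lockless_node_init(struct lockless_node *n)
{
	atomic_store(&n->next, n);
	atomic_store(&n->prev, n);
}

static bool lockless_add_tail(struct lockless_node *node, struct lockless_node *head)
{
	struct lockless_node *expected = node;
	struct lockless_node *prev;

	/* 'node->next = head', but done with a compare-exchange so that only
	 * one CPU wins if the same node is queued twice concurrently. */
	if (!atomic_compare_exchange_strong(&node->next, &expected, head))
		return false;

	/* Atomically make 'node' the new tail; 'prev' is the old tail. */
	prev = atomic_exchange(&head->prev, node);

	/* Safe: nodes are only ever appended, and node->next was published first. */
	atomic_store(&prev->next, node);
	atomic_store(&node->prev, prev);
	return true;
}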
    1132             : 
    1133             : /*
    1134             :  * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
    1135             :  * i.e. multiple CPUs are allowed to call this function concurrently.
    1136             :  *
    1137             :  * Return: %false if epi element has been already chained, %true otherwise.
    1138             :  */
    1139      322402 : static inline bool chain_epi_lockless(struct epitem *epi)
    1140             : {
    1141      322402 :         struct eventpoll *ep = epi->ep;
    1142             : 
    1143             :         /* Fast preliminary check */
    1144      322402 :         if (epi->next != EP_UNACTIVE_PTR)
    1145             :                 return false;
    1146             : 
    1147             :         /* Check that the same epi has not been just chained from another CPU */
    1148      283530 :         if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
    1149             :                 return false;
    1150             : 
    1151             :         /* Atomically exchange tail */
    1152      283530 :         epi->next = xchg(&ep->ovflist, epi);
    1153             : 
    1154      283530 :         return true;
    1155             : }
    1156             : 
    1157             : /*
    1158             :  * This is the callback that is passed to the wait queue wakeup
    1159             :  * mechanism. It is called by the stored file descriptors when they
    1160             :  * have events to report.
    1161             :  *
    1162             :  * This callback takes a read lock in order not to contend with concurrent
    1163             :  * events from another file descriptor, thus all modifications to ->rdllist
    1164             :  * or ->ovflist are lockless.  The read lock is paired with the write lock from
    1165             :  * ep_scan_ready_list(), which stops all list modifications and guarantees
    1166             :  * that the lists' state is seen correctly.
    1167             :  *
    1168             :  * Another thing worth mentioning is that ep_poll_callback() can be called
    1169             :  * concurrently for the same @epi from different CPUs if the poll table was
    1170             :  * initialized with several wait queue entries.  Wakeups from different CPUs of a
    1171             :  * single wait queue are serialized by wq.lock, but the case when multiple wait
    1172             :  * queues are used must be detected separately.  This is done using a
    1173             :  * cmpxchg() operation.
    1174             :  */
    1175   145132077 : static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    1176             : {
    1177   145132077 :         int pwake = 0;
    1178   145132077 :         struct epitem *epi = ep_item_from_wait(wait);
    1179   145132077 :         struct eventpoll *ep = epi->ep;
    1180   145132077 :         __poll_t pollflags = key_to_poll(key);
    1181   145132077 :         unsigned long flags;
    1182   145132077 :         int ewake = 0;
    1183             : 
    1184   145132077 :         read_lock_irqsave(&ep->lock, flags);
    1185             : 
    1186   145205722 :         ep_set_busy_poll_napi_id(epi);
    1187             : 
    1188             :         /*
    1189             :          * If the event mask does not contain any poll(2) event, we consider the
    1190             :          * descriptor to be disabled. This condition is likely the effect of the
    1191             :          * EPOLLONESHOT bit that disables the descriptor when an event is received,
    1192             :          * until the next EPOLL_CTL_MOD is issued.
    1193             :          */
    1194   145234690 :         if (!(epi->event.events & ~EP_PRIVATE_BITS))
    1195           0 :                 goto out_unlock;
    1196             : 
    1197             :         /*
    1198             :          * Check the events coming with the callback. At this stage, not
    1199             :          * every device reports the events in the "key" parameter of the
    1200             :          * callback. We need to be able to handle both cases here, hence the
    1201             :          * test for "key" != NULL before the event match test.
    1202             :          */
    1203   145234690 :         if (pollflags && !(pollflags & epi->event.events))
    1204    64721015 :                 goto out_unlock;
    1205             : 
    1206             :         /*
    1207             :          * If we are transferring events to userspace, we can hold no locks
    1208             :          * (because we're accessing user memory, and because of linux f_op->poll()
    1209             :          * semantics). All the events that happen during that period of time are
    1210             :          * chained in ep->ovflist and requeued later on.
    1211             :          */
    1212    80513675 :         if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
    1213      322401 :                 if (chain_epi_lockless(epi))
    1214      283530 :                         ep_pm_stay_awake_rcu(epi);
    1215    80191274 :         } else if (!ep_is_linked(epi)) {
    1216             :                 /* In the usual case, add event to ready list. */
    1217    28993491 :                 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
    1218    28993645 :                         ep_pm_stay_awake_rcu(epi);
    1219             :         }
    1220             : 
    1221             :         /*
    1222             :          * Wake up ( if active ) both the eventpoll wait list and the ->poll()
    1223             :          * wait list.
    1224             :          */
    1225    80513827 :         if (waitqueue_active(&ep->wq)) {
    1226    28526280 :                 if ((epi->event.events & EPOLLEXCLUSIVE) &&
    1227           0 :                                         !(pollflags & POLLFREE)) {
    1228           0 :                         switch (pollflags & EPOLLINOUT_BITS) {
    1229           0 :                         case EPOLLIN:
    1230           0 :                                 if (epi->event.events & EPOLLIN)
    1231           0 :                                         ewake = 1;
    1232             :                                 break;
    1233           0 :                         case EPOLLOUT:
    1234           0 :                                 if (epi->event.events & EPOLLOUT)
    1235           0 :                                         ewake = 1;
    1236             :                                 break;
    1237           0 :                         case 0:
    1238           0 :                                 ewake = 1;
    1239           0 :                                 break;
    1240             :                         }
    1241             :                 }
    1242    28526280 :                 wake_up(&ep->wq);
    1243             :         }
    1244    80515699 :         if (waitqueue_active(&ep->poll_wait))
    1245      138096 :                 pwake++;
    1246             : 
    1247    80377603 : out_unlock:
    1248   145236714 :         read_unlock_irqrestore(&ep->lock, flags);
    1249             : 
    1250             :         /* We have to call this outside the lock */
    1251   145263661 :         if (pwake)
    1252      138096 :                 ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
    1253             : 
    1254   145263661 :         if (!(epi->event.events & EPOLLEXCLUSIVE))
    1255   145263796 :                 ewake = 1;
    1256             : 
    1257   145263661 :         if (pollflags & POLLFREE) {
    1258             :                 /*
    1259             :                  * If we race with ep_remove_wait_queue() it can miss
    1260             :                  * ->whead = NULL and do another remove_wait_queue() after
    1261             :                  * us, so we can't use __remove_wait_queue().
    1262             :                  */
    1263           0 :                 list_del_init(&wait->entry);
    1264             :                 /*
    1265             :                  * ->whead != NULL protects us from the race with
    1266             :                  * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
    1267             :                  * takes whead->lock held by the caller. Once we nullify it,
    1268             :                  * nothing protects ep/epi or even wait.
    1269             :                  */
    1270           0 :                 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    1271             :         }
    1272             : 
    1273   145263661 :         return ewake;
    1274             : }
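The EPOLLEXCLUSIVE handling above (the ewake result) is what lets several epoll instances watch the same file while only one of them is woken per event. A hedged userspace sketch of that usage pattern, assuming each worker thread owns its own epoll fd; watch_exclusive is a hypothetical helper, not kernel API.

#include <sys/epoll.h>
#include <unistd.h>

/* Each worker calls this once and then epoll_wait()s on the returned epfd;
 * a single event on listen_fd wakes (roughly) one worker, not all of them. */
static int watch_exclusive(int listen_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLEXCLUSIVE,
		.data.fd = listen_fd,
	};
	int epfd = epoll_create1(0);

	if (epfd < 0)
		return -1;
	/* EPOLLEXCLUSIVE is only valid with EPOLL_CTL_ADD. */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) {
		close(epfd);
		return -1;
	}
	return epfd;
}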
    1275             : 
    1276             : /*
    1277             :  * This is the callback that is used to add our wait queue to the
    1278             :  * target file wakeup lists.
    1279             :  */
    1280      464200 : static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
    1281             :                                  poll_table *pt)
    1282             : {
    1283      464200 :         struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
    1284      464200 :         struct epitem *epi = epq->epi;
    1285      464200 :         struct eppoll_entry *pwq;
    1286             : 
    1287      464200 :         if (unlikely(!epi))     // an earlier allocation has failed
    1288             :                 return;
    1289             : 
    1290      464200 :         pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    1291      464205 :         if (unlikely(!pwq)) {
    1292           0 :                 epq->epi = NULL;
    1293           0 :                 return;
    1294             :         }
    1295             : 
    1296      464205 :         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    1297      464205 :         pwq->whead = whead;
    1298      464205 :         pwq->base = epi;
    1299      464205 :         if (epi->event.events & EPOLLEXCLUSIVE)
    1300           0 :                 add_wait_queue_exclusive(whead, &pwq->wait);
    1301             :         else
    1302      464205 :                 add_wait_queue(whead, &pwq->wait);
    1303      464197 :         pwq->next = epi->pwqlist;
    1304      464197 :         epi->pwqlist = pwq;
    1305             : }
    1306             : 
    1307      620470 : static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
    1308             : {
    1309      620470 :         int kcmp;
    1310      620470 :         struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
    1311      620470 :         struct epitem *epic;
    1312      620470 :         bool leftmost = true;
    1313             : 
    1314     2060577 :         while (*p) {
    1315     1440107 :                 parent = *p;
    1316     1440107 :                 epic = rb_entry(parent, struct epitem, rbn);
    1317     1440107 :                 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
    1318           0 :                 if (kcmp > 0) {
    1319      855451 :                         p = &parent->rb_right;
    1320      855451 :                         leftmost = false;
    1321             :                 } else
    1322      584656 :                         p = &parent->rb_left;
    1323             :         }
    1324      620470 :         rb_link_node(&epi->rbn, parent, p);
    1325      620470 :         rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
    1326      620473 : }
    1327             : 
    1328             : 
    1329             : 
    1330             : #define PATH_ARR_SIZE 5
    1331             : /*
    1332             :  * These are the numbers of paths of length 1 to 5 that we allow to emanate
    1333             :  * from a single file of interest. For example, we allow 1000 paths of length
    1334             :  * 1, to emanate from each file of interest. This essentially represents the
    1335             :  * potential wakeup paths, which need to be limited in order to avoid massive
    1336             :  * uncontrolled wakeup storms. The common use case should be a single ep which
    1337             :  * is connected to n file sources. In this case each file source has 1 path
    1338             :  * of length 1. Thus, the numbers below should be more than sufficient. These
    1339             :  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
    1340             :  * and delete can't add additional paths. Protected by the epnested_mutex.
    1341             :  */
    1342             : static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
    1343             : static int path_count[PATH_ARR_SIZE];
    1344             : 
    1345       10052 : static int path_count_inc(int nests)
    1346             : {
    1347             :         /* Allow an arbitrary number of depth 1 paths */
    1348       10052 :         if (nests == 0)
    1349             :                 return 0;
    1350             : 
    1351       10052 :         if (++path_count[nests] > path_limits[nests])
    1352           0 :                 return -1;
    1353             :         return 0;
    1354             : }
    1355             : 
    1356             : static void path_count_init(void)
    1357             : {
    1358             :         int i;
    1359             : 
    1360       60312 :         for (i = 0; i < PATH_ARR_SIZE; i++)
    1361       50260 :                 path_count[i] = 0;
    1362             : }
    1363             : 
    1364       20104 : static int reverse_path_check_proc(struct hlist_head *refs, int depth)
    1365             : {
    1366       20104 :         int error = 0;
    1367       20104 :         struct epitem *epi;
    1368             : 
    1369       20104 :         if (depth > EP_MAX_NESTS) /* too deep nesting */
    1370             :                 return -1;
    1371             : 
    1372             :         /* CTL_DEL can remove links here, but that can't increase our count */
    1373       80416 :         hlist_for_each_entry_rcu(epi, refs, fllink) {
    1374       20104 :                 struct hlist_head *refs = &epi->ep->refs;
    1375       20104 :                 if (hlist_empty(refs))
    1376       10052 :                         error = path_count_inc(depth);
    1377             :                 else
    1378       10052 :                         error = reverse_path_check_proc(refs, depth + 1);
    1379       20104 :                 if (error != 0)
    1380             :                         break;
    1381             :         }
    1382             :         return error;
    1383             : }
    1384             : 
    1385             : /**
    1386             :  * reverse_path_check - The tfile_check_list is a list of epitems_head, which have
    1387             :  *                      links that are proposed to be newly added. We need to
    1388             :  *                      make sure that those added links don't add too many
    1389             :  *                      paths such that we will spend all our time waking up
    1390             :  *                      eventpoll objects.
    1391             :  *
    1392             :  * Return: %zero if the proposed links don't create too many paths,
    1393             :  *          %-1 otherwise.
    1394             :  */
    1395        5026 : static int reverse_path_check(void)
    1396             : {
    1397        5026 :         struct epitems_head *p;
    1398             : 
    1399       15078 :         for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
    1400             :                 int error;
    1401             :                 path_count_init();
    1402       10052 :                 rcu_read_lock();
    1403       10052 :                 error = reverse_path_check_proc(&p->epitems, 0);
    1404       10052 :                 rcu_read_unlock();
    1405       10052 :                 if (error)
    1406           0 :                         return error;
    1407             :         }
    1408             :         return 0;
    1409             : }
    1410             : 
    1411           0 : static int ep_create_wakeup_source(struct epitem *epi)
    1412             : {
    1413           0 :         struct name_snapshot n;
    1414           0 :         struct wakeup_source *ws;
    1415             : 
    1416           0 :         if (!epi->ep->ws) {
    1417           0 :                 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
    1418           0 :                 if (!epi->ep->ws)
    1419             :                         return -ENOMEM;
    1420             :         }
    1421             : 
    1422           0 :         take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
    1423           0 :         ws = wakeup_source_register(NULL, n.name.name);
    1424           0 :         release_dentry_name_snapshot(&n);
    1425             : 
    1426           0 :         if (!ws)
    1427             :                 return -ENOMEM;
    1428           0 :         rcu_assign_pointer(epi->ws, ws);
    1429             : 
    1430           0 :         return 0;
    1431             : }
    1432             : 
    1433             : /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
    1434           0 : static noinline void ep_destroy_wakeup_source(struct epitem *epi)
    1435             : {
    1436           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
    1437             : 
    1438           0 :         RCU_INIT_POINTER(epi->ws, NULL);
    1439             : 
    1440             :         /*
    1441             :          * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
    1442             :          * used internally by wakeup_source_remove, too (called by
    1443             :          * wakeup_source_unregister), so we cannot use call_rcu
    1444             :          */
    1445           0 :         synchronize_rcu();
    1446           0 :         wakeup_source_unregister(ws);
    1447           0 : }
    1448             : 
    1449      620473 : static int attach_epitem(struct file *file, struct epitem *epi)
    1450             : {
    1451      620473 :         struct epitems_head *to_free = NULL;
    1452      620473 :         struct hlist_head *head = NULL;
    1453      620473 :         struct eventpoll *ep = NULL;
    1454             : 
    1455      620473 :         if (is_file_epoll(file))
    1456        5026 :                 ep = file->private_data;
    1457             : 
    1458        5026 :         if (ep) {
    1459        5026 :                 head = &ep->refs;
    1460      615447 :         } else if (!READ_ONCE(file->f_ep)) {
    1461      610119 : allocate:
    1462      610119 :                 to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
    1463      610118 :                 if (!to_free)
    1464             :                         return -ENOMEM;
    1465      610118 :                 head = &to_free->epitems;
    1466             :         }
    1467      620472 :         spin_lock(&file->f_lock);
    1468      620479 :         if (!file->f_ep) {
    1469      615152 :                 if (unlikely(!head)) {
    1470           0 :                         spin_unlock(&file->f_lock);
    1471           0 :                         goto allocate;
    1472             :                 }
    1473      615152 :                 file->f_ep = head;
    1474      615152 :                 to_free = NULL;
    1475             :         }
    1476      620479 :         hlist_add_head_rcu(&epi->fllink, file->f_ep);
    1477      620480 :         spin_unlock(&file->f_lock);
    1478      620469 :         free_ephead(to_free);
    1479      620469 :         return 0;
    1480             : }
    1481             : 
    1482             : /*
    1483             :  * Must be called with "mtx" held.
    1484             :  */
    1485      620478 : static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
    1486             :                      struct file *tfile, int fd, int full_check)
    1487             : {
    1488      620478 :         int error, pwake = 0;
    1489      620478 :         __poll_t revents;
    1490      620478 :         struct epitem *epi;
    1491      620478 :         struct ep_pqueue epq;
    1492      620478 :         struct eventpoll *tep = NULL;
    1493             : 
    1494      620478 :         if (is_file_epoll(tfile))
    1495        5026 :                 tep = tfile->private_data;
    1496             : 
    1497      620478 :         lockdep_assert_irqs_enabled();
    1498             : 
    1499      620478 :         if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
    1500             :                                             max_user_watches) >= 0))
    1501             :                 return -ENOSPC;
    1502      620480 :         percpu_counter_inc(&ep->user->epoll_watches);
    1503             : 
    1504      620479 :         if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
    1505           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1506           0 :                 return -ENOMEM;
    1507             :         }
    1508             : 
    1509             :         /* Item initialization follow here ... */
    1510      620475 :         INIT_LIST_HEAD(&epi->rdllink);
    1511      620475 :         epi->ep = ep;
    1512      620475 :         ep_set_ffd(&epi->ffd, tfile, fd);
    1513      620475 :         epi->event = *event;
    1514      620475 :         epi->next = EP_UNACTIVE_PTR;
    1515             : 
    1516      620475 :         if (tep)
    1517        5026 :                 mutex_lock_nested(&tep->mtx, 1);
    1518             :         /* Add the current item to the list of active epoll hook for this file */
    1519      620475 :         if (unlikely(attach_epitem(tfile, epi) < 0)) {
    1520           0 :                 if (tep)
    1521           0 :                         mutex_unlock(&tep->mtx);
    1522           0 :                 kmem_cache_free(epi_cache, epi);
    1523           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1524           0 :                 return -ENOMEM;
    1525             :         }
    1526             : 
    1527      620471 :         if (full_check && !tep)
    1528           0 :                 list_file(tfile);
    1529             : 
    1530             :         /*
    1531             :          * Add the current item to the RB tree. All RB tree operations are
    1532             :          * protected by "mtx", and ep_insert() is called with "mtx" held.
    1533             :          */
    1534      620471 :         ep_rbtree_insert(ep, epi);
    1535      620473 :         if (tep)
    1536        5026 :                 mutex_unlock(&tep->mtx);
    1537             : 
    1538             :         /*
    1539             :          * ep_remove_safe() calls in the later error paths can't lead to
    1540             :          * ep_free() as the ep file itself still holds an ep reference.
    1541             :          */
    1542      620473 :         ep_get(ep);
    1543             : 
    1544             :         /* now check if we've created too many backpaths */
    1545      620473 :         if (unlikely(full_check && reverse_path_check())) {
    1546           0 :                 ep_remove_safe(ep, epi);
    1547           0 :                 return -EINVAL;
    1548             :         }
    1549             : 
    1550      620473 :         if (epi->event.events & EPOLLWAKEUP) {
    1551           0 :                 error = ep_create_wakeup_source(epi);
    1552           0 :                 if (error) {
    1553           0 :                         ep_remove_safe(ep, epi);
    1554           0 :                         return error;
    1555             :                 }
    1556             :         }
    1557             : 
    1558             :         /* Initialize the poll table using the queue callback */
    1559      620473 :         epq.epi = epi;
    1560      620473 :         init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    1561             : 
    1562             :         /*
    1563             :          * Attach the item to the poll hooks and get current event bits.
    1564             :          * We can safely use the file* here because its usage count has
    1565             :          * been increased by the caller of this function. Note that after
    1566             :          * this operation completes, the poll callback can start hitting
    1567             :          * the new item.
    1568             :          */
    1569      620473 :         revents = ep_item_poll(epi, &epq.pt, 1);
    1570             : 
    1571             :         /*
    1572             :          * We have to check if something went wrong during the poll wait queue
    1573             :          * install process. Namely, an allocation for a wait queue failed due
    1574             :          * to high memory pressure.
    1575             :          */
    1576      620478 :         if (unlikely(!epq.epi)) {
    1577           0 :                 ep_remove_safe(ep, epi);
    1578           0 :                 return -ENOMEM;
    1579             :         }
    1580             : 
    1581             :         /* We have to drop the new item inside our item list to keep track of it */
    1582      620478 :         write_lock_irq(&ep->lock);
    1583             : 
    1584             :         /* record NAPI ID of new item if present */
    1585      620472 :         ep_set_busy_poll_napi_id(epi);
    1586             : 
    1587             :         /* If the file is already "ready" we drop it inside the ready list */
    1588      620480 :         if (revents && !ep_is_linked(epi)) {
    1589      267529 :                 list_add_tail(&epi->rdllink, &ep->rdllist);
    1590      267529 :                 ep_pm_stay_awake(epi);
    1591             : 
    1592             :                 /* Notify waiting tasks that events are available */
    1593      267529 :                 if (waitqueue_active(&ep->wq))
    1594      126346 :                         wake_up(&ep->wq);
    1595      267529 :                 if (waitqueue_active(&ep->poll_wait))
    1596           0 :                         pwake++;
    1597             :         }
    1598             : 
    1599      620480 :         write_unlock_irq(&ep->lock);
    1600             : 
    1601             :         /* We have to call this outside the lock */
    1602      620477 :         if (pwake)
    1603           0 :                 ep_poll_safewake(ep, NULL, 0);
    1604             : 
    1605             :         return 0;
    1606             : }
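ep_insert() is the kernel half of EPOLL_CTL_ADD. A minimal userspace counterpart (add_watch is an illustrative helper, not from this file), included only to show how the -ENOSPC path above, the per-user limit tunable via /proc/sys/fs/epoll/max_user_watches, surfaces to callers.

#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>

static int add_watch(int epfd, int fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN,
		.data.fd = fd,
	};

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
		if (errno == ENOSPC)	/* percpu_counter check in ep_insert() */
			fprintf(stderr, "epoll watch limit reached\n");
		return -1;
	}
	return 0;
}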
    1607             : 
    1608             : /*
    1609             :  * Modify the interest event mask by dropping an event if the new mask
    1610             :  * has a match in the current file status. Must be called with "mtx" held.
    1611             :  */
    1612      503353 : static int ep_modify(struct eventpoll *ep, struct epitem *epi,
    1613             :                      const struct epoll_event *event)
    1614             : {
    1615      503353 :         int pwake = 0;
    1616      503353 :         poll_table pt;
    1617             : 
    1618      503353 :         lockdep_assert_irqs_enabled();
    1619             : 
    1620      503353 :         init_poll_funcptr(&pt, NULL);
    1621             : 
    1622             :         /*
    1623             :          * Set the new event interest mask before calling f_op->poll();
    1624             :          * otherwise we might miss an event that happens between the
    1625             :          * f_op->poll() call and the new event set registering.
    1626             :          */
    1627      503353 :         epi->event.events = event->events; /* need barrier below */
    1628      503353 :         epi->event.data = event->data; /* protected by mtx */
    1629      503353 :         if (epi->event.events & EPOLLWAKEUP) {
    1630           0 :                 if (!ep_has_wakeup_source(epi))
    1631           0 :                         ep_create_wakeup_source(epi);
    1632      503353 :         } else if (ep_has_wakeup_source(epi)) {
    1633           0 :                 ep_destroy_wakeup_source(epi);
    1634             :         }
    1635             : 
    1636             :         /*
    1637             :          * The following barrier has two effects:
    1638             :          *
    1639             :          * 1) Flush epi changes above to other CPUs.  This ensures
    1640             :          *    we do not miss events from ep_poll_callback if an
    1641             :          *    event occurs immediately after we call f_op->poll().
    1642             :          *    We need this because we did not take ep->lock while
    1643             :          *    changing epi above (but ep_poll_callback does take
    1644             :          *    ep->lock).
    1645             :          *
    1646             :          * 2) We also need to ensure we do not miss _past_ events
    1647             :          *    when calling f_op->poll().  This barrier also
    1648             :          *    pairs with the barrier in wq_has_sleeper (see
    1649             :          *    comments for wq_has_sleeper).
    1650             :          *
    1651             :          * This barrier will now guarantee ep_poll_callback or f_op->poll
    1652             :          * (or both) will notice the readiness of an item.
    1653             :          */
    1654      503353 :         smp_mb();
    1655             : 
    1656             :         /*
    1657             :          * Get current event bits. We can safely use the file* here because
    1658             :          * its usage count has been increased by the caller of this function.
    1659             :          * If the item is "hot" and it is not registered inside the ready
    1660             :          * list, push it inside.
    1661             :          */
    1662      503323 :         if (ep_item_poll(epi, &pt, 1)) {
    1663      209680 :                 write_lock_irq(&ep->lock);
    1664      209679 :                 if (!ep_is_linked(epi)) {
    1665      122121 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1666      122121 :                         ep_pm_stay_awake(epi);
    1667             : 
    1668             :                         /* Notify waiting tasks that events are available */
    1669      122121 :                         if (waitqueue_active(&ep->wq))
    1670           0 :                                 wake_up(&ep->wq);
    1671      122122 :                         if (waitqueue_active(&ep->poll_wait))
    1672           0 :                                 pwake++;
    1673             :                 }
    1674      209680 :                 write_unlock_irq(&ep->lock);
    1675             :         }
    1676             : 
    1677             :         /* We have to call this outside the lock */
    1678      209680 :         if (pwake)
    1679           0 :                 ep_poll_safewake(ep, NULL, 0);
    1680             : 
    1681      503380 :         return 0;
    1682             : }
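ep_modify() is reached through EPOLL_CTL_MOD, which is also how userspace re-arms an EPOLLONESHOT item after ep_send_events() has stripped its event bits. A small illustrative sketch; rearm_oneshot is a hypothetical helper.

#include <sys/epoll.h>

/* After a one-shot event has been delivered, the item stays disarmed until
 * the next EPOLL_CTL_MOD, which lands in ep_modify() above. */
static int rearm_oneshot(int epfd, int fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLONESHOT,
		.data.fd = fd,
	};

	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}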
    1683             : 
    1684    73926179 : static int ep_send_events(struct eventpoll *ep,
    1685             :                           struct epoll_event __user *events, int maxevents)
    1686             : {
    1687    73926179 :         struct epitem *epi, *tmp;
    1688    73926179 :         LIST_HEAD(txlist);
    1689    73926179 :         poll_table pt;
    1690    73926179 :         int res = 0;
    1691             : 
    1692             :         /*
    1693             :          * Always short-circuit for fatal signals to allow threads to make a
    1694             :          * timely exit without the chance of finding more events available and
    1695             :          * fetching repeatedly.
    1696             :          */
    1697    73926179 :         if (fatal_signal_pending(current))
    1698             :                 return -EINTR;
    1699             : 
    1700    73925009 :         init_poll_funcptr(&pt, NULL);
    1701             : 
    1702    73925009 :         mutex_lock(&ep->mtx);
    1703    73969370 :         ep_start_scan(ep, &txlist);
    1704             : 
    1705             :         /*
    1706             :          * We can loop without the lock because we are passed a task-private list.
    1707             :          * Items cannot vanish during the loop because we are holding ep->mtx.
    1708             :          */
    1709   148324024 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
    1710    74360670 :                 struct wakeup_source *ws;
    1711    74360670 :                 __poll_t revents;
    1712             : 
    1713    74360670 :                 if (res >= maxevents)
    1714             :                         break;
    1715             : 
    1716             :                 /*
    1717             :                  * Activate ep->ws before deactivating epi->ws to prevent
    1718             :                  * triggering auto-suspend here (in case we reactivate epi->ws
    1719             :                  * below).
    1720             :                  *
    1721             :                  * This could be rearranged to delay the deactivation of epi->ws
    1722             :                  * instead, but then epi->ws would temporarily be out of sync
    1723             :                  * with ep_is_linked().
    1724             :                  */
    1725    74355888 :                 ws = ep_wakeup_source(epi);
    1726    74355888 :                 if (ws) {
    1727        2625 :                         if (ws->active)
    1728           0 :                                 __pm_stay_awake(ep->ws);
    1729        2625 :                         __pm_relax(ws);
    1730             :                 }
    1731             : 
    1732    74403013 :                 list_del_init(&epi->rdllink);
    1733             : 
    1734             :                 /*
    1735             :                  * If the event mask intersects the caller-requested one,
    1736             :                  * deliver the event to userspace. Again, we are holding ep->mtx,
    1737             :                  * so no operations coming from userspace can change the item.
    1738             :                  */
    1739    74373893 :                 revents = ep_item_poll(epi, &pt, 1);
    1740    74414155 :                 if (!revents)
    1741    28762490 :                         continue;
    1742             : 
    1743    45651665 :                 events = epoll_put_uevent(revents, epi->event.data, events);
    1744    45652447 :                 if (!events) {
    1745           0 :                         list_add(&epi->rdllink, &txlist);
    1746           0 :                         ep_pm_stay_awake(epi);
    1747           0 :                         if (!res)
    1748           0 :                                 res = -EFAULT;
    1749             :                         break;
    1750             :                 }
    1751    45652447 :                 res++;
    1752    45652447 :                 if (epi->event.events & EPOLLONESHOT)
    1753       50378 :                         epi->event.events &= EP_PRIVATE_BITS;
    1754    45602069 :                 else if (!(epi->event.events & EPOLLET)) {
    1755             :                         /*
    1756             :                          * If this file has been added with Level
    1757             :                          * Trigger mode, we need to insert back inside
    1758             :                          * the ready list, so that the next call to
    1759             :                          * epoll_wait() will check again the events
    1760             :                          * availability. At this point, no one can insert
    1761             :                          * into ep->rdllist besides us. The epoll_ctl()
    1762             :                          * callers are locked out by
    1763             :                          * ep_scan_ready_list() holding "mtx" and the
    1764             :                          * poll callback will queue them in ep->ovflist.
    1765             :                          */
    1766    45337833 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1767    45330117 :                         ep_pm_stay_awake(epi);
    1768             :                 }
    1769             :         }
    1770    73968136 :         ep_done_scan(ep, &txlist);
    1771    73992380 :         mutex_unlock(&ep->mtx);
    1772             : 
    1773    73992380 :         return res;
    1774             : }
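The level-triggered re-queue in ep_send_events() is why an fd with unread data keeps showing up on successive epoll_wait() calls, while an EPOLLET item does not. A hedged userspace sketch of the edge-triggered consequence (drain_edge_triggered is an illustrative helper): the reader must drain until EAGAIN.

#include <errno.h>
#include <sys/epoll.h>
#include <unistd.h>

/* With EPOLLET the fd is not re-added to the ready list above, so read
 * until EAGAIN before the next epoll_wait(); with level triggering the
 * kernel keeps re-reporting the fd as long as data remains. */
static void drain_edge_triggered(int fd)
{
	char buf[4096];
	ssize_t n;

	do {
		n = read(fd, buf, sizeof(buf));
	} while (n > 0);

	if (n < 0 && errno != EAGAIN) {
		/* real error; handle as appropriate for the caller */
	}
}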
    1775             : 
    1776    46966026 : static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
    1777             : {
    1778    46966026 :         struct timespec64 now;
    1779             : 
    1780    46966026 :         if (ms < 0)
    1781             :                 return NULL;
    1782             : 
    1783     2578101 :         if (!ms) {
    1784     2544288 :                 to->tv_sec = 0;
    1785     2544288 :                 to->tv_nsec = 0;
    1786     2544288 :                 return to;
    1787             :         }
    1788             : 
    1789       33813 :         to->tv_sec = ms / MSEC_PER_SEC;
    1790       33813 :         to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
    1791             : 
    1792       33813 :         ktime_get_ts64(&now);
    1793       33813 :         *to = timespec64_add_safe(now, *to);
    1794       33813 :         return to;
    1795             : }
    1796             : 
    1797             : /*
    1798             :  * autoremove_wake_function, but remove even on failure to wake up, because we
    1799             :  * know that default_wake_function/ttwu will only fail if the thread is already
    1800             :  * woken, and in that case the ep_poll loop will remove the entry anyway, not
    1801             :  * try to reuse it.
    1802             :  */
    1803    28675862 : static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
    1804             :                                        unsigned int mode, int sync, void *key)
    1805             : {
    1806    28675862 :         int ret = default_wake_function(wq_entry, mode, sync, key);
    1807             : 
    1808             :         /*
    1809             :          * Pairs with list_empty_careful in ep_poll, and ensures future loop
    1810             :          * iterations see the cause of this wakeup.
    1811             :          */
    1812    28675900 :         list_del_init_careful(&wq_entry->entry);
    1813    28675924 :         return ret;
    1814             : }
    1815             : 
    1816             : /**
    1817             :  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
    1818             :  *           event buffer.
    1819             :  *
    1820             :  * @ep: Pointer to the eventpoll context.
    1821             :  * @events: Pointer to the userspace buffer where the ready events should be
    1822             :  *          stored.
    1823             :  * @maxevents: Size (in terms of number of events) of the caller event buffer.
    1824             :  * @timeout: Maximum timeout for the ready events fetch operation, in
    1825             :  *           timespec. If the timeout is zero, the function will not block,
    1826             :  *           while if the @timeout ptr is NULL, the function will block
    1827             :  *           until at least one event has been retrieved (or an error
    1828             :  *           occurred).
    1829             :  *
    1830             :  * Return: the number of ready events which have been fetched, or an
    1831             :  *          error code, in case of error.
    1832             :  */
    1833    46967263 : static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
    1834             :                    int maxevents, struct timespec64 *timeout)
    1835             : {
    1836    46967263 :         int res, eavail, timed_out = 0;
    1837    46967263 :         u64 slack = 0;
    1838    46967263 :         wait_queue_entry_t wait;
    1839    46967263 :         ktime_t expires, *to = NULL;
    1840             : 
    1841    46967263 :         lockdep_assert_irqs_enabled();
    1842             : 
    1843    46967263 :         if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
    1844       33813 :                 slack = select_estimate_accuracy(timeout);
    1845       33813 :                 to = &expires;
    1846       67626 :                 *to = timespec64_to_ktime(*timeout);
    1847    46933450 :         } else if (timeout) {
    1848             :                 /*
    1849             :                  * Avoid the unnecessary trip to the wait queue loop, if the
    1850             :                  * caller specified a non blocking operation.
    1851             :                  * caller specified a non-blocking operation.
    1852     2545420 :                 timed_out = 1;
    1853             :         }
    1854             : 
    1855             :         /*
    1856             :          * This call is racy: We may or may not see events that are being added
    1857             :          * to the ready list under the lock (e.g., in IRQ callbacks). For cases
    1858             :          * with a non-zero timeout, this thread will check the ready list under
    1859             :          * lock and will add to the wait queue.  For cases with a zero
    1860             :          * timeout, the user by definition should not care and will have to
    1861             :          * recheck again.
    1862             :          */
    1863    46967263 :         eavail = ep_events_available(ep);
    1864             : 
    1865    46982514 :         while (1) {
    1866    46982514 :                 if (eavail) {
    1867             :                         /*
    1868             :                          * Try to transfer events to user space. In case we get
    1869             :                          * 0 events and there's still timeout left over, we try
    1870             :                          * again in search of more luck.
    1871             :                          */
    1872    74047028 :                         res = ep_send_events(ep, events, maxevents);
    1873    73961358 :                         if (res)
    1874    45437250 :                                 return res;
    1875             :                 }
    1876             : 
    1877    30157702 :                 if (timed_out)
    1878             :                         return 0;
    1879             : 
    1880    28700761 :                 eavail = ep_busy_loop(ep, timed_out);
    1881    28706659 :                 if (eavail)
    1882           0 :                         continue;
    1883             : 
    1884    28706659 :                 if (signal_pending(current))
    1885             :                         return -EINTR;
    1886             : 
    1887             :                 /*
    1888             :                  * Internally init_wait() uses autoremove_wake_function(),
    1889             :                  * thus the wait entry is removed from the wait queue on each
    1890             :                  * wakeup. Why is this important? In case of several waiters
    1891             :                  * each new wakeup will hit the next waiter, giving it the
    1892             :                  * chance to harvest new events. Otherwise a wakeup can be
    1893             :                  * lost. This is also good performance-wise, because on the
    1894             :                  * normal wakeup path there is no need to call __remove_wait_queue()
    1895             :                  * explicitly, thus ep->lock is not taken, which would halt
    1896             :                  * event delivery.
    1897             :                  *
    1898             :                  * In fact, we now use an even more aggressive function that
    1899             :                  * unconditionally removes, because we don't reuse the wait
    1900             :                  * entry between loop iterations. This lets us also avoid the
    1901             :                  * performance issue if a process is killed, causing all of its
    1902             :                  * threads to wake up without being removed normally.
    1903             :                  */
    1904    28707054 :                 init_wait(&wait);
    1905    28707054 :                 wait.func = ep_autoremove_wake_function;
    1906             : 
    1907    28707054 :                 write_lock_irq(&ep->lock);
    1908             :                 /*
    1909             :                  * Barrierless variant, waitqueue_active() is called under
    1910             :                  * the same lock on wakeup ep_poll_callback() side, so it
    1911             :                  * is safe to avoid an explicit barrier.
    1912             :                  */
    1913    28707054 :                 __set_current_state(TASK_INTERRUPTIBLE);
    1914             : 
    1915             :                 /*
    1916             :                  * Do the final check under the lock. ep_scan_ready_list()
    1917             :                  * plays with two lists (->rdllist and ->ovflist) and there
    1918             :                  * is always a race when both lists are empty for a short
    1919             :                  * period of time although events are pending, so the lock is
    1920             :                  * important.
    1921             :                  */
    1922    28707271 :                 eavail = ep_events_available(ep);
    1923    28675376 :                 if (!eavail)
    1924    28675376 :                         __add_wait_queue_exclusive(&ep->wq, &wait);
    1925             : 
    1926    28707021 :                 write_unlock_irq(&ep->lock);
    1927             : 
    1928    28707074 :                 if (!eavail)
    1929    28674514 :                         timed_out = !schedule_hrtimeout_range(to, slack,
    1930             :                                                               HRTIMER_MODE_ABS);
    1931    28707952 :                 __set_current_state(TASK_RUNNING);
    1932             : 
    1933             :                 /*
    1934             :                  * We were woken up, thus go and try to harvest some events.
    1935             :                  * If timed out and still on the wait queue, recheck eavail
    1936             :                  * carefully under lock, below.
    1937             :                  */
    1938    28707952 :                 eavail = 1;
    1939             : 
    1940    28707952 :                 if (!list_empty_careful(&wait.entry)) {
    1941        9213 :                         write_lock_irq(&ep->lock);
    1942             :                         /*
    1943             :                          * If the thread timed out and is not on the wait queue,
    1944             :                          * it means that the thread was woken up after its
    1945             :                          * timeout expired before it could reacquire the lock.
    1946             :                          * Thus, when wait.entry is empty, it needs to harvest
    1947             :                          * events.
    1948             :                          */
    1949        8624 :                         if (timed_out)
    1950           0 :                                 eavail = list_empty(&wait.entry);
    1951        8624 :                         __remove_wait_queue(&ep->wq, &wait);
    1952        8624 :                         write_unlock_irq(&ep->lock);
    1953             :                 }
    1954             :         }
    1955             : }
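The three timeout cases in the kernel-doc above map onto the millisecond argument of epoll_wait(): -1 corresponds to @timeout == NULL (block), 0 to a zero timespec (poll once and return), and a positive value is turned into an absolute expiry by ep_timeout_to_timespec(). An illustrative wrapper (wait_up_to is a hypothetical helper), assuming the caller accepts that retrying after EINTR restarts the full relative timeout.

#include <errno.h>
#include <sys/epoll.h>

static int wait_up_to(int epfd, struct epoll_event *events, int maxevents, int ms)
{
	int n;

	do {
		n = epoll_wait(epfd, events, maxevents, ms);
	} while (n < 0 && errno == EINTR);	/* ep_poll() returned -EINTR */

	return n;
}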
    1956             : 
    1957             : /**
    1958             :  * ep_loop_check_proc - verify that adding an epoll file inside another
    1959             :  *                      epoll structure does not violate the constraints, in
    1960             :  *                      terms of closed loops, or too deep chains (which can
    1961             :  *                      result in excessive stack usage).
    1962             :  *
    1963             :  * @ep: the &struct eventpoll to be currently checked.
    1964             :  * @depth: Current depth of the path being checked.
    1965             :  *
    1966             :  * Return: %zero if adding the epoll @file inside current epoll
    1967             :  *          structure @ep does not violate the constraints, or %-1 otherwise.
    1968             :  */
    1969        5026 : static int ep_loop_check_proc(struct eventpoll *ep, int depth)
    1970             : {
    1971        5026 :         int error = 0;
    1972        5026 :         struct rb_node *rbp;
    1973        5026 :         struct epitem *epi;
    1974             : 
    1975        5026 :         mutex_lock_nested(&ep->mtx, depth + 1);
    1976        5026 :         ep->gen = loop_check_gen;
    1977       15078 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1978       10052 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1979       10052 :                 if (unlikely(is_file_epoll(epi->ffd.file))) {
    1980           0 :                         struct eventpoll *ep_tovisit;
    1981           0 :                         ep_tovisit = epi->ffd.file->private_data;
    1982           0 :                         if (ep_tovisit->gen == loop_check_gen)
    1983           0 :                                 continue;
    1984           0 :                         if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
    1985             :                                 error = -1;
    1986             :                         else
    1987           0 :                                 error = ep_loop_check_proc(ep_tovisit, depth + 1);
    1988           0 :                         if (error != 0)
    1989             :                                 break;
    1990             :                 } else {
    1991             :                         /*
    1992             :                          * If we've reached a file that is not associated with
    1993             :                          * an ep, then we need to check if the newly added
    1994             :                          * links are going to add too many wakeup paths. We do
    1995             :                          * this by adding it to the tfile_check_list, if it's
    1996             :                          * not already there, and calling reverse_path_check()
    1997             :                          * during ep_insert().
    1998             :                          */
    1999       10052 :                         list_file(epi->ffd.file);
    2000             :                 }
    2001             :         }
    2002        5026 :         mutex_unlock(&ep->mtx);
    2003             : 
    2004        5026 :         return error;
    2005             : }
    2006             : 
    2007             : /**
    2008             :  * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
    2009             :  *                 into another epoll file (represented by @ep) does not create
    2010             :  *                 closed loops or too deep chains.
    2011             :  *
    2012             :  * @ep: Pointer to the epoll we are inserting into.
    2013             :  * @to: Pointer to the epoll to be inserted.
    2014             :  *
    2015             :  * Return: %zero if adding the epoll @to inside the epoll @ep
    2016             :  * does not violate the constraints, or %-1 otherwise.
    2017             :  */
    2018             : static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
    2019             : {
    2020        5026 :         inserting_into = ep;
    2021        5026 :         return ep_loop_check_proc(to, 0);
    2022             : }
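
For context, a minimal userspace sketch of what this loop check rejects: if epoll fd `a` already watches epoll fd `b`, then adding `a` into `b` would close a cycle and epoll_ctl() fails with ELOOP (adding an epoll fd directly to itself is caught earlier and returns EINVAL). This is an illustrative sketch, not derived from the coverage data.

/* Illustrative: closing a cycle of epoll fds is rejected with ELOOP. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int a = epoll_create1(0);
        int b = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };

        /* a watches b ... */
        ev.data.fd = b;
        epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);

        /* ... so inserting a into b would form a loop: expect ELOOP. */
        ev.data.fd = a;
        if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0)
                printf("rejected: %s\n", strerror(errno));

        close(a);
        close(b);
        return 0;
}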
    2023             : 
    2024        5026 : static void clear_tfile_check_list(void)
    2025             : {
    2026        5026 :         rcu_read_lock();
    2027       15078 :         while (tfile_check_list != EP_UNACTIVE_PTR) {
    2028       10052 :                 struct epitems_head *head = tfile_check_list;
    2029       10052 :                 tfile_check_list = head->next;
    2030       10052 :                 unlist_file(head);
    2031             :         }
    2032        5026 :         rcu_read_unlock();
    2033        5026 : }
    2034             : 
    2035             : /*
    2036             :  * Open an eventpoll file descriptor.
    2037             :  */
    2038       86584 : static int do_epoll_create(int flags)
    2039             : {
    2040       86584 :         int error, fd;
    2041       86584 :         struct eventpoll *ep = NULL;
    2042       86584 :         struct file *file;
    2043             : 
    2044             :         /* Check the EPOLL_* constant for consistency.  */
    2045       86584 :         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    2046             : 
    2047       86584 :         if (flags & ~EPOLL_CLOEXEC)
    2048             :                 return -EINVAL;
    2049             :         /*
    2050             :          * Create the internal data structure ("struct eventpoll").
    2051             :          */
    2052       86584 :         error = ep_alloc(&ep);
    2053       86581 :         if (error < 0)
    2054             :                 return error;
    2055             :         /*
    2056             :          * Creates all the items needed to set up an eventpoll file. That is,
    2057             :          * a file structure and a free file descriptor.
    2058             :          */
    2059       86581 :         fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    2060       86583 :         if (fd < 0) {
    2061           0 :                 error = fd;
    2062           0 :                 goto out_free_ep;
    2063             :         }
    2064       86583 :         file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
    2065             :                                  O_RDWR | (flags & O_CLOEXEC));
    2066       86583 :         if (IS_ERR(file)) {
    2067           0 :                 error = PTR_ERR(file);
    2068           0 :                 goto out_free_fd;
    2069             :         }
    2070       86583 :         ep->file = file;
    2071       86583 :         fd_install(fd, file);
    2072       86583 :         return fd;
    2073             : 
    2074             : out_free_fd:
    2075           0 :         put_unused_fd(fd);
    2076           0 : out_free_ep:
    2077           0 :         ep_clear_and_put(ep);
    2078           0 :         return error;
    2079             : }
    2080             : 
    2081      173166 : SYSCALL_DEFINE1(epoll_create1, int, flags)
    2082             : {
    2083       86582 :         return do_epoll_create(flags);
    2084             : }
    2085             : 
    2086           0 : SYSCALL_DEFINE1(epoll_create, int, size)
    2087             : {
    2088           0 :         if (size <= 0)
    2089             :                 return -EINVAL;
    2090             : 
    2091           0 :         return do_epoll_create(0);
    2092             : }
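
As a usage note for the two creation syscalls above: epoll_create1() accepts only EPOLL_CLOEXEC and rejects any other flag with EINVAL, while the legacy epoll_create() ignores its size argument beyond requiring it to be positive. A small illustrative sketch:

/* Illustrative use of epoll_create1()/epoll_create(). */
#include <assert.h>
#include <errno.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int ep = epoll_create1(EPOLL_CLOEXEC);          /* preferred interface */
        assert(ep >= 0);

        /* Any flag other than EPOLL_CLOEXEC is rejected. */
        assert(epoll_create1(0x1) == -1 && errno == EINVAL);

        /* The legacy size argument only has to be positive. */
        assert(epoll_create(0) == -1 && errno == EINVAL);

        close(ep);
        return 0;
}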
    2093             : 
    2094             : #ifdef CONFIG_PM_SLEEP
    2095     1123711 : static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
    2096             : {
    2097     1123711 :         if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
    2098           0 :                 epev->events &= ~EPOLLWAKEUP;
    2099     1123711 : }
    2100             : #else
    2101             : static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
    2102             : {
    2103             :         epev->events &= ~EPOLLWAKEUP;
    2104             : }
    2105             : #endif
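
The helper above encodes the policy that EPOLLWAKEUP is a privileged no-op rather than an error: without CAP_BLOCK_SUSPEND the bit is silently cleared (and unconditionally cleared when CONFIG_PM_SLEEP is off), so the epoll_ctl() call still succeeds but no wakeup source is held. A hedged userspace sketch:

/* Illustrative: EPOLLWAKEUP is stripped, not rejected, for unprivileged callers. */
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        int ep = epoll_create1(0);
        int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
        struct epoll_event ev = {
                .events = EPOLLIN | EPOLLWAKEUP,  /* cleared unless CAP_BLOCK_SUSPEND */
                .data.fd = tfd,
        };

        if (epoll_ctl(ep, EPOLL_CTL_ADD, tfd, &ev) == 0)
                puts("added; a wakeup source is held only with CAP_BLOCK_SUSPEND");

        close(tfd);
        close(ep);
        return 0;
}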
    2106             : 
    2107     1649499 : static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
    2108             :                                    bool nonblock)
    2109             : {
    2110     1649499 :         if (!nonblock) {
    2111     1649499 :                 mutex_lock_nested(mutex, depth);
    2112     1649499 :                 return 0;
    2113             :         }
    2114           0 :         if (mutex_trylock(mutex))
    2115           0 :                 return 0;
    2116             :         return -EAGAIN;
    2117             : }
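
epoll_mutex_lock() is a plain trylock-or-block helper: blocking callers take the mutex unconditionally, while the nonblock path (for callers that must not sleep) returns -EAGAIN instead of waiting. A generic pthread sketch of the same pattern, with hypothetical names:

/* Sketch of the trylock-or-block pattern, analogous to epoll_mutex_lock(). */
#include <errno.h>
#include <pthread.h>

static int lock_maybe_nonblock(pthread_mutex_t *m, int nonblock)
{
        if (!nonblock) {
                pthread_mutex_lock(m);          /* blocking path: sleep until held */
                return 0;
        }
        if (pthread_mutex_trylock(m) == 0)
                return 0;                       /* acquired without sleeping */
        return -EAGAIN;                         /* busy: let the caller retry later */
}

int main(void)
{
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

        lock_maybe_nonblock(&m, 1);             /* trylock path */
        pthread_mutex_unlock(&m);
        return 0;
}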
    2118             : 
    2119     2016082 : int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
    2120             :                  bool nonblock)
    2121             : {
    2122     2016082 :         int error;
    2123     2016082 :         int full_check = 0;
    2124     2016082 :         struct fd f, tf;
    2125     2016082 :         struct eventpoll *ep;
    2126     2016082 :         struct epitem *epi;
    2127     2016082 :         struct eventpoll *tep = NULL;
    2128             : 
    2129     2016082 :         error = -EBADF;
    2130     2016082 :         f = fdget(epfd);
    2131     2016142 :         if (!f.file)
    2132           0 :                 goto error_return;
    2133             : 
    2134             :         /* Get the "struct file *" for the target file */
    2135     2016142 :         tf = fdget(fd);
    2136     2016257 :         if (!tf.file)
    2137           0 :                 goto error_fput;
    2138             : 
    2139             :         /* The target file descriptor must support poll */
    2140     2016257 :         error = -EPERM;
    2141     2016257 :         if (!file_can_poll(tf.file))
    2142      376696 :                 goto error_tgt_fput;
    2143             : 
    2144             :         /* Check if EPOLLWAKEUP is allowed */
    2145     1639561 :         if (ep_op_has_event(op))
    2146     1123683 :                 ep_take_care_of_epollwakeup(epds);
    2147             : 
    2148             :         /*
    2149             :          * We have to check that the file structure underneath the file descriptor
    2150             :          * the user passed to us _is_ an eventpoll file. We also do not permit
    2151             :          * adding an epoll file descriptor inside itself.
    2152             :          */
    2153     1639576 :         error = -EINVAL;
    2154     1639576 :         if (f.file == tf.file || !is_file_epoll(f.file))
    2155           0 :                 goto error_tgt_fput;
    2156             : 
    2157             :         /*
    2158             :          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
    2159             :          * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
    2160             :          * Also, we do not currently support nested exclusive wakeups.
    2161             :          */
    2162     1639576 :         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
    2163           0 :                 if (op == EPOLL_CTL_MOD)
    2164           0 :                         goto error_tgt_fput;
    2165           0 :                 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
    2166           0 :                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
    2167           0 :                         goto error_tgt_fput;
    2168             :         }
    2169             : 
    2170             :         /*
    2171             :          * At this point it is safe to assume that the "private_data" contains
    2172             :          * our own data structure.
    2173             :          */
    2174     1639576 :         ep = f.file->private_data;
    2175             : 
    2176             :         /*
    2177             :          * When we insert an epoll file descriptor inside another epoll file
    2178             :          * descriptor, there is the chance of creating closed loops, which are
    2179             :          * better handled here than in more critical paths. While we are
    2180             :          * checking for loops we also determine the list of files reachable
    2181             :          * and hang them on the tfile_check_list, so we can check that we
    2182             :          * haven't created too many possible wakeup paths.
    2183             :          *
    2184             :          * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD when
    2185             :          * the epoll file descriptor is attaching directly to a wakeup source,
    2186             :          * unless the epoll file descriptor is nested. The purpose of taking the
    2187             :          * 'epnested_mutex' on add is to prevent complex topologies such as loops and
    2188             :          * deep wakeup paths from forming in parallel through multiple
    2189             :          * EPOLL_CTL_ADD operations.
    2190             :          */
    2191     1639576 :         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2192     1639723 :         if (error)
    2193           0 :                 goto error_tgt_fput;
    2194     1639723 :         if (op == EPOLL_CTL_ADD) {
    2195      620482 :                 if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
    2196             :                     is_file_epoll(tf.file)) {
    2197        5045 :                         mutex_unlock(&ep->mtx);
    2198        4999 :                         error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
    2199        5026 :                         if (error)
    2200           0 :                                 goto error_tgt_fput;
    2201        5026 :                         loop_check_gen++;
    2202        5026 :                         full_check = 1;
    2203        5026 :                         if (is_file_epoll(tf.file)) {
    2204        5026 :                                 tep = tf.file->private_data;
    2205        5026 :                                 error = -ELOOP;
    2206        5026 :                                 if (ep_loop_check(ep, tep) != 0)
    2207           0 :                                         goto error_tgt_fput;
    2208             :                         }
    2209        5026 :                         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2210        5026 :                         if (error)
    2211           0 :                                 goto error_tgt_fput;
    2212             :                 }
    2213             :         }
    2214             : 
    2215             :         /*
    2216             :          * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
    2217             :          * above, we can be sure to be able to use the item looked up by
    2218             :          * ep_find() till we release the mutex.
    2219             :          */
    2220     1639704 :         epi = ep_find(ep, tf.file, fd);
    2221             : 
    2222     1639679 :         error = -EINVAL;
    2223     1639679 :         switch (op) {
    2224      620484 :         case EPOLL_CTL_ADD:
    2225      620484 :                 if (!epi) {
    2226      620483 :                         epds->events |= EPOLLERR | EPOLLHUP;
    2227      620483 :                         error = ep_insert(ep, epds, tf.file, fd, full_check);
    2228             :                 } else
    2229             :                         error = -EEXIST;
    2230             :                 break;
    2231      515846 :         case EPOLL_CTL_DEL:
    2232      515846 :                 if (epi) {
    2233             :                         /*
    2234             :                          * The eventpoll itself is still alive: the refcount
    2235             :                          * can't go to zero here.
    2236             :                          */
    2237      515846 :                         ep_remove_safe(ep, epi);
    2238      515846 :                         error = 0;
    2239             :                 } else {
    2240             :                         error = -ENOENT;
    2241             :                 }
    2242             :                 break;
    2243      503349 :         case EPOLL_CTL_MOD:
    2244      503349 :                 if (epi) {
    2245      503349 :                         if (!(epi->event.events & EPOLLEXCLUSIVE)) {
    2246      503337 :                                 epds->events |= EPOLLERR | EPOLLHUP;
    2247      503337 :                                 error = ep_modify(ep, epi, epds);
    2248             :                         }
    2249             :                 } else
    2250             :                         error = -ENOENT;
    2251             :                 break;
    2252             :         }
    2253     1639608 :         mutex_unlock(&ep->mtx);
    2254             : 
    2255             : error_tgt_fput:
    2256     2015905 :         if (full_check) {
    2257        5026 :                 clear_tfile_check_list();
    2258        5026 :                 loop_check_gen++;
    2259        5026 :                 mutex_unlock(&epnested_mutex);
    2260             :         }
    2261             : 
    2262     2015905 :         fdput(tf);
    2263     2015905 : error_fput:
    2264     2015905 :         fdput(f);
    2265     2015905 : error_return:
    2266             : 
    2267     2015905 :         return error;
    2268             : }
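
To make the error paths above concrete, an illustrative userspace run of the three operations: a regular file cannot be watched (EPERM, since it fails file_can_poll()), adding the same fd twice yields EEXIST, and MOD/DEL on an fd that was never added yields ENOENT. The temporary file path is only an example.

/* Illustrative epoll_ctl() error semantics. */
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int ep = epoll_create1(0);
        int pfd[2];
        /* Example path: a regular file, which does not support poll. */
        int reg = open("/tmp/epoll-demo", O_CREAT | O_RDWR, 0600);
        struct epoll_event ev = { .events = EPOLLIN };

        pipe(pfd);

        ev.data.fd = reg;
        assert(epoll_ctl(ep, EPOLL_CTL_ADD, reg, &ev) == -1 && errno == EPERM);

        ev.data.fd = pfd[0];
        assert(epoll_ctl(ep, EPOLL_CTL_ADD, pfd[0], &ev) == 0);
        assert(epoll_ctl(ep, EPOLL_CTL_ADD, pfd[0], &ev) == -1 && errno == EEXIST);
        assert(epoll_ctl(ep, EPOLL_CTL_MOD, pfd[1], &ev) == -1 && errno == ENOENT);
        assert(epoll_ctl(ep, EPOLL_CTL_DEL, pfd[0], NULL) == 0);

        close(reg);
        close(pfd[0]);
        close(pfd[1]);
        close(ep);
        return 0;
}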
    2269             : 
    2270             : /*
    2271             :  * The following function implements the controller interface for
    2272             :  * the eventpoll file that enables the insertion/removal/change of
    2273             :  * file descriptors inside the interest set.
    2274             :  */
    2275     4032169 : SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
    2276             :                 struct epoll_event __user *, event)
    2277             : {
    2278     2016064 :         struct epoll_event epds;
    2279             : 
    2280     3327978 :         if (ep_op_has_event(op) &&
    2281             :             copy_from_user(&epds, event, sizeof(struct epoll_event)))
    2282             :                 return -EFAULT;
    2283             : 
    2284     2016043 :         return do_epoll_ctl(epfd, op, fd, &epds, false);
    2285             : }
    2286             : 
    2287             : /*
    2288             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2289             :  * part of the user space epoll_wait(2).
    2290             :  */
    2291    46969026 : static int do_epoll_wait(int epfd, struct epoll_event __user *events,
    2292             :                          int maxevents, struct timespec64 *to)
    2293             : {
    2294    46969026 :         int error;
    2295    46969026 :         struct fd f;
    2296    46969026 :         struct eventpoll *ep;
    2297             : 
    2298             :         /* The maximum number of events must be greater than zero */
    2299    46969026 :         if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    2300             :                 return -EINVAL;
    2301             : 
    2302             :         /* Verify that the area passed by the user is writeable */
    2303    46969026 :         if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
    2304             :                 return -EFAULT;
    2305             : 
    2306             :         /* Get the "struct file *" for the eventpoll file */
    2307    46968739 :         f = fdget(epfd);
    2308    46965709 :         if (!f.file)
    2309             :                 return -EBADF;
    2310             : 
    2311             :         /*
    2312             :          * We have to check that the file structure underneath the fd
    2313             :          * the user passed to us _is_ an eventpoll file.
    2314             :          */
    2315    46965709 :         error = -EINVAL;
    2316    46965709 :         if (!is_file_epoll(f.file))
    2317           0 :                 goto error_fput;
    2318             : 
    2319             :         /*
    2320             :          * At this point it is safe to assume that the "private_data" contains
    2321             :          * our own data structure.
    2322             :          */
    2323    46965709 :         ep = f.file->private_data;
    2324             : 
    2325             :         /* Time to fish for events ... */
    2326    46965709 :         error = ep_poll(ep, events, maxevents, to);
    2327             : 
    2328    46890087 : error_fput:
    2329    46890087 :         fdput(f);
    2330    46890093 :         return error;
    2331             : }
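
For reference, a short sketch of the argument checking performed above: maxevents must lie in (0, EP_MAX_EVENTS], and with nothing registered a zero timeout simply returns 0. Illustrative only:

/* Illustrative epoll_wait() argument validation. */
#include <assert.h>
#include <errno.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int ep = epoll_create1(0);
        struct epoll_event evs[8];

        /* maxevents <= 0 is rejected before anything else happens. */
        assert(epoll_wait(ep, evs, 0, 0) == -1 && errno == EINVAL);
        assert(epoll_wait(ep, evs, -1, 0) == -1 && errno == EINVAL);

        /* Nothing registered and a zero timeout: returns 0 immediately. */
        assert(epoll_wait(ep, evs, 8, 0) == 0);

        close(ep);
        return 0;
}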
    2332             : 
    2333           0 : SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
    2334             :                 int, maxevents, int, timeout)
    2335             : {
    2336           0 :         struct timespec64 to;
    2337             : 
    2338           0 :         return do_epoll_wait(epfd, events, maxevents,
    2339             :                              ep_timeout_to_timespec(&to, timeout));
    2340             : }
    2341             : 
    2342             : /*
    2343             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2344             :  * part of the user space epoll_pwait(2).
    2345             :  */
    2346    46965104 : static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
    2347             :                           int maxevents, struct timespec64 *to,
    2348             :                           const sigset_t __user *sigmask, size_t sigsetsize)
    2349             : {
    2350    46965104 :         int error;
    2351             : 
    2352             :         /*
    2353             :          * If the caller wants a certain signal mask to be set during the wait,
    2354             :          * we apply it here.
    2355             :          */
    2356    46965104 :         error = set_user_sigmask(sigmask, sigsetsize);
    2357    46966720 :         if (error)
    2358             :                 return error;
    2359             : 
    2360    46965963 :         error = do_epoll_wait(epfd, events, maxevents, to);
    2361             : 
    2362    46894780 :         restore_saved_sigmask_unless(error == -EINTR);
    2363             : 
    2364    46894780 :         return error;
    2365             : }
    2366             : 
    2367    93933661 : SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
    2368             :                 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
    2369             :                 size_t, sigsetsize)
    2370             : {
    2371    46966031 :         struct timespec64 to;
    2372             : 
    2373    46966031 :         return do_epoll_pwait(epfd, events, maxevents,
    2374             :                               ep_timeout_to_timespec(&to, timeout),
    2375             :                               sigmask, sigsetsize);
    2376             : }
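
The sigmask handling above is what lets epoll_pwait() close the classic signal race: the caller keeps a signal blocked during normal processing and unblocks it atomically only for the duration of the wait. An illustrative sketch with SIGINT:

/* Illustrative epoll_pwait(): unblock SIGINT only while waiting. */
#include <signal.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

static void on_int(int sig) { (void)sig; }

int main(void)
{
        int ep = epoll_create1(0);
        struct epoll_event evs[8];
        sigset_t block, waitmask;

        signal(SIGINT, on_int);

        /* Keep SIGINT blocked while doing normal work... */
        sigemptyset(&block);
        sigaddset(&block, SIGINT);
        sigprocmask(SIG_BLOCK, &block, &waitmask);

        /* ...and let it through only inside the wait itself. */
        sigdelset(&waitmask, SIGINT);
        if (epoll_pwait(ep, evs, 8, 1000, &waitmask) == -1)
                perror("epoll_pwait");  /* EINTR if SIGINT arrived during the wait */

        close(ep);
        return 0;
}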
    2377             : 
    2378           0 : SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
    2379             :                 int, maxevents, const struct __kernel_timespec __user *, timeout,
    2380             :                 const sigset_t __user *, sigmask, size_t, sigsetsize)
    2381             : {
    2382           0 :         struct timespec64 ts, *to = NULL;
    2383             : 
    2384           0 :         if (timeout) {
    2385           0 :                 if (get_timespec64(&ts, timeout))
    2386             :                         return -EFAULT;
    2387           0 :                 to = &ts;
    2388           0 :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2389             :                         return -EINVAL;
    2390             :         }
    2391             : 
    2392           0 :         return do_epoll_pwait(epfd, events, maxevents, to,
    2393             :                               sigmask, sigsetsize);
    2394             : }
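
epoll_pwait2() differs from epoll_pwait() only in taking a struct timespec, so sub-millisecond timeouts survive the syscall boundary, and a NULL timeout blocks indefinitely. A minimal sketch (availability of the libc wrapper depends on the glibc version):

/* Illustrative epoll_pwait2() with a 250-microsecond timeout. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/epoll.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        int ep = epoll_create1(0);
        struct epoll_event evs[8];
        struct timespec to = { .tv_sec = 0, .tv_nsec = 250 * 1000 };  /* 250 us */

        int n = epoll_pwait2(ep, evs, 8, &to, NULL);  /* NULL sigmask keeps the current mask */
        printf("%d events\n", n);

        close(ep);
        return 0;
}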
    2395             : 
    2396             : #ifdef CONFIG_COMPAT
    2397             : static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
    2398             :                                  int maxevents, struct timespec64 *timeout,
    2399             :                                  const compat_sigset_t __user *sigmask,
    2400             :                                  compat_size_t sigsetsize)
    2401             : {
    2402             :         long err;
    2403             : 
    2404             :         /*
    2405             :          * If the caller wants a certain signal mask to be set during the wait,
    2406             :          * we apply it here.
    2407             :          */
    2408             :         err = set_compat_user_sigmask(sigmask, sigsetsize);
    2409             :         if (err)
    2410             :                 return err;
    2411             : 
    2412             :         err = do_epoll_wait(epfd, events, maxevents, timeout);
    2413             : 
    2414             :         restore_saved_sigmask_unless(err == -EINTR);
    2415             : 
    2416             :         return err;
    2417             : }
    2418             : 
    2419             : COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
    2420             :                        struct epoll_event __user *, events,
    2421             :                        int, maxevents, int, timeout,
    2422             :                        const compat_sigset_t __user *, sigmask,
    2423             :                        compat_size_t, sigsetsize)
    2424             : {
    2425             :         struct timespec64 to;
    2426             : 
    2427             :         return do_compat_epoll_pwait(epfd, events, maxevents,
    2428             :                                      ep_timeout_to_timespec(&to, timeout),
    2429             :                                      sigmask, sigsetsize);
    2430             : }
    2431             : 
    2432             : COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
    2433             :                        struct epoll_event __user *, events,
    2434             :                        int, maxevents,
    2435             :                        const struct __kernel_timespec __user *, timeout,
    2436             :                        const compat_sigset_t __user *, sigmask,
    2437             :                        compat_size_t, sigsetsize)
    2438             : {
    2439             :         struct timespec64 ts, *to = NULL;
    2440             : 
    2441             :         if (timeout) {
    2442             :                 if (get_timespec64(&ts, timeout))
    2443             :                         return -EFAULT;
    2444             :                 to = &ts;
    2445             :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2446             :                         return -EINVAL;
    2447             :         }
    2448             : 
    2449             :         return do_compat_epoll_pwait(epfd, events, maxevents, to,
    2450             :                                      sigmask, sigsetsize);
    2451             : }
    2452             : 
    2453             : #endif
    2454             : 
    2455           0 : static int __init eventpoll_init(void)
    2456             : {
    2457           0 :         struct sysinfo si;
    2458             : 
    2459           0 :         si_meminfo(&si);
    2460             :         /*
    2461             :          * Allows top 4% of lowmem to be allocated for epoll watches (per user).
    2462             :          */
    2463           0 :         max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
    2464             :                 EP_ITEM_COST;
    2465           0 :         BUG_ON(max_user_watches < 0);
    2466             : 
    2467             :         /*
    2468             :          * We can have many thousands of epitems, so prevent this from
    2469             :          * using an extra cache line on 64-bit (and smaller) CPUs
    2470             :          */
    2471           0 :         BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
    2472             : 
    2473             :         /* Allocates slab cache used to allocate "struct epitem" items */
    2474           0 :         epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
    2475             :                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2476             : 
    2477             :         /* Allocates slab cache used to allocate "struct eppoll_entry" */
    2478           0 :         pwq_cache = kmem_cache_create("eventpoll_pwq",
    2479             :                 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2480           0 :         epoll_sysctls_init();
    2481             : 
    2482           0 :         ephead_cache = kmem_cache_create("ep_head",
    2483             :                 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2484             : 
    2485           0 :         return 0;
    2486             : }
    2487             : fs_initcall(eventpoll_init);
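
For a rough sense of the sizing above: dividing lowmem by 25 reserves 4% of it, and EP_ITEM_COST is the per-watch footprint (sizeof(struct epitem) plus sizeof(struct eppoll_entry)). A back-of-the-envelope sketch with assumed figures, since the real values depend on kernel config and architecture:

/* Back-of-the-envelope reproduction of the max_user_watches sizing. */
#include <stdio.h>

int main(void)
{
        /* Assumed figures: 4 GiB of lowmem and ~200 bytes per watch; the real
         * numbers come from si_meminfo() and the structure sizes at build time. */
        unsigned long long lowmem_bytes = 4ULL << 30;
        unsigned long long ep_item_cost = 200;

        unsigned long long max_watches = (lowmem_bytes / 25) / ep_item_cost;
        printf("~%llu watches per user (about 4%% of lowmem)\n", max_watches);
        return 0;
}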

Generated by: LCOV version 1.14