LCOV - code coverage report
Current view: top level - fs/xfs - xfs_icache.c (source / functions)
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023
Date: 2023-07-31 20:08:22
Coverage:   Lines: 827 / 894 (92.5 %)   Functions: 64 / 65 (98.5 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_trans_priv.h"
      16             : #include "xfs_inode_item.h"
      17             : #include "xfs_quota.h"
      18             : #include "xfs_trace.h"
      19             : #include "xfs_icache.h"
      20             : #include "xfs_bmap_util.h"
      21             : #include "xfs_dquot_item.h"
      22             : #include "xfs_dquot.h"
      23             : #include "xfs_reflink.h"
      24             : #include "xfs_ialloc.h"
      25             : #include "xfs_ag.h"
      26             : #include "xfs_log_priv.h"
      27             : 
      28             : #include <linux/iversion.h>
      29             : 
      30             : /* Radix tree tags for incore inode tree. */
      31             : 
      32             : /* inode is to be reclaimed */
      33             : #define XFS_ICI_RECLAIM_TAG     0
      34             : /* Inode has speculative preallocations (posteof or cow) to clean. */
      35             : #define XFS_ICI_BLOCKGC_TAG     1
      36             : 
      37             : /*
      38             :  * The goal for walking incore inodes.  These can correspond with incore inode
      39             :  * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
      40             :  */
      41             : enum xfs_icwalk_goal {
      42             :         /* Goals directly associated with tagged inodes. */
      43             :         XFS_ICWALK_BLOCKGC      = XFS_ICI_BLOCKGC_TAG,
      44             :         XFS_ICWALK_RECLAIM      = XFS_ICI_RECLAIM_TAG,
      45             : };
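/*
 * Illustrative sketch (not part of the original file): because the walk
 * goals above deliberately share values with the radix tree tags, a walker
 * can derive the tag from the goal with a plain cast.  The helper name is
 * hypothetical.
 */
static inline unsigned int example_icwalk_tag(enum xfs_icwalk_goal goal)
{
        /* XFS_ICWALK_BLOCKGC -> XFS_ICI_BLOCKGC_TAG, and so on */
        return (unsigned int)goal;
}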
      46             : 
      47             : static int xfs_icwalk(struct xfs_mount *mp,
      48             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      49             : static int xfs_icwalk_ag(struct xfs_perag *pag,
      50             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      51             : 
      52             : /*
      53             :  * Private inode cache walk flags for struct xfs_icwalk.  Must not
      54             :  * coincide with XFS_ICWALK_FLAGS_VALID.
      55             :  */
      56             : 
      57             : /* Stop scanning after icw_scan_limit inodes. */
      58             : #define XFS_ICWALK_FLAG_SCAN_LIMIT      (1U << 28)
      59             : 
      60             : #define XFS_ICWALK_FLAG_RECLAIM_SICK    (1U << 27)
      61             : #define XFS_ICWALK_FLAG_UNION           (1U << 26) /* union filter algorithm */
      62             : 
      63             : #define XFS_ICWALK_PRIVATE_FLAGS        (XFS_ICWALK_FLAG_SCAN_LIMIT | \
      64             :                                          XFS_ICWALK_FLAG_RECLAIM_SICK | \
      65             :                                          XFS_ICWALK_FLAG_UNION)
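/*
 * Illustrative sketch: the "must not coincide" requirement above could be
 * enforced at build time.  This assertion is hypothetical and not in the
 * file; XFS_ICWALK_FLAGS_VALID is the public flag mask from xfs_icache.h.
 */
static_assert((XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID) == 0,
              "private icwalk flags must not overlap the valid flag mask");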
      66             : 
      67             : /*
      68             :  * Allocate and initialise an xfs_inode.
      69             :  */
      70             : struct xfs_inode *
      71   440000898 : xfs_inode_alloc(
      72             :         struct xfs_mount        *mp,
      73             :         xfs_ino_t               ino)
      74             : {
      75   440000898 :         struct xfs_inode        *ip;
      76             : 
      77             :         /*
      78             :          * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
      79             :          * and return NULL here on ENOMEM.
      80             :          */
      81   440000898 :         ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
      82             : 
      83   440350090 :         if (inode_init_always(mp->m_super, VFS_I(ip))) {
      84           0 :                 kmem_cache_free(xfs_inode_cache, ip);
      85           0 :                 return NULL;
      86             :         }
      87             : 
      88             :         /* VFS doesn't initialise i_mode or i_state! */
      89   440371359 :         VFS_I(ip)->i_mode = 0;
      90   440371359 :         VFS_I(ip)->i_state = 0;
      91   440371359 :         mapping_set_large_folios(VFS_I(ip)->i_mapping);
      92             : 
      93   440375746 :         XFS_STATS_INC(mp, vn_active);
      94   439983615 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
      95   439983615 :         ASSERT(ip->i_ino == 0);
      96             : 
      97             :         /* initialise the xfs inode */
      98   439983615 :         ip->i_ino = ino;
      99   439983615 :         ip->i_mount = mp;
     100   439983615 :         memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
     101   439983615 :         ip->i_cowfp = NULL;
     102   439983615 :         memset(&ip->i_af, 0, sizeof(ip->i_af));
     103   439983615 :         ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
     104   439983615 :         memset(&ip->i_df, 0, sizeof(ip->i_df));
     105   439983615 :         ip->i_flags = 0;
     106   439983615 :         ip->i_delayed_blks = 0;
     107   439983615 :         ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
     108   439983615 :         ip->i_nblocks = 0;
     109   439983615 :         ip->i_forkoff = 0;
     110   439983615 :         ip->i_sick = 0;
     111   439983615 :         ip->i_checked = 0;
     112   439983615 :         INIT_WORK(&ip->i_ioend_work, xfs_end_io);
     113   439983615 :         INIT_LIST_HEAD(&ip->i_ioend_list);
     114   439983615 :         spin_lock_init(&ip->i_ioend_lock);
     115   440325282 :         ip->i_next_unlinked = NULLAGINO;
     116   440325282 :         ip->i_prev_unlinked = NULLAGINO;
     117             : 
     118   440325282 :         return ip;
     119             : }
     120             : 
     121             : STATIC void
     122   439514671 : xfs_inode_free_callback(
     123             :         struct rcu_head         *head)
     124             : {
     125   439514671 :         struct inode            *inode = container_of(head, struct inode, i_rcu);
     126   439514671 :         struct xfs_inode        *ip = XFS_I(inode);
     127             : 
     128   439514671 :         switch (VFS_I(ip)->i_mode & S_IFMT) {
     129   282098078 :         case S_IFREG:
     130             :         case S_IFDIR:
     131             :         case S_IFLNK:
     132   282098078 :                 xfs_idestroy_fork(&ip->i_df);
     133   282098078 :                 break;
     134             :         }
     135             : 
     136   439525497 :         xfs_ifork_zap_attr(ip);
     137             : 
     138   439883299 :         if (ip->i_cowfp) {
     139    97073260 :                 xfs_idestroy_fork(ip->i_cowfp);
     140    97069355 :                 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
     141             :         }
     142   439776026 :         if (ip->i_itemp) {
     143    57732866 :                 ASSERT(!test_bit(XFS_LI_IN_AIL,
     144             :                                  &ip->i_itemp->ili_item.li_flags));
     145    57732866 :                 xfs_inode_item_destroy(ip);
     146    57773738 :                 ip->i_itemp = NULL;
     147             :         }
     148             : 
     149   439816898 :         kmem_cache_free(xfs_inode_cache, ip);
     150   439785164 : }
     151             : 
     152             : static void
     153   440580202 : __xfs_inode_free(
     154             :         struct xfs_inode        *ip)
     155             : {
     156             :         /* asserts to verify all state is correct here */
     157   440580202 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
     158   440580202 :         ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
     159   440580202 :         XFS_STATS_DEC(ip->i_mount, vn_active);
     160             : 
     161   440580234 :         call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
     162   440580569 : }
     163             : 
     164             : void
     165      704913 : xfs_inode_free(
     166             :         struct xfs_inode        *ip)
     167             : {
     168     1409824 :         ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
     169             : 
     170             :         /*
     171             :          * Because we use RCU freeing we need to ensure the inode always
     172             :          * appears to be reclaimed with an invalid inode number when in the
     173             :          * free state. The ip->i_flags_lock provides the barrier against lookup
     174             :          * races.
     175             :          */
     176      704911 :         spin_lock(&ip->i_flags_lock);
     177      704912 :         ip->i_flags = XFS_IRECLAIM;
     178      704912 :         ip->i_ino = 0;
     179      704912 :         spin_unlock(&ip->i_flags_lock);
     180             : 
     181      704912 :         __xfs_inode_free(ip);
     182      704910 : }
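/*
 * Illustrative sketch of the lookup-side counterpart to the freeing protocol
 * above: a reader that finds an inode in the radix tree under rcu_read_lock()
 * must take i_flags_lock and re-check i_ino, because the inode can be marked
 * free (i_ino cleared to 0) before the RCU grace period expires.  The fields
 * and flags are real; the function itself is hypothetical.
 */
static bool example_rcu_lookup_is_valid(struct xfs_inode *ip, xfs_ino_t ino)
{
        bool    valid;

        spin_lock(&ip->i_flags_lock);
        /* i_ino is zeroed under this lock in xfs_inode_free() */
        valid = ip->i_ino == ino && !(ip->i_flags & XFS_IRECLAIM);
        spin_unlock(&ip->i_flags_lock);
        return valid;
}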
     183             : 
     184             : /*
     185             :  * Queue background inode reclaim work if there are reclaimable inodes and there
     186             :  * isn't reclaim work already scheduled or in progress.
     187             :  */
     188             : static void
     189     3931602 : xfs_reclaim_work_queue(
     190             :         struct xfs_mount        *mp)
     191             : {
     192             : 
     193     3931602 :         rcu_read_lock();
     194     3931094 :         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
     195     3922059 :                 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
     196     3922912 :                         msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
     197             :         }
     198     3931773 :         rcu_read_unlock();
     199     3929744 : }
     200             : 
     201             : /*
     202             :  * Background scanning to trim preallocated space. This is queued based on the
     203             :  * 'speculative_prealloc_lifetime' tunable (5m by default).
     204             :  */
     205             : static inline void
     206     3163122 : xfs_blockgc_queue(
     207             :         struct xfs_perag        *pag)
     208             : {
     209     3163122 :         struct xfs_mount        *mp = pag->pag_mount;
     210             : 
     211     6326244 :         if (!xfs_is_blockgc_enabled(mp))
     212             :                 return;
     213             : 
     214     3163006 :         rcu_read_lock();
     215     3162920 :         if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
     216     3120323 :                 queue_delayed_work(pag->pag_mount->m_blockgc_wq,
     217             :                                    &pag->pag_blockgc_work,
     218     3120338 :                                    msecs_to_jiffies(xfs_blockgc_secs * 1000));
     219     3162968 :         rcu_read_unlock();
     220             : }
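/*
 * Illustrative sketch: queue_delayed_work() takes its delay in jiffies,
 * while the xfs_blockgc_secs tunable is in seconds, hence the
 * msecs_to_jiffies(secs * 1000) conversion above.  An equivalent, purely
 * hypothetical helper:
 */
static inline unsigned long example_blockgc_delay(void)
{
        return msecs_to_jiffies(xfs_blockgc_secs * 1000);       /* 5m default */
}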
     221             : 
     222             : /* Set a tag on both the AG incore inode tree and the AG radix tree. */
     223             : static void
     224  1091685323 : xfs_perag_set_inode_tag(
     225             :         struct xfs_perag        *pag,
     226             :         xfs_agino_t             agino,
     227             :         unsigned int            tag)
     228             : {
     229  1091685323 :         struct xfs_mount        *mp = pag->pag_mount;
     230  1091685323 :         bool                    was_tagged;
     231             : 
     232  1091685323 :         lockdep_assert_held(&pag->pag_ici_lock);
     233             : 
     234  1091685323 :         was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
     235  1091765141 :         radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
     236             : 
     237  1092001645 :         if (tag == XFS_ICI_RECLAIM_TAG)
     238  1086995575 :                 pag->pag_ici_reclaimable++;
     239             : 
     240  1092001645 :         if (was_tagged)
     241             :                 return;
     242             : 
     243             :         /* propagate the tag up into the perag radix tree */
     244     4282022 :         spin_lock(&mp->m_perag_lock);
     245     4289417 :         radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
     246     4289417 :         spin_unlock(&mp->m_perag_lock);
     247             : 
     248             :         /* start background work */
     249     4289407 :         switch (tag) {
     250     3855408 :         case XFS_ICI_RECLAIM_TAG:
     251     3855408 :                 xfs_reclaim_work_queue(mp);
     252     3855408 :                 break;
     253      433999 :         case XFS_ICI_BLOCKGC_TAG:
     254      433999 :                 xfs_blockgc_queue(pag);
     255      433999 :                 break;
     256             :         }
     257             : 
     258     4282064 :         trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
     259             : }
     260             : 
     261             : /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
     262             : static void
     263  1106448209 : xfs_perag_clear_inode_tag(
     264             :         struct xfs_perag        *pag,
     265             :         xfs_agino_t             agino,
     266             :         unsigned int            tag)
     267             : {
     268  1106448209 :         struct xfs_mount        *mp = pag->pag_mount;
     269             : 
     270  1106448209 :         lockdep_assert_held(&pag->pag_ici_lock);
     271             : 
     272             :         /*
     273             :          * Reclaim can signal (with a null agino) that it cleared its own tag
     274             :          * by removing the inode from the radix tree.
     275             :          */
     276  1106448209 :         if (agino != NULLAGINO)
     277   666572524 :                 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
     278             :         else
     279   439875685 :                 ASSERT(tag == XFS_ICI_RECLAIM_TAG);
     280             : 
     281  1106388970 :         if (tag == XFS_ICI_RECLAIM_TAG)
     282  1087385245 :                 pag->pag_ici_reclaimable--;
     283             : 
     284  1106388970 :         if (radix_tree_tagged(&pag->pag_ici_root, tag))
     285             :                 return;
     286             : 
     287             :         /* clear the tag from the perag radix tree */
     288    15684093 :         spin_lock(&mp->m_perag_lock);
     289    15751171 :         radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
     290    15751171 :         spin_unlock(&mp->m_perag_lock);
     291             : 
     292    15750442 :         trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
     293             : }
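/*
 * Illustrative sketch: the point of propagating tags into the per-mount
 * perag tree above is that walkers can skip untagged AGs entirely.  The loop
 * shape below mirrors xfs_reclaim_inodes_count() at the bottom of this file;
 * this particular function is hypothetical.
 */
static void example_walk_tagged_ags(struct xfs_mount *mp, unsigned int tag)
{
        struct xfs_perag        *pag;
        xfs_agnumber_t          agno = 0;

        while ((pag = xfs_perag_get_tag(mp, agno, tag))) {
                agno = pag->pag_agno + 1;
                /* ... process one tagged AG ... */
                xfs_perag_put(pag);
        }
}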
     294             : 
     295             : /*
     296             :  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
      297             :  * part of the structure. This is made more complex by the fact that we store
     298             :  * information about the on-disk values in the VFS inode and so we can't just
     299             :  * overwrite the values unconditionally. Hence we save the parameters we
     300             :  * need to retain across reinitialisation, and rewrite them into the VFS inode
     301             :  * after reinitialisation even if it fails.
     302             :  */
     303             : static int
     304   647263808 : xfs_reinit_inode(
     305             :         struct xfs_mount        *mp,
     306             :         struct inode            *inode)
     307             : {
     308   647263808 :         int                     error;
     309   647263808 :         uint32_t                nlink = inode->i_nlink;
     310   647263808 :         uint32_t                generation = inode->i_generation;
     311   647263808 :         uint64_t                version = inode_peek_iversion(inode);
     312   647263808 :         umode_t                 mode = inode->i_mode;
     313   647263808 :         dev_t                   dev = inode->i_rdev;
     314   647263808 :         kuid_t                  uid = inode->i_uid;
     315   647263808 :         kgid_t                  gid = inode->i_gid;
     316             : 
     317   647263808 :         error = inode_init_always(mp->m_super, inode);
     318             : 
     319   647132291 :         set_nlink(inode, nlink);
     320   647055063 :         inode->i_generation = generation;
     321   647055063 :         inode_set_iversion_queried(inode, version);
     322   647055063 :         inode->i_mode = mode;
     323   647055063 :         inode->i_rdev = dev;
     324   647055063 :         inode->i_uid = uid;
     325   647055063 :         inode->i_gid = gid;
     326   647055063 :         mapping_set_large_folios(inode->i_mapping);
     327   647037720 :         return error;
     328             : }
     329             : 
     330             : /*
     331             :  * Carefully nudge an inode whose VFS state has been torn down back into a
     332             :  * usable state.  Drops the i_flags_lock and the rcu read lock.
     333             :  */
     334             : static int
     335   646234668 : xfs_iget_recycle(
     336             :         struct xfs_perag        *pag,
     337             :         struct xfs_inode        *ip) __releases(&ip->i_flags_lock)
     338             : {
     339   646234668 :         struct xfs_mount        *mp = ip->i_mount;
     340   646234668 :         struct inode            *inode = VFS_I(ip);
     341   646234668 :         int                     error;
     342             : 
     343   646234668 :         trace_xfs_iget_recycle(ip);
     344             : 
     345   646179328 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     346             :                 return -EAGAIN;
     347             : 
     348             :         /*
     349             :          * We need to make it look like the inode is being reclaimed to prevent
     350             :          * the actual reclaim workers from stomping over us while we recycle
     351             :          * the inode.  We can't clear the radix tree tag yet as it requires
     352             :          * pag_ici_lock to be held exclusive.
     353             :          */
     354   646510033 :         ip->i_flags |= XFS_IRECLAIM;
     355             : 
     356   646510033 :         spin_unlock(&ip->i_flags_lock);
     357   647336062 :         rcu_read_unlock();
     358             : 
     359   647170089 :         ASSERT(!rwsem_is_locked(&inode->i_rwsem));
     360   647170089 :         error = xfs_reinit_inode(mp, inode);
     361   647044853 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     362   646732441 :         if (error) {
     363             :                 /*
     364             :                  * Re-initializing the inode failed, and we are in deep
     365             :                  * trouble.  Try to re-add it to the reclaim list.
     366             :                  */
     367           0 :                 rcu_read_lock();
     368           0 :                 spin_lock(&ip->i_flags_lock);
     369           0 :                 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
     370           0 :                 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
     371           0 :                 spin_unlock(&ip->i_flags_lock);
     372           0 :                 rcu_read_unlock();
     373             : 
     374           0 :                 trace_xfs_iget_recycle_fail(ip);
     375           0 :                 return error;
     376             :         }
     377             : 
     378   646732441 :         spin_lock(&pag->pag_ici_lock);
     379   647472105 :         spin_lock(&ip->i_flags_lock);
     380             : 
     381             :         /*
     382             :          * Clear the per-lifetime state in the inode as we are now effectively
     383             :          * a new inode and need to return to the initial state before reuse
     384             :          * occurs.
     385             :          */
     386   647586623 :         ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
     387   647586623 :         ip->i_flags |= XFS_INEW;
     388   647586623 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
     389             :                         XFS_ICI_RECLAIM_TAG);
     390   647242642 :         inode->i_state = I_NEW;
     391  1294485284 :         spin_unlock(&ip->i_flags_lock);
     392   647626233 :         spin_unlock(&pag->pag_ici_lock);
     393             : 
     394   647626233 :         return 0;
     395             : }
     396             : 
     397             : /*
     398             :  * If we are allocating a new inode, then check what was returned is
     399             :  * actually a free, empty inode. If we are not allocating an inode,
     400             :  * then check we didn't find a free inode.
     401             :  *
     402             :  * Returns:
     403             :  *      0               if the inode free state matches the lookup context
     404             :  *      -ENOENT         if the inode is free and we are not allocating
     405             :  *      -EFSCORRUPTED   if there is any state mismatch at all
     406             :  */
     407             : static int
     408 57475212577 : xfs_iget_check_free_state(
     409             :         struct xfs_inode        *ip,
     410             :         int                     flags)
     411             : {
     412 57475212577 :         if (flags & XFS_IGET_CREATE) {
     413             :                 /* should be a free inode */
     414    84539389 :                 if (VFS_I(ip)->i_mode != 0) {
     415           0 :                         xfs_warn(ip->i_mount,
     416             : "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
     417             :                                 ip->i_ino, VFS_I(ip)->i_mode);
     418           0 :                         return -EFSCORRUPTED;
     419             :                 }
     420             : 
     421    84539389 :                 if (ip->i_nblocks != 0) {
     422           0 :                         xfs_warn(ip->i_mount,
     423             : "Corruption detected! Free inode 0x%llx has blocks allocated!",
     424             :                                 ip->i_ino);
     425           0 :                         return -EFSCORRUPTED;
     426             :                 }
     427             :                 return 0;
     428             :         }
     429             : 
     430             :         /* should be an allocated inode */
     431 57390673188 :         if (VFS_I(ip)->i_mode == 0)
     432     5272298 :                 return -ENOENT;
     433             : 
     434             :         return 0;
     435             : }
     436             : 
     437             : /* Make all pending inactivation work start immediately. */
     438             : static bool
     439    36573816 : xfs_inodegc_queue_all(
     440             :         struct xfs_mount        *mp)
     441             : {
     442    36573816 :         struct xfs_inodegc      *gc;
     443    36573816 :         int                     cpu;
     444    36573816 :         bool                    ret = false;
     445             : 
     446   183046942 :         for_each_online_cpu(cpu) {
     447   146247766 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     448   146420033 :                 if (!llist_empty(&gc->list)) {
     449     1593648 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
     450     1593648 :                         ret = true;
     451             :                 }
     452             :         }
     453             : 
     454    36792502 :         return ret;
     455             : }
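/*
 * Illustrative sketch of the producer side of the per-CPU inodegc lists
 * drained above.  The real producer, xfs_inodegc_queue() (not shown in this
 * section), also checks whether inodegc is enabled and batches work; this is
 * a simplification.  i_gclist, m_inodegc and m_inodegc_wq are real fields.
 */
static void example_inodegc_queue(struct xfs_inode *ip)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_inodegc      *gc;

        gc = get_cpu_ptr(mp->m_inodegc);        /* disables preemption */
        llist_add(&ip->i_gclist, &gc->list);    /* lock-free push */
        mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
        put_cpu_ptr(mp->m_inodegc);
}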
     456             : 
     457             : /* Wait for all queued work and collect errors */
     458             : static int
     459    14385999 : xfs_inodegc_wait_all(
     460             :         struct xfs_mount        *mp)
     461             : {
     462    14385999 :         int                     cpu;
     463    14385999 :         int                     error = 0;
     464             : 
     465    14385999 :         flush_workqueue(mp->m_inodegc_wq);
     466    87161387 :         for_each_online_cpu(cpu) {
     467    58218711 :                 struct xfs_inodegc      *gc;
     468             : 
     469    58218711 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     470    58218806 :                 if (gc->error && !error)
     471        2260 :                         error = gc->error;
     472    58218806 :                 gc->error = 0;
     473             :         }
     474             : 
     475    14555902 :         return error;
     476             : }
     477             : 
     478             : /*
      479             :  * Check the validity of the inode we just found in the cache
     480             :  */
     481             : static int
     482 56867034145 : xfs_iget_cache_hit(
     483             :         struct xfs_perag        *pag,
     484             :         struct xfs_inode        *ip,
     485             :         xfs_ino_t               ino,
     486             :         int                     flags,
     487             :         int                     lock_flags) __releases(RCU)
     488             : {
     489 56867034145 :         struct inode            *inode = VFS_I(ip);
     490 56867034145 :         struct xfs_mount        *mp = ip->i_mount;
     491 56867034145 :         int                     error;
     492             : 
     493             :         /*
     494             :          * check for re-use of an inode within an RCU grace period due to the
     495             :          * radix tree nodes not being updated yet. We monitor for this by
     496             :          * setting the inode number to zero before freeing the inode structure.
     497             :          * If the inode has been reallocated and set up, then the inode number
     498             :          * will not match, so check for that, too.
     499             :          */
     500 56867034145 :         spin_lock(&ip->i_flags_lock);
     501 57896732367 :         if (ip->i_ino != ino)
     502          22 :                 goto out_skip;
     503             : 
     504             :         /*
     505             :          * If we are racing with another cache hit that is currently
     506             :          * instantiating this inode or currently recycling it out of
     507             :          * reclaimable state, wait for the initialisation to complete
     508             :          * before continuing.
     509             :          *
     510             :          * If we're racing with the inactivation worker we also want to wait.
     511             :          * If we're creating a new file, it's possible that the worker
     512             :          * previously marked the inode as free on disk but hasn't finished
     513             :          * updating the incore state yet.  The AGI buffer will be dirty and
     514             :          * locked to the icreate transaction, so a synchronous push of the
     515             :          * inodegc workers would result in deadlock.  For a regular iget, the
     516             :          * worker is running already, so we might as well wait.
     517             :          *
     518             :          * XXX(hch): eventually we should do something equivalent to
     519             :          *           wait_on_inode to wait for these flags to be cleared
     520             :          *           instead of polling for it.
     521             :          */
     522 57896732345 :         if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
     523      295833 :                 goto out_skip;
     524             : 
     525 57896436512 :         if (ip->i_flags & XFS_NEED_INACTIVE) {
     526             :                 /* Unlinked inodes cannot be re-grabbed. */
     527     2856477 :                 if (VFS_I(ip)->i_nlink == 0) {
     528     2833530 :                         error = -ENOENT;
     529     2833530 :                         goto out_error;
     530             :                 }
     531       22947 :                 goto out_inodegc_flush;
     532             :         }
     533             : 
     534             :         /*
     535             :          * Check the inode free state is valid. This also detects lookup
     536             :          * racing with unlinks.
     537             :          */
     538 57893580035 :         error = xfs_iget_check_free_state(ip, flags);
     539 56632720115 :         if (error)
     540     5272308 :                 goto out_error;
     541             : 
     542             :         /* Skip inodes that have no vfs state. */
     543 56627447807 :         if ((flags & XFS_IGET_INCORE) &&
     544  2174606004 :             (ip->i_flags & XFS_IRECLAIMABLE))
     545       91620 :                 goto out_skip;
     546             : 
     547             :         /* The inode fits the selection criteria; process it. */
     548 56627356187 :         if (ip->i_flags & XFS_IRECLAIMABLE) {
     549             :                 /* Drops i_flags_lock and RCU read lock. */
     550   646877741 :                 error = xfs_iget_recycle(pag, ip);
     551   647552802 :                 if (error == -EAGAIN)
     552           3 :                         goto out_skip;
     553   647552799 :                 if (error)
     554             :                         return error;
     555             :         } else {
     556             :                 /* If the VFS inode is being torn down, pause and try again. */
     557 55980478446 :                 if (!igrab(inode))
     558      274002 :                         goto out_skip;
     559             : 
     560             :                 /* We've got a live one. */
     561 56557780567 :                 spin_unlock(&ip->i_flags_lock);
     562 56962432408 :                 rcu_read_unlock();
     563 56956404452 :                 trace_xfs_iget_hit(ip);
     564             :         }
     565             : 
     566 56557118906 :         if (lock_flags != 0)
     567 51918372692 :                 xfs_ilock(ip, lock_flags);
     568             : 
     569 57098688360 :         if (!(flags & XFS_IGET_INCORE))
     570 54949339390 :                 xfs_iflags_clear(ip, XFS_ISTALE);
     571 57231245964 :         XFS_STATS_INC(mp, xs_ig_found);
     572             : 
     573 56582939682 :         return 0;
     574             : 
     575      661480 : out_skip:
     576      661480 :         trace_xfs_iget_skip(ip);
     577      661187 :         XFS_STATS_INC(mp, xs_ig_frecycle);
     578      661131 :         error = -EAGAIN;
     579     8766969 : out_error:
     580     8766969 :         spin_unlock(&ip->i_flags_lock);
     581     8767575 :         rcu_read_unlock();
     582     8767575 :         return error;
     583             : 
     584             : out_inodegc_flush:
     585       22947 :         spin_unlock(&ip->i_flags_lock);
     586       22946 :         rcu_read_unlock();
     587             :         /*
     588             :          * Do not wait for the workers, because the caller could hold an AGI
     589             :          * buffer lock.  We're just going to sleep in a loop anyway.
     590             :          */
     591       45892 :         if (xfs_is_inodegc_enabled(mp))
     592       22942 :                 xfs_inodegc_queue_all(mp);
     593             :         return -EAGAIN;
     594             : }
     595             : 
     596             : static int
     597   440042033 : xfs_iget_cache_miss(
     598             :         struct xfs_mount        *mp,
     599             :         struct xfs_perag        *pag,
      600             :         struct xfs_trans        *tp,
     601             :         xfs_ino_t               ino,
     602             :         struct xfs_inode        **ipp,
     603             :         int                     flags,
     604             :         int                     lock_flags)
     605             : {
     606   440042033 :         struct xfs_inode        *ip;
     607   440042033 :         int                     error;
     608   440042033 :         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
     609   440042033 :         int                     iflags;
     610             : 
     611   440042033 :         ip = xfs_inode_alloc(mp, ino);
     612   439992057 :         if (!ip)
     613             :                 return -ENOMEM;
     614             : 
     615   439992057 :         error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
     616   440009329 :         if (error)
     617       69189 :                 goto out_destroy;
     618             : 
     619             :         /*
     620             :          * For version 5 superblocks, if we are initialising a new inode and we
     621             :          * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
     622             :          * simply build the new inode core with a random generation number.
     623             :          *
     624             :          * For version 4 (and older) superblocks, log recovery is dependent on
     625             :          * the i_flushiter field being initialised from the current on-disk
     626             :          * value and hence we must also read the inode off disk even when
     627             :          * initializing new inodes.
     628             :          */
     629   439940140 :         if (xfs_has_v3inodes(mp) &&
     630   439938042 :             (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
     631    47694716 :                 VFS_I(ip)->i_generation = get_random_u32();
     632             :         } else {
     633   392245424 :                 struct xfs_buf          *bp;
     634             : 
     635   392245424 :                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
     636   392479599 :                 if (error)
     637        7340 :                         goto out_destroy;
     638             : 
     639   392452600 :                 error = xfs_inode_from_disk(ip,
     640   392473364 :                                 xfs_buf_offset(bp, ip->i_imap.im_boffset));
     641   392470386 :                 if (!error)
     642   392469239 :                         xfs_buf_set_ref(bp, XFS_INO_REF);
     643   392468031 :                 xfs_trans_brelse(tp, bp);
     644             : 
     645   392506997 :                 if (error)
     646        1105 :                         goto out_destroy;
     647             :         }
     648             : 
     649   440053999 :         trace_xfs_iget_miss(ip);
     650             : 
     651             :         /*
     652             :          * Check the inode free state is valid. This also detects lookup
     653             :          * racing with unlinks.
     654             :          */
     655   439761049 :         error = xfs_iget_check_free_state(ip, flags);
     656   439797657 :         if (error)
     657           0 :                 goto out_destroy;
     658             : 
     659             :         /*
     660             :          * Preload the radix tree so we can insert safely under the
     661             :          * write spinlock. Note that we cannot sleep inside the preload
     662             :          * region. Since we can be called from transaction context, don't
     663             :          * recurse into the file system.
     664             :          */
     665   439797657 :         if (radix_tree_preload(GFP_NOFS)) {
     666           0 :                 error = -EAGAIN;
     667           0 :                 goto out_destroy;
     668             :         }
     669             : 
     670             :         /*
     671             :          * Because the inode hasn't been added to the radix-tree yet it can't
     672             :          * be found by another thread, so we can do the non-sleeping lock here.
     673             :          */
     674   439920349 :         if (lock_flags) {
     675   420766308 :                 if (!xfs_ilock_nowait(ip, lock_flags))
     676           0 :                         BUG();
     677             :         }
     678             : 
     679             :         /*
     680             :          * These values must be set before inserting the inode into the radix
     681             :          * tree as the moment it is inserted a concurrent lookup (allowed by the
     682             :          * RCU locking mechanism) can find it and that lookup must see that this
     683             :          * is an inode currently under construction (i.e. that XFS_INEW is set).
     684             :          * The ip->i_flags_lock that protects the XFS_INEW flag forms the
     685             :          * memory barrier that ensures this detection works correctly at lookup
     686             :          * time.
     687             :          */
     688   440086118 :         iflags = XFS_INEW;
     689   440086118 :         if (flags & XFS_IGET_DONTCACHE)
     690   373392656 :                 d_mark_dontcache(VFS_I(ip));
     691   440099841 :         ip->i_udquot = NULL;
     692   440099841 :         ip->i_gdquot = NULL;
     693   440099841 :         ip->i_pdquot = NULL;
     694   440099841 :         xfs_iflags_set(ip, iflags);
     695             : 
     696             :         /* insert the new inode */
     697   440280062 :         spin_lock(&pag->pag_ici_lock);
     698   440387201 :         error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
     699   439983340 :         if (unlikely(error)) {
     700      628384 :                 WARN_ON(error != -EEXIST);
     701      628384 :                 XFS_STATS_INC(mp, xs_ig_dup);
     702      628384 :                 error = -EAGAIN;
     703      628384 :                 goto out_preload_end;
     704             :         }
     705   439354956 :         spin_unlock(&pag->pag_ici_lock);
     706   439533426 :         radix_tree_preload_end();
     707             : 
     708   439240768 :         *ipp = ip;
     709   439240768 :         return 0;
     710             : 
     711             : out_preload_end:
     712      628384 :         spin_unlock(&pag->pag_ici_lock);
     713      628384 :         radix_tree_preload_end();
     714      628384 :         if (lock_flags)
     715      624204 :                 xfs_iunlock(ip, lock_flags);
     716        4180 : out_destroy:
     717      704913 :         __destroy_inode(VFS_I(ip));
     718      704913 :         xfs_inode_free(ip);
     719      704913 :         return error;
     720             : }
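/*
 * Illustrative sketch of the preload/insert protocol used above, reduced to
 * its skeleton: radix_tree_preload() pre-allocates tree nodes so that the
 * insert under the spinlock cannot sleep, and the preload must be ended on
 * every path.  Hypothetical stand-alone helper:
 */
static int example_insert_inode(struct xfs_perag *pag, xfs_agino_t agino,
                struct xfs_inode *ip)
{
        int     error;

        error = radix_tree_preload(GFP_NOFS);   /* may sleep; no fs recursion */
        if (error)
                return error;

        spin_lock(&pag->pag_ici_lock);
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();               /* re-enables preemption */

        return error;                           /* -EEXIST if we lost a race */
}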
     721             : 
     722             : /*
     723             :  * Look up an inode by number in the given file system.  The inode is looked up
     724             :  * in the cache held in each AG.  If the inode is found in the cache, initialise
     725             :  * the vfs inode if necessary.
     726             :  *
     727             :  * If it is not in core, read it in from the file system's device, add it to the
     728             :  * cache and initialise the vfs inode.
     729             :  *
     730             :  * The inode is locked according to the value of the lock_flags parameter.
     731             :  * Inode lookup is only done during metadata operations and not as part of the
     732             :  * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
     733             :  */
     734             : int
     735 57388208611 : xfs_iget(
     736             :         struct xfs_mount        *mp,
     737             :         struct xfs_trans        *tp,
     738             :         xfs_ino_t               ino,
     739             :         uint                    flags,
     740             :         uint                    lock_flags,
     741             :         struct xfs_inode        **ipp)
     742             : {
     743 57388208611 :         struct xfs_inode        *ip;
     744 57388208611 :         struct xfs_perag        *pag;
     745 57388208611 :         xfs_agino_t             agino;
     746 57388208611 :         int                     error;
     747             : 
     748 57388208611 :         ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
     749             : 
     750             :         /* reject inode numbers outside existing AGs */
     751 57388208611 :         if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
     752      678300 :                 return -EINVAL;
     753             : 
     754 57387530311 :         XFS_STATS_INC(mp, xs_ig_attempts);
     755             : 
     756             :         /* get the perag structure and ensure that it's inode capable */
     757 56545365969 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
     758 57465699680 :         agino = XFS_INO_TO_AGINO(mp, ino);
     759             : 
     760 57466902101 : again:
     761 57466902101 :         error = 0;
     762 57466902101 :         rcu_read_lock();
     763 57466850878 :         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
     764             : 
     765 57890359641 :         if (ip) {
     766 57391499309 :                 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
     767 56390322019 :                 if (error)
     768     8790187 :                         goto out_error_or_again;
     769             :         } else {
     770   498860332 :                 rcu_read_unlock();
     771   498871335 :                 if (flags & XFS_IGET_INCORE) {
     772    58677694 :                         error = -ENODATA;
     773    58677694 :                         goto out_error_or_again;
     774             :                 }
     775   440193641 :                 XFS_STATS_INC(mp, xs_ig_missed);
     776             : 
     777   439972431 :                 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
     778             :                                                         flags, lock_flags);
     779   439965060 :                 if (error)
     780      704910 :                         goto out_error_or_again;
     781             :         }
     782 56820791982 :         xfs_perag_put(pag);
     783             : 
     784 58009233559 :         *ipp = ip;
     785             : 
     786             :         /*
     787             :          * If we have a real type for an on-disk inode, we can setup the inode
     788             :          * now.  If it's a new inode being created, xfs_init_new_inode will
     789             :          * handle it.
     790             :          */
     791 >11631*10^7 :         if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
     792  1001988506 :                 xfs_setup_existing_inode(ip);
     793             :         return 0;
     794             : 
     795    68172791 : out_error_or_again:
     796    68172791 :         if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
     797             :             error == -EAGAIN) {
     798     1218010 :                 delay(1);
     799     1202421 :                 goto again;
     800             :         }
     801    66954781 :         xfs_perag_put(pag);
     802    66954781 :         return error;
     803             : }
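/*
 * Illustrative sketch of a typical xfs_iget() caller; the function is
 * hypothetical, the flags and lock modes are real.  The caller gets back a
 * referenced inode, locked as requested, and must drop both when done.
 */
static int example_lookup(struct xfs_mount *mp, xfs_ino_t ino)
{
        struct xfs_inode        *ip;
        int                     error;

        error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED,
                        &ip);
        if (error)
                return error;   /* e.g. -EINVAL, -ENOENT, -EFSCORRUPTED */

        /* ... use ip under XFS_ILOCK_SHARED ... */

        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        xfs_irele(ip);
        return 0;
}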
     804             : 
     805             : /*
     806             :  * "Is this a cached inode that's also allocated?"
     807             :  *
     808             :  * Look up an inode by number in the given file system.  If the inode is
     809             :  * in cache and isn't in purgatory, return 1 if the inode is allocated
     810             :  * and 0 if it is not.  For all other cases (not in cache, being torn
     811             :  * down, etc.), return a negative error code.
     812             :  *
     813             :  * The caller has to prevent inode allocation and freeing activity,
     814             :  * presumably by locking the AGI buffer.   This is to ensure that an
     815             :  * inode cannot transition from allocated to freed until the caller is
     816             :  * ready to allow that.  If the inode is in an intermediate state (new,
     817             :  * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
      818             :  * inode is not in the cache, -ENODATA will be returned.  The caller must
     819             :  * deal with these scenarios appropriately.
     820             :  *
     821             :  * This is a specialized use case for the online scrubber; if you're
     822             :  * reading this, you probably want xfs_iget.
     823             :  */
     824             : int
     825  2234123251 : xfs_icache_inode_is_allocated(
     826             :         struct xfs_mount        *mp,
     827             :         struct xfs_trans        *tp,
     828             :         xfs_ino_t               ino,
     829             :         bool                    *inuse)
     830             : {
     831  2234123251 :         struct xfs_inode        *ip;
     832  2234123251 :         int                     error;
     833             : 
     834  2234123251 :         error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
     835  2237612374 :         if (error)
     836             :                 return error;
     837             : 
     838  2177978466 :         *inuse = !!(VFS_I(ip)->i_mode);
     839  2177978466 :         xfs_irele(ip);
     840  2177978466 :         return 0;
     841             : }
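/*
 * Illustrative sketch of a scrub-style caller of the helper above.  Per the
 * comment, the caller must hold the AGI buffer locked so the inode cannot
 * change allocation state underneath it; this function is hypothetical.
 */
static int example_check_ino(struct xfs_mount *mp, xfs_ino_t ino)
{
        bool    inuse;
        int     error;

        error = xfs_icache_inode_is_allocated(mp, NULL, ino, &inuse);
        if (error == -ENODATA || error == -EAGAIN)
                return 0;       /* not cached, or in flux: check disk instead */
        if (error)
                return error;
        return inuse ? 1 : 0;
}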
     842             : 
     843             : /*
     844             :  * Grab the inode for reclaim exclusively.
     845             :  *
     846             :  * We have found this inode via a lookup under RCU, so the inode may have
     847             :  * already been freed, or it may be in the process of being recycled by
     848             :  * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
     849             :  * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
     850             :  * will not be set. Hence we need to check for both these flag conditions to
     851             :  * avoid inodes that are no longer reclaim candidates.
     852             :  *
     853             :  * Note: checking for other state flags here, under the i_flags_lock or not, is
     854             :  * racy and should be avoided. Those races should be resolved only after we have
     855             :  * ensured that we are able to reclaim this inode and the world can see that we
     856             :  * are going to reclaim it.
     857             :  *
     858             :  * Return true if we grabbed it, false otherwise.
     859             :  */
     860             : static bool
     861   456697242 : xfs_reclaim_igrab(
     862             :         struct xfs_inode        *ip,
     863             :         struct xfs_icwalk       *icw)
     864             : {
     865   456697242 :         ASSERT(rcu_read_lock_held());
     866             : 
     867   456697242 :         spin_lock(&ip->i_flags_lock);
     868   456698260 :         if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
     869             :             __xfs_iflags_test(ip, XFS_IRECLAIM)) {
     870             :                 /* not a reclaim candidate. */
     871       11973 :                 spin_unlock(&ip->i_flags_lock);
     872       11973 :                 return false;
     873             :         }
     874             : 
     875             :         /* Don't reclaim a sick inode unless the caller asked for it. */
     876   456686287 :         if (ip->i_sick &&
     877          33 :             (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
     878           0 :                 spin_unlock(&ip->i_flags_lock);
     879           0 :                 return false;
     880             :         }
     881             : 
     882   456686287 :         __xfs_iflags_set(ip, XFS_IRECLAIM);
     883   456686287 :         spin_unlock(&ip->i_flags_lock);
     884   456686287 :         return true;
     885             : }
     886             : 
     887             : /*
     888             :  * Inode reclaim is non-blocking, so the default action if progress cannot be
     889             :  * made is to "requeue" the inode for reclaim by unlocking it and clearing the
     890             :  * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
      891             :  * blocking anymore and hence we can block until the inode can be
      892             :  * reclaimed.
     893             :  *
     894             :  * We do no IO here - if callers require inodes to be cleaned they must push the
     895             :  * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
     896             :  * done in the background in a non-blocking manner, and enables memory reclaim
     897             :  * to make progress without blocking.
     898             :  */
     899             : static void
     900   456686034 : xfs_reclaim_inode(
     901             :         struct xfs_inode        *ip,
     902             :         struct xfs_perag        *pag)
     903             : {
     904   456686034 :         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
     905             : 
     906   456686034 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     907       10950 :                 goto out;
     908   456674299 :         if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
     909     8309420 :                 goto out_iunlock;
     910             : 
     911             :         /*
     912             :          * Check for log shutdown because aborting the inode can move the log
     913             :          * tail and corrupt in memory state. This is fine if the log is shut
     914             :          * down, but if the log is still active and only the mount is shut down
     915             :          * then the in-memory log tail movement caused by the abort can be
     916             :          * incorrectly propagated to disk.
     917             :          */
     918   896731848 :         if (xlog_is_shutdown(ip->i_mount->m_log)) {
     919   339864664 :                 xfs_iunpin_wait(ip);
     920   339864664 :                 xfs_iflush_shutdown_abort(ip);
     921   339864664 :                 goto reclaim;
     922             :         }
     923   108501260 :         if (xfs_ipincount(ip))
     924     3099242 :                 goto out_clear_flush;
     925   105402018 :         if (!xfs_inode_clean(ip))
     926     5391056 :                 goto out_clear_flush;
     927             : 
     928   100010962 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
     929   439875700 : reclaim:
     930   439875700 :         trace_xfs_inode_reclaiming(ip);
     931             : 
     932             :         /*
     933             :          * Because we use RCU freeing we need to ensure the inode always appears
     934             :          * to be reclaimed with an invalid inode number when in the free state.
     935             :          * We do this as early as possible under the ILOCK so that
     936             :          * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
     937             :          * detect races with us here. By doing this, we guarantee that once
     938             :          * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
     939             :          * it will see either a valid inode that will serialise correctly, or it
     940             :          * will see an invalid inode that it can skip.
     941             :          */
     942   439875548 :         spin_lock(&ip->i_flags_lock);
     943   439875791 :         ip->i_flags = XFS_IRECLAIM;
     944   439875791 :         ip->i_ino = 0;
     945   439875791 :         ip->i_sick = 0;
     946   439875791 :         ip->i_checked = 0;
     947   439875791 :         spin_unlock(&ip->i_flags_lock);
     948             : 
     949   439875767 :         ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
     950   439875767 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     951             : 
     952   439875424 :         XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
     953             :         /*
     954             :          * Remove the inode from the per-AG radix tree.
     955             :          *
     956             :          * Because radix_tree_delete won't complain even if the item was never
     957             :          * added to the tree assert that it's been there before to catch
     958             :          * problems with the inode life time early on.
     959             :          */
     960   439875452 :         spin_lock(&pag->pag_ici_lock);
     961   439875651 :         if (!radix_tree_delete(&pag->pag_ici_root,
     962   439875821 :                                 XFS_INO_TO_AGINO(ip->i_mount, ino)))
     963           0 :                 ASSERT(0);
     964   439875651 :         xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
     965   439875410 :         spin_unlock(&pag->pag_ici_lock);
     966             : 
     967             :         /*
     968             :          * Here we do an (almost) spurious inode lock in order to coordinate
     969             :          * with inode cache radix tree lookups.  This is because the lookup
     970             :          * can reference the inodes in the cache without taking references.
     971             :          *
     972             :          * We make that OK here by ensuring that we wait until the inode is
     973             :          * unlocked after the lookup before we go ahead and free it.
     974             :          */
     975   439875800 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     976   439875726 :         ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
     977   439875726 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     978   497681792 :         ASSERT(xfs_inode_clean(ip));
     979             : 
     980   439875535 :         __xfs_inode_free(ip);
     981   439875535 :         return;
     982             : 
     983     8490298 : out_clear_flush:
     984     8490298 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
     985    16799718 : out_iunlock:
     986    16799718 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     987    16810665 : out:
     988    16810665 :         xfs_iflags_clear(ip, XFS_IRECLAIM);
     989             : }
     990             : 
     991             : /* Reclaim sick inodes if we're unmounting or the fs went down. */
     992             : static inline bool
     993      107119 : xfs_want_reclaim_sick(
     994             :         struct xfs_mount        *mp)
     995             : {
     996      262123 :         return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
     997             :                xfs_is_shutdown(mp);
     998             : }
     999             : 
    1000             : void
    1001       59234 : xfs_reclaim_inodes(
    1002             :         struct xfs_mount        *mp)
    1003             : {
    1004       59234 :         struct xfs_icwalk       icw = {
    1005             :                 .icw_flags      = 0,
    1006             :         };
    1007             : 
    1008       59234 :         if (xfs_want_reclaim_sick(mp))
    1009       59234 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
    1010             : 
    1011      594400 :         while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
    1012      535166 :                 xfs_ail_push_all_sync(mp->m_ail);
    1013      535166 :                 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
    1014             :         }
    1015       59234 : }
    1016             : 
    1017             : /*
    1018             :  * The shrinker infrastructure determines how many inodes we should scan for
    1019             :  * reclaim. We want as many clean inodes ready to reclaim as possible, so we
    1020             :  * push the AIL here. We also want to proactively free up memory if we can to
    1021             :  * minimise the amount of work memory reclaim has to do so we kick the
    1022             :  * background reclaim if it isn't already scheduled.
    1023             :  */
    1024             : long
    1025       47885 : xfs_reclaim_inodes_nr(
    1026             :         struct xfs_mount        *mp,
    1027             :         unsigned long           nr_to_scan)
    1028             : {
    1029       47885 :         struct xfs_icwalk       icw = {
    1030             :                 .icw_flags      = XFS_ICWALK_FLAG_SCAN_LIMIT,
    1031       47885 :                 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
    1032             :         };
    1033             : 
    1034       47885 :         if (xfs_want_reclaim_sick(mp))
    1035           0 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
    1036             : 
    1037             :         /* kick background reclaimer and push the AIL */
    1038       47885 :         xfs_reclaim_work_queue(mp);
    1039       47885 :         xfs_ail_push_all(mp->m_ail);
    1040             : 
    1041       47885 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
    1042       47885 :         return 0;
    1043             : }
    1044             : 
    1045             : /*
    1046             :  * Return the number of reclaimable inodes in the filesystem for
    1047             :  * the shrinker to determine how much to reclaim.
    1048             :  */
    1049             : long
    1050      257854 : xfs_reclaim_inodes_count(
    1051             :         struct xfs_mount        *mp)
    1052             : {
    1053      257854 :         struct xfs_perag        *pag;
    1054      257854 :         xfs_agnumber_t          ag = 0;
    1055      257854 :         long                    reclaimable = 0;
    1056             : 
    1057      611921 :         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
    1058      354067 :                 ag = pag->pag_agno + 1;
    1059      354067 :                 reclaimable += pag->pag_ici_reclaimable;
    1060      354067 :                 xfs_perag_put(pag);
    1061             :         }
    1062      257855 :         return reclaimable;
    1063             : }
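
Neither of the two functions above is called directly by other XFS code on the
reclaim path; they answer the VFS superblock shrinker, which first asks how
many objects are reclaimable and then hands back a scan quota. A condensed
sketch of that wiring, patterned on fs/xfs/xfs_super.c in the same tree (treat
the exact hook and helper names as assumptions to verify there):

/* sketch of the super_operations hookup, after fs/xfs/xfs_super.c */
static long
xfs_fs_nr_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* report how many reclaimable inodes this mount holds */
	return xfs_reclaim_inodes_count(XFS_M(sb));
}

static long
xfs_fs_free_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* reclaim up to the shrinker's quota right now */
	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}

static const struct super_operations xfs_super_operations = {
	/* ... */
	.nr_cached_objects	= xfs_fs_nr_cached_objects,
	.free_cached_objects	= xfs_fs_free_cached_objects,
};
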
    1064             : 
    1065             : STATIC bool
    1066     2879084 : xfs_icwalk_match_id(
    1067             :         struct xfs_inode        *ip,
    1068             :         struct xfs_icwalk       *icw)
    1069             : {
    1070     2879084 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1071             :             !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1072             :                 return false;
    1073             : 
    1074     2879084 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1075             :             !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1076             :                 return false;
    1077             : 
    1078     2879084 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1079           0 :             ip->i_projid != icw->icw_prid)
    1080           0 :                 return false;
    1081             : 
    1082             :         return true;
    1083             : }
    1084             : 
    1085             : /*
    1086             :  * A union-based inode filtering algorithm. Process the inode if any of the
    1087             :  * criteria match. This is for global/internal scans only.
    1088             :  */
    1089             : STATIC bool
    1090        3228 : xfs_icwalk_match_id_union(
    1091             :         struct xfs_inode        *ip,
    1092             :         struct xfs_icwalk       *icw)
    1093             : {
    1094        3228 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1095             :             uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1096             :                 return true;
    1097             : 
    1098         148 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1099             :             gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1100             :                 return true;
    1101             : 
    1102           0 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1103           0 :             ip->i_projid == icw->icw_prid)
    1104           0 :                 return true;
    1105             : 
    1106             :         return false;
    1107             : }
    1108             : 
    1109             : /*
    1110             :  * Is this inode @ip eligible for eof/cow block reclamation, given some
    1111             :  * filtering parameters @icw?  The inode is eligible if @icw is null or
    1112             :  * if the predicate functions match.
    1113             :  */
    1114             : static bool
    1115     4338782 : xfs_icwalk_match(
    1116             :         struct xfs_inode        *ip,
    1117             :         struct xfs_icwalk       *icw)
    1118             : {
    1119     4338782 :         bool                    match;
    1120             : 
    1121     4338782 :         if (!icw)
    1122             :                 return true;
    1123             : 
    1124     2863667 :         if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
    1125        3225 :                 match = xfs_icwalk_match_id_union(ip, icw);
    1126             :         else
    1127     2860442 :                 match = xfs_icwalk_match_id(ip, icw);
    1128     2863667 :         if (!match)
    1129             :                 return false;
    1130             : 
    1131             :         /* skip the inode if the file size is too small */
    1132     2858091 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
    1133           0 :             XFS_ISIZE(ip) < icw->icw_min_file_size)
    1134           0 :                 return false;
    1135             : 
    1136             :         return true;
    1137             : }
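
These filter predicates are not purely internal: userspace can drive an
eofblocks scan through the XFS_IOC_FREE_EOFBLOCKS ioctl, whose struct
xfs_fs_eofblocks is translated into a struct xfs_icwalk before the walk runs.
A minimal userspace sketch, assuming the uapi definitions normally provided by
the xfsprogs headers (verify the header path and any privilege requirements on
your system):

/* userspace sketch: trim post-EOF preallocations for the calling uid */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* xfsprogs: struct xfs_fs_eofblocks et al. */

int main(int argc, char **argv)
{
	struct xfs_fs_eofblocks	eofb;
	int			fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any file on the XFS filesystem */
	if (fd < 0)
		return 1;

	memset(&eofb, 0, sizeof(eofb));
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID;
	eofb.eof_uid = getuid();	/* only scan this uid's inodes */

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");
	close(fd);
	return 0;
}
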
    1138             : 
    1139             : /*
    1140             :  * This is a fast pass over the inode cache to try to get reclaim moving on as
    1141             :  * many inodes as possible in a short period of time. It kicks itself every few
    1142             :  * seconds, as well as being kicked by the inode cache shrinker when memory
    1143             :  * goes low.
    1144             :  */
    1145             : void
    1146       32096 : xfs_reclaim_worker(
    1147             :         struct work_struct *work)
    1148             : {
    1149       32096 :         struct xfs_mount *mp = container_of(to_delayed_work(work),
    1150             :                                         struct xfs_mount, m_reclaim_work);
    1151             : 
    1152       32096 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
    1153       32096 :         xfs_reclaim_work_queue(mp);
    1154       32096 : }
    1155             : 
    1156             : STATIC int
    1157    27721429 : xfs_inode_free_eofblocks(
    1158             :         struct xfs_inode        *ip,
    1159             :         struct xfs_icwalk       *icw,
    1160             :         unsigned int            *lockflags)
    1161             : {
    1162    27721429 :         bool                    wait;
    1163             : 
    1164    27721429 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1165             : 
    1166    55752533 :         if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
    1167             :                 return 0;
    1168             : 
    1169             :         /*
     1170             :          * If the mapping is dirty, the operation can block and wait for some
    1171             :          * time. Unless we are waiting, skip it.
    1172             :          */
    1173     4695425 :         if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
    1174             :                 return 0;
    1175             : 
    1176     2707720 :         if (!xfs_icwalk_match(ip, icw))
    1177             :                 return 0;
    1178             : 
    1179             :         /*
    1180             :          * If the caller is waiting, return -EAGAIN to keep the background
    1181             :          * scanner moving and revisit the inode in a subsequent pass.
    1182             :          */
    1183     2707720 :         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1184     1777070 :                 if (wait)
    1185             :                         return -EAGAIN;
    1186       99592 :                 return 0;
    1187             :         }
    1188      957502 :         *lockflags |= XFS_IOLOCK_EXCL;
    1189             : 
    1190      957502 :         if (xfs_can_free_eofblocks(ip, false))
    1191      390626 :                 return xfs_free_eofblocks(ip);
    1192             : 
    1193             :         /* inode could be preallocated or append-only */
    1194      566851 :         trace_xfs_inode_free_eofblocks_invalid(ip);
    1195      566839 :         xfs_inode_clear_eofblocks_tag(ip);
    1196      566839 :         return 0;
    1197             : }
    1198             : 
    1199             : static void
    1200     8938691 : xfs_blockgc_set_iflag(
    1201             :         struct xfs_inode        *ip,
    1202             :         unsigned long           iflag)
    1203             : {
    1204     8938691 :         struct xfs_mount        *mp = ip->i_mount;
    1205     8938691 :         struct xfs_perag        *pag;
    1206             : 
    1207     8938691 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1208             : 
    1209             :         /*
    1210             :          * Don't bother locking the AG and looking up in the radix trees
    1211             :          * if we already know that we have the tag set.
    1212             :          */
    1213     8938691 :         if (ip->i_flags & iflag)
    1214             :                 return;
    1215     4877912 :         spin_lock(&ip->i_flags_lock);
    1216     4882995 :         ip->i_flags |= iflag;
    1217     4882995 :         spin_unlock(&ip->i_flags_lock);
    1218             : 
    1219     4882554 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1220     4888354 :         spin_lock(&pag->pag_ici_lock);
    1221             : 
    1222     4887794 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1223             :                         XFS_ICI_BLOCKGC_TAG);
    1224             : 
    1225     4881738 :         spin_unlock(&pag->pag_ici_lock);
    1226     4879879 :         xfs_perag_put(pag);
    1227             : }
    1228             : 
    1229             : void
    1230     4803750 : xfs_inode_set_eofblocks_tag(
    1231             :         xfs_inode_t     *ip)
    1232             : {
    1233     4803750 :         trace_xfs_inode_set_eofblocks_tag(ip);
    1234     4803504 :         return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
    1235             : }
    1236             : 
    1237             : static void
    1238    24970123 : xfs_blockgc_clear_iflag(
    1239             :         struct xfs_inode        *ip,
    1240             :         unsigned long           iflag)
    1241             : {
    1242    24970123 :         struct xfs_mount        *mp = ip->i_mount;
    1243    24970123 :         struct xfs_perag        *pag;
    1244    24970123 :         bool                    clear_tag;
    1245             : 
    1246    24970123 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1247             : 
    1248    24970123 :         spin_lock(&ip->i_flags_lock);
    1249    25024437 :         ip->i_flags &= ~iflag;
    1250    25024437 :         clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
    1251    25024437 :         spin_unlock(&ip->i_flags_lock);
    1252             : 
    1253    24997565 :         if (!clear_tag)
    1254             :                 return;
    1255             : 
    1256    19047261 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1257    19086664 :         spin_lock(&pag->pag_ici_lock);
    1258             : 
    1259    19087857 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1260             :                         XFS_ICI_BLOCKGC_TAG);
    1261             : 
    1262    19084015 :         spin_unlock(&pag->pag_ici_lock);
    1263    19097963 :         xfs_perag_put(pag);
    1264             : }
    1265             : 
    1266             : void
    1267    19330444 : xfs_inode_clear_eofblocks_tag(
    1268             :         xfs_inode_t     *ip)
    1269             : {
    1270    19330444 :         trace_xfs_inode_clear_eofblocks_tag(ip);
    1271    19275966 :         return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
    1272             : }
    1273             : 
    1274             : /*
    1275             :  * Set ourselves up to free CoW blocks from this file.  If it's already clean
    1276             :  * then we can bail out quickly, but otherwise we must back off if the file
    1277             :  * is undergoing some kind of write.
    1278             :  */
    1279             : static bool
    1280    23856404 : xfs_prep_free_cowblocks(
    1281             :         struct xfs_inode        *ip)
    1282             : {
    1283             :         /*
    1284             :          * Just clear the tag if we have an empty cow fork or none at all. It's
    1285             :          * possible the inode was fully unshared since it was originally tagged.
    1286             :          */
    1287    47712808 :         if (!xfs_inode_has_cow_data(ip)) {
    1288      302242 :                 trace_xfs_inode_free_cowblocks_invalid(ip);
    1289      301840 :                 xfs_inode_clear_cowblocks_tag(ip);
    1290      301840 :                 return false;
    1291             :         }
    1292             : 
    1293             :         /*
     1294             :          * If the mapping is dirty or under writeback, we cannot touch the
    1295             :          * CoW fork.  Leave it alone if we're in the midst of a directio.
    1296             :          */
    1297    26501882 :         if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
    1298     5894494 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
    1299     5066199 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
    1300             :             atomic_read(&VFS_I(ip)->i_dio_count))
    1301    21437709 :                 return false;
    1302             : 
    1303             :         return true;
    1304             : }
    1305             : 
    1306             : /*
    1307             :  * Automatic CoW Reservation Freeing
    1308             :  *
    1309             :  * These functions automatically garbage collect leftover CoW reservations
    1310             :  * that were made on behalf of a cowextsize hint when we start to run out
    1311             :  * of quota or when the reservations sit around for too long.  If the file
    1312             :  * has dirty pages or is undergoing writeback, its CoW reservations will
    1313             :  * be retained.
    1314             :  *
    1315             :  * The actual garbage collection piggybacks off the same code that runs
    1316             :  * the speculative EOF preallocation garbage collector.
    1317             :  */
    1318             : STATIC int
    1319    26191497 : xfs_inode_free_cowblocks(
    1320             :         struct xfs_inode        *ip,
    1321             :         struct xfs_icwalk       *icw,
    1322             :         unsigned int            *lockflags)
    1323             : {
    1324    26191497 :         bool                    wait;
    1325    26191497 :         int                     ret = 0;
    1326             : 
    1327    26191497 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1328             : 
    1329    52756080 :         if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
    1330             :                 return 0;
    1331             : 
    1332    23458410 :         if (!xfs_prep_free_cowblocks(ip))
    1333             :                 return 0;
    1334             : 
    1335     1611442 :         if (!xfs_icwalk_match(ip, icw))
    1336             :                 return 0;
    1337             : 
    1338             :         /*
    1339             :          * If the caller is waiting, return -EAGAIN to keep the background
    1340             :          * scanner moving and revisit the inode in a subsequent pass.
    1341             :          */
    1342     3196923 :         if (!(*lockflags & XFS_IOLOCK_EXCL) &&
    1343     1575510 :             !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1344     1114623 :                 if (wait)
    1345             :                         return -EAGAIN;
    1346       82419 :                 return 0;
    1347             :         }
    1348      506790 :         *lockflags |= XFS_IOLOCK_EXCL;
    1349             : 
    1350      506790 :         if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
    1351         100 :                 if (wait)
    1352             :                         return -EAGAIN;
    1353          63 :                 return 0;
    1354             :         }
    1355      505879 :         *lockflags |= XFS_MMAPLOCK_EXCL;
    1356             : 
    1357             :         /*
    1358             :          * Check again, nobody else should be able to dirty blocks or change
    1359             :          * the reflink iflag now that we have the first two locks held.
    1360             :          */
    1361      505879 :         if (xfs_prep_free_cowblocks(ip))
    1362      505868 :                 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
    1363             :         return ret;
    1364             : }
    1365             : 
    1366             : void
    1367     4136628 : xfs_inode_set_cowblocks_tag(
    1368             :         xfs_inode_t     *ip)
    1369             : {
    1370     4136628 :         trace_xfs_inode_set_cowblocks_tag(ip);
    1371     4132429 :         return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
    1372             : }
    1373             : 
    1374             : void
    1375     5677874 : xfs_inode_clear_cowblocks_tag(
    1376             :         xfs_inode_t     *ip)
    1377             : {
    1378     5677874 :         trace_xfs_inode_clear_cowblocks_tag(ip);
    1379     5675409 :         return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
    1380             : }
    1381             : 
    1382             : /* Disable post-EOF and CoW block auto-reclamation. */
    1383             : void
    1384      122114 : xfs_blockgc_stop(
    1385             :         struct xfs_mount        *mp)
    1386             : {
    1387      122114 :         struct xfs_perag        *pag;
    1388      122114 :         xfs_agnumber_t          agno;
    1389             : 
    1390      122114 :         if (!xfs_clear_blockgc_enabled(mp))
    1391          72 :                 return;
    1392             : 
    1393      838609 :         for_each_perag(mp, agno, pag)
    1394      716567 :                 cancel_delayed_work_sync(&pag->pag_blockgc_work);
    1395      122042 :         trace_xfs_blockgc_stop(mp, __return_address);
    1396             : }
    1397             : 
    1398             : /* Enable post-EOF and CoW block auto-reclamation. */
    1399             : void
    1400      122205 : xfs_blockgc_start(
    1401             :         struct xfs_mount        *mp)
    1402             : {
    1403      122205 :         struct xfs_perag        *pag;
    1404      122205 :         xfs_agnumber_t          agno;
    1405             : 
    1406      122205 :         if (xfs_set_blockgc_enabled(mp))
    1407             :                 return;
    1408             : 
    1409      122183 :         trace_xfs_blockgc_start(mp, __return_address);
    1410      197783 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1411       75600 :                 xfs_blockgc_queue(pag);
    1412             : }
    1413             : 
    1414             : /* Don't try to run block gc on an inode that's in any of these states. */
    1415             : #define XFS_BLOCKGC_NOGRAB_IFLAGS       (XFS_INEW | \
    1416             :                                          XFS_NEED_INACTIVE | \
    1417             :                                          XFS_INACTIVATING | \
    1418             :                                          XFS_IRECLAIMABLE | \
    1419             :                                          XFS_IRECLAIM)
    1420             : /*
    1421             :  * Decide if the given @ip is eligible for garbage collection of speculative
    1422             :  * preallocations, and grab it if so.  Returns true if it's ready to go or
    1423             :  * false if we should just ignore it.
    1424             :  */
    1425             : static bool
    1426    28018071 : xfs_blockgc_igrab(
    1427             :         struct xfs_inode        *ip)
    1428             : {
    1429    28018071 :         struct inode            *inode = VFS_I(ip);
    1430             : 
    1431    28018071 :         ASSERT(rcu_read_lock_held());
    1432             : 
    1433             :         /* Check for stale RCU freed inode */
    1434    28018071 :         spin_lock(&ip->i_flags_lock);
    1435    28468038 :         if (!ip->i_ino)
    1436           0 :                 goto out_unlock_noent;
    1437             : 
    1438    28468038 :         if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
    1439      296331 :                 goto out_unlock_noent;
    1440    28171707 :         spin_unlock(&ip->i_flags_lock);
    1441             : 
    1442             :         /* nothing to sync during shutdown */
    1443    56136218 :         if (xfs_is_shutdown(ip->i_mount))
    1444             :                 return false;
    1445             : 
     1446             :         /* If we can't grab the inode, it must be on its way to reclaim. */
    1447    28044380 :         if (!igrab(inode))
    1448        1947 :                 return false;
    1449             : 
    1450             :         /* inode is valid */
    1451             :         return true;
    1452             : 
    1453      296331 : out_unlock_noent:
    1454      296331 :         spin_unlock(&ip->i_flags_lock);
    1455      296331 :         return false;
    1456             : }
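
The grab protocol above is the usual two-step RCU discipline: validate the
object under its flags lock (the radix tree lookup is only RCU-safe, so the
entry may be stale or mid-recycle), then take a real reference with igrab(),
which itself fails once VFS eviction has begun. A generic sketch of the same
shape, with hypothetical types and flag names:

#include <linux/spinlock.h>
#include <linux/atomic.h>

#define FOO_NOGRAB_FLAGS	0x3	/* hypothetical "being torn down" states */

struct foo {
	spinlock_t	lock;
	unsigned long	flags;
	atomic_t	refcount;
};

/* generic sketch of the validate-then-reference pattern */
static bool foo_igrab(struct foo *obj)
{
	spin_lock(&obj->lock);		/* stabilize flags vs. teardown */
	if (obj->flags & FOO_NOGRAB_FLAGS) {
		spin_unlock(&obj->lock);
		return false;		/* mid-reclaim: pretend we never saw it */
	}
	spin_unlock(&obj->lock);

	/* teardown may still race in; refuse to resurrect a dying object */
	return atomic_inc_not_zero(&obj->refcount);
}
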
    1457             : 
    1458             : /* Scan one incore inode for block preallocations that we can remove. */
    1459             : static int
    1460    27773972 : xfs_blockgc_scan_inode(
    1461             :         struct xfs_inode        *ip,
    1462             :         struct xfs_icwalk       *icw)
    1463             : {
    1464    27773972 :         unsigned int            lockflags = 0;
    1465    27773972 :         int                     error;
    1466             : 
    1467    27773972 :         error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
    1468    27942353 :         if (error)
    1469     1671953 :                 goto unlock;
    1470             : 
    1471    26270400 :         error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
    1472    28039151 : unlock:
    1473    28039151 :         if (lockflags)
    1474     1427583 :                 xfs_iunlock(ip, lockflags);
    1475    28039101 :         xfs_irele(ip);
    1476    28006302 :         return error;
    1477             : }
    1478             : 
    1479             : /* Background worker that trims preallocated space. */
    1480             : void
    1481     2654568 : xfs_blockgc_worker(
    1482             :         struct work_struct      *work)
    1483             : {
    1484     2654568 :         struct xfs_perag        *pag = container_of(to_delayed_work(work),
    1485             :                                         struct xfs_perag, pag_blockgc_work);
    1486     2654568 :         struct xfs_mount        *mp = pag->pag_mount;
    1487     2654568 :         int                     error;
    1488             : 
    1489     2654568 :         trace_xfs_blockgc_worker(mp, __return_address);
    1490             : 
    1491     2654564 :         error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
    1492     2653592 :         if (error)
    1493           0 :                 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
    1494             :                                 pag->pag_agno, error);
    1495     2653592 :         xfs_blockgc_queue(pag);
    1496     2653278 : }
    1497             : 
    1498             : /*
    1499             :  * Try to free space in the filesystem by purging inactive inodes, eofblocks
    1500             :  * and cowblocks.
    1501             :  */
    1502             : int
    1503     1169123 : xfs_blockgc_free_space(
    1504             :         struct xfs_mount        *mp,
    1505             :         struct xfs_icwalk       *icw)
    1506             : {
    1507     1169123 :         int                     error;
    1508             : 
    1509     1169123 :         trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
    1510             : 
    1511     1168671 :         error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
    1512     1168057 :         if (error)
    1513             :                 return error;
    1514             : 
    1515     1168027 :         return xfs_inodegc_flush(mp);
    1516             : }
    1517             : 
    1518             : /*
    1519             :  * Reclaim all the free space that we can by scheduling the background blockgc
    1520             :  * and inodegc workers immediately and waiting for them all to clear.
    1521             :  */
    1522             : int
    1523    12207189 : xfs_blockgc_flush_all(
    1524             :         struct xfs_mount        *mp)
    1525             : {
    1526    12207189 :         struct xfs_perag        *pag;
    1527    12207189 :         xfs_agnumber_t          agno;
    1528             : 
    1529    12207189 :         trace_xfs_blockgc_flush_all(mp, __return_address);
    1530             : 
    1531             :         /*
    1532             :          * For each blockgc worker, move its queue time up to now.  If it
    1533             :          * wasn't queued, it will not be requeued.  Then flush whatever's
    1534             :          * left.
    1535             :          */
    1536    14190919 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1537     2271623 :                 mod_delayed_work(pag->pag_mount->m_blockgc_wq,
    1538             :                                 &pag->pag_blockgc_work, 0);
    1539             : 
    1540    14354913 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1541     2257244 :                 flush_delayed_work(&pag->pag_blockgc_work);
    1542             : 
    1543    12205171 :         return xfs_inodegc_flush(mp);
    1544             : }
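
Note the two-pass structure above: every tagged AG worker is expedited first,
and only then flushed, so the per-AG workers can run in parallel instead of
being started and waited on one at a time. The underlying workqueue idiom in
isolation (function name hypothetical):

#include <linux/workqueue.h>

/* sketch: expedite one delayed work item, then wait for it to finish */
static void kick_and_wait(struct workqueue_struct *wq,
			  struct delayed_work *dwork)
{
	mod_delayed_work(wq, dwork, 0);	/* pull any pending timer up to now */
	flush_delayed_work(dwork);	/* sleep until the handler completes */
}
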
    1545             : 
    1546             : /*
    1547             :  * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
    1548             :  * quota caused an allocation failure, so we make a best effort by including
    1549             :  * each quota under low free space conditions (less than 1% free space) in the
    1550             :  * scan.
    1551             :  *
    1552             :  * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
    1553             :  * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
    1554             :  * MMAPLOCK.
    1555             :  */
    1556             : int
    1557       32989 : xfs_blockgc_free_dquots(
    1558             :         struct xfs_mount        *mp,
    1559             :         struct xfs_dquot        *udqp,
    1560             :         struct xfs_dquot        *gdqp,
    1561             :         struct xfs_dquot        *pdqp,
    1562             :         unsigned int            iwalk_flags)
    1563             : {
    1564       32989 :         struct xfs_icwalk       icw = {0};
    1565       32989 :         bool                    do_work = false;
    1566             : 
    1567       32989 :         if (!udqp && !gdqp && !pdqp)
    1568             :                 return 0;
    1569             : 
    1570             :         /*
    1571             :          * Run a scan to free blocks using the union filter to cover all
    1572             :          * applicable quotas in a single scan.
    1573             :          */
    1574       32989 :         icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
    1575             : 
    1576       32989 :         if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
    1577       32013 :                 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
    1578       32010 :                 icw.icw_flags |= XFS_ICWALK_FLAG_UID;
    1579       32010 :                 do_work = true;
    1580             :         }
    1581             : 
     1582       32986 :         if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
    1583       32119 :                 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
    1584       32119 :                 icw.icw_flags |= XFS_ICWALK_FLAG_GID;
    1585       32119 :                 do_work = true;
    1586             :         }
    1587             : 
    1588       32986 :         if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
    1589       26854 :                 icw.icw_prid = pdqp->q_id;
    1590       26854 :                 icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
    1591       26854 :                 do_work = true;
    1592             :         }
    1593             : 
    1594       32986 :         if (!do_work)
    1595             :                 return 0;
    1596             : 
    1597       32571 :         return xfs_blockgc_free_space(mp, &icw);
    1598             : }
    1599             : 
    1600             : /* Run cow/eofblocks scans on the quotas attached to the inode. */
    1601             : int
    1602       22037 : xfs_blockgc_free_quota(
    1603             :         struct xfs_inode        *ip,
    1604             :         unsigned int            iwalk_flags)
    1605             : {
    1606       22037 :         return xfs_blockgc_free_dquots(ip->i_mount,
    1607             :                         xfs_inode_dquot(ip, XFS_DQTYPE_USER),
    1608             :                         xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
    1609             :                         xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
    1610             : }
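
Callers typically reach xfs_blockgc_free_quota() from an allocation path that
has just failed with -EDQUOT or -ENOSPC: cancel, scrub the speculative
preallocations, and retry exactly once. A condensed sketch of that retry
shape, patterned on xfs_trans_alloc_inode() (details elided; check xfs_trans.c
for the authoritative version):

/* sketch: one retry after freeing speculative preallocations */
static int
sketch_alloc_with_retry(
	struct xfs_inode	*ip,
	struct xfs_trans_res	*resv,
	unsigned int		dblocks)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	bool			retried = false;
	int			error;

retry:
	error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
	if (error)
		return error;
	xfs_ilock(ip, XFS_ILOCK_EXCL);

	error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, 0, false);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		xfs_trans_cancel(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_blockgc_free_quota(ip, 0);	/* may free enough to succeed */
		retried = true;
		goto retry;
	}
	/* ... proceed with the transaction or clean up on error ... */
	return error;
}
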
    1611             : 
    1612             : /* XFS Inode Cache Walking Code */
    1613             : 
    1614             : /*
    1615             :  * The inode lookup is done in batches to keep the amount of lock traffic and
     1616             :  * radix tree lookups to a minimum. The batch size is a trade-off between
    1617             :  * lookup reduction and stack usage. This is in the reclaim path, so we can't
    1618             :  * be too greedy.
    1619             :  */
    1620             : #define XFS_LOOKUP_BATCH        32
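
To put numbers on that trade-off: the batch is an on-stack array of
XFS_LOOKUP_BATCH inode pointers, so a 64-bit build spends 32 * 8 = 256 bytes
of stack per xfs_icwalk_ag() invocation. Doubling the batch would halve the
number of radix tree gang lookups for a full AG walk but grow that stack
footprint at the same rate, which matters because this code can be entered
from memory reclaim, where the stack is already deep.
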
    1621             : 
    1622             : 
    1623             : /*
    1624             :  * Decide if we want to grab this inode in anticipation of doing work towards
    1625             :  * the goal.
    1626             :  */
    1627             : static inline bool
    1628   484758407 : xfs_icwalk_igrab(
    1629             :         enum xfs_icwalk_goal    goal,
    1630             :         struct xfs_inode        *ip,
    1631             :         struct xfs_icwalk       *icw)
    1632             : {
    1633   484758407 :         switch (goal) {
    1634    28061045 :         case XFS_ICWALK_BLOCKGC:
    1635    28061045 :                 return xfs_blockgc_igrab(ip);
    1636   456697362 :         case XFS_ICWALK_RECLAIM:
    1637   456697362 :                 return xfs_reclaim_igrab(ip, icw);
    1638             :         default:
    1639             :                 return false;
    1640             :         }
    1641             : }
    1642             : 
    1643             : /*
    1644             :  * Process an inode.  Each processing function must handle any state changes
    1645             :  * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
    1646             :  */
    1647             : static inline int
    1648   484523841 : xfs_icwalk_process_inode(
    1649             :         enum xfs_icwalk_goal    goal,
    1650             :         struct xfs_inode        *ip,
    1651             :         struct xfs_perag        *pag,
    1652             :         struct xfs_icwalk       *icw)
    1653             : {
    1654   484523841 :         int                     error = 0;
    1655             : 
    1656   484523841 :         switch (goal) {
    1657    27837759 :         case XFS_ICWALK_BLOCKGC:
    1658    27837759 :                 error = xfs_blockgc_scan_inode(ip, icw);
    1659    27837759 :                 break;
    1660   456686082 :         case XFS_ICWALK_RECLAIM:
    1661   456686082 :                 xfs_reclaim_inode(ip, pag);
    1662   456686082 :                 break;
    1663             :         }
    1664   484644598 :         return error;
    1665             : }
    1666             : 
    1667             : /*
    1668             :  * For a given per-AG structure @pag and a goal, grab qualifying inodes and
    1669             :  * process them in some manner.
    1670             :  */
    1671             : static int
    1672     3932680 : xfs_icwalk_ag(
    1673             :         struct xfs_perag        *pag,
    1674             :         enum xfs_icwalk_goal    goal,
    1675             :         struct xfs_icwalk       *icw)
    1676             : {
    1677     3932680 :         struct xfs_mount        *mp = pag->pag_mount;
    1678     3932680 :         uint32_t                first_index;
    1679     3932680 :         int                     last_error = 0;
    1680     5464347 :         int                     skipped;
    1681     5464347 :         bool                    done;
    1682     5464347 :         int                     nr_found;
    1683             : 
    1684     5464347 : restart:
    1685     5464347 :         done = false;
    1686     5464347 :         skipped = 0;
    1687     5464347 :         if (goal == XFS_ICWALK_RECLAIM)
    1688      804981 :                 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
    1689             :         else
    1690             :                 first_index = 0;
    1691             :         nr_found = 0;
    1692    25210037 :         do {
    1693    25210037 :                 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
    1694    25210037 :                 int             error = 0;
    1695    25210037 :                 int             i;
    1696             : 
    1697    25210037 :                 rcu_read_lock();
    1698             : 
    1699    25157603 :                 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
    1700             :                                 (void **) batch, first_index,
    1701             :                                 XFS_LOOKUP_BATCH, goal);
    1702    25187941 :                 if (!nr_found) {
    1703     5312867 :                         done = true;
    1704     5312867 :                         rcu_read_unlock();
    1705     5463983 :                         break;
    1706             :                 }
    1707             : 
    1708             :                 /*
     1709             :                  * Grab the inodes before we drop the lock. If we found
    1710             :                  * nothing, nr == 0 and the loop will be skipped.
    1711             :                  */
    1712   504779604 :                 for (i = 0; i < nr_found; i++) {
    1713   484880627 :                         struct xfs_inode *ip = batch[i];
    1714             : 
    1715   484836433 :                         if (done || !xfs_icwalk_igrab(goal, ip, icw))
    1716      321453 :                                 batch[i] = NULL;
    1717             : 
    1718             :                         /*
    1719             :                          * Update the index for the next lookup. Catch
    1720             :                          * overflows into the next AG range which can occur if
    1721             :                          * we have inodes in the last block of the AG and we
    1722             :                          * are currently pointing to the last inode.
    1723             :                          *
    1724             :                          * Because we may see inodes that are from the wrong AG
    1725             :                          * due to RCU freeing and reallocation, only update the
     1726             :                          * index if it lies in this AG. It was a race that led
    1727             :                          * us to see this inode, so another lookup from the
    1728             :                          * same index will not find it again.
    1729             :                          */
    1730   484904530 :                         if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
    1731         440 :                                 continue;
    1732   484904090 :                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
    1733   484904090 :                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
    1734          32 :                                 done = true;
    1735             :                 }
    1736             : 
     1737             :                 /* unlock now that we've grabbed the inodes. */
    1738    19898977 :                 rcu_read_unlock();
    1739             : 
    1740   524800463 :                 for (i = 0; i < nr_found; i++) {
    1741   485023271 :                         if (!batch[i])
    1742      308793 :                                 continue;
    1743   484624031 :                         error = xfs_icwalk_process_inode(goal, batch[i], pag,
    1744             :                                         icw);
    1745   484695187 :                         if (error == -EAGAIN) {
    1746     2675062 :                                 skipped++;
    1747     2675062 :                                 continue;
    1748             :                         }
    1749   482020125 :                         if (error && last_error != -EFSCORRUPTED)
    1750           0 :                                 last_error = error;
    1751             :                 }
    1752             : 
    1753             :                 /* bail out if the filesystem is corrupted.  */
    1754    19878215 :                 if (error == -EFSCORRUPTED)
    1755             :                         break;
    1756             : 
    1757    19878215 :                 cond_resched();
    1758             : 
    1759    19881112 :                 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
    1760      419399 :                         icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
    1761      419399 :                         if (icw->icw_scan_limit <= 0)
    1762             :                                 break;
    1763             :                 }
    1764    19745722 :         } while (nr_found && !done);
    1765             : 
    1766     5464015 :         if (goal == XFS_ICWALK_RECLAIM) {
    1767      804981 :                 if (done)
    1768      669591 :                         first_index = 0;
    1769      804981 :                 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
    1770             :         }
    1771             : 
    1772     5464015 :         if (skipped) {
    1773     1532633 :                 delay(1);
    1774     1531667 :                 goto restart;
    1775             :         }
    1776     3931382 :         return last_error;
    1777             : }
    1778             : 
    1779             : /* Walk all incore inodes to achieve a given goal. */
    1780             : static int
    1781     1783860 : xfs_icwalk(
    1782             :         struct xfs_mount        *mp,
    1783             :         enum xfs_icwalk_goal    goal,
    1784             :         struct xfs_icwalk       *icw)
    1785             : {
    1786     1783860 :         struct xfs_perag        *pag;
    1787     1783860 :         int                     error = 0;
    1788     1783860 :         int                     last_error = 0;
    1789     1783860 :         xfs_agnumber_t          agno;
    1790             : 
    1791     3061977 :         for_each_perag_tag(mp, agno, pag, goal) {
    1792     1278560 :                 error = xfs_icwalk_ag(pag, goal, icw);
    1793     1278117 :                 if (error) {
    1794           0 :                         last_error = error;
    1795           0 :                         if (error == -EFSCORRUPTED) {
    1796           0 :                                 xfs_perag_rele(pag);
    1797           0 :                                 break;
    1798             :                         }
    1799             :                 }
    1800             :         }
    1801     1783688 :         return last_error;
     1802             :         BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); /* compile-time check only */
    1803             : }
    1804             : 
    1805             : #ifdef DEBUG
    1806             : static void
    1807           0 : xfs_check_delalloc(
    1808             :         struct xfs_inode        *ip,
    1809             :         int                     whichfork)
    1810             : {
    1811           0 :         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
    1812           0 :         struct xfs_bmbt_irec    got;
    1813           0 :         struct xfs_iext_cursor  icur;
    1814             : 
    1815           0 :         if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
    1816           0 :                 return;
    1817           0 :         do {
    1818           0 :                 if (isnullstartblock(got.br_startblock)) {
    1819           0 :                         xfs_warn(ip->i_mount,
    1820             :         "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
    1821             :                                 ip->i_ino,
    1822             :                                 whichfork == XFS_DATA_FORK ? "data" : "cow",
    1823             :                                 got.br_startoff, got.br_blockcount);
    1824             :                 }
    1825           0 :         } while (xfs_iext_next_extent(ifp, &icur, &got));
    1826             : }
    1827             : #else
    1828             : #define xfs_check_delalloc(ip, whichfork)       do { } while (0)
    1829             : #endif
    1830             : 
    1831             : /* Schedule the inode for reclaim. */
    1832             : static void
    1833  1086144762 : xfs_inodegc_set_reclaimable(
    1834             :         struct xfs_inode        *ip)
    1835             : {
    1836  1086144762 :         struct xfs_mount        *mp = ip->i_mount;
    1837  1086144762 :         struct xfs_perag        *pag;
    1838             : 
    1839  2172289524 :         if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
    1840           0 :                 xfs_check_delalloc(ip, XFS_DATA_FORK);
    1841           0 :                 xfs_check_delalloc(ip, XFS_COW_FORK);
    1842           0 :                 ASSERT(0);
    1843             :         }
    1844             : 
    1845  1086144762 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1846  1087086795 :         spin_lock(&pag->pag_ici_lock);
    1847  1087395271 :         spin_lock(&ip->i_flags_lock);
    1848             : 
    1849  1087476632 :         trace_xfs_inode_set_reclaimable(ip);
    1850  1086768609 :         ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
    1851  1086768609 :         ip->i_flags |= XFS_IRECLAIMABLE;
    1852  1086768609 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1853             :                         XFS_ICI_RECLAIM_TAG);
    1854             : 
    1855  1086972967 :         spin_unlock(&ip->i_flags_lock);
    1856  1087409367 :         spin_unlock(&pag->pag_ici_lock);
    1857  1087358703 :         xfs_perag_put(pag);
    1858  1087181650 : }
    1859             : 
    1860             : /*
    1861             :  * Free all speculative preallocations and possibly even the inode itself.
    1862             :  * This is the last chance to make changes to an otherwise unreferenced file
    1863             :  * before incore reclamation happens.
    1864             :  */
    1865             : static int
    1866    58052332 : xfs_inodegc_inactivate(
    1867             :         struct xfs_inode        *ip)
    1868             : {
    1869    58052332 :         int                     error;
    1870             : 
    1871    58052332 :         trace_xfs_inode_inactivating(ip);
    1872    57986876 :         error = xfs_inactive(ip);
    1873    58077096 :         xfs_inodegc_set_reclaimable(ip);
    1874    58100246 :         return error;
    1875             : 
    1876             : }
    1877             : 
    1878             : void
    1879     5435694 : xfs_inodegc_worker(
    1880             :         struct work_struct      *work)
    1881             : {
    1882     5435694 :         struct xfs_inodegc      *gc = container_of(to_delayed_work(work),
    1883             :                                                 struct xfs_inodegc, work);
    1884     5435694 :         struct llist_node       *node = llist_del_all(&gc->list);
    1885     5444655 :         struct xfs_inode        *ip, *n;
    1886     5444655 :         unsigned int            nofs_flag;
    1887             : 
    1888     5444655 :         ASSERT(gc->cpu == smp_processor_id());
    1889             : 
    1890     5442631 :         WRITE_ONCE(gc->items, 0);
    1891             : 
    1892     5442631 :         if (!node)
    1893             :                 return;
    1894             : 
    1895             :         /*
    1896             :          * We can allocate memory here while doing writeback on behalf of
     1897             :          * memory reclaim.  To avoid memory allocation deadlocks, set the
    1898             :          * task-wide nofs context for the following operations.
    1899             :          */
    1900     5440719 :         nofs_flag = memalloc_nofs_save();
    1901             : 
    1902     5440719 :         ip = llist_entry(node, struct xfs_inode, i_gclist);
    1903     5440719 :         trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
    1904             : 
    1905     5438479 :         WRITE_ONCE(gc->shrinker_hits, 0);
    1906    63532842 :         llist_for_each_entry_safe(ip, n, node, i_gclist) {
    1907    58082863 :                 int     error;
    1908             : 
    1909    58082863 :                 xfs_iflags_set(ip, XFS_INACTIVATING);
    1910    58050169 :                 error = xfs_inodegc_inactivate(ip);
    1911    58094363 :                 if (error && !gc->error)
    1912        2467 :                         gc->error = error;
    1913             :         }
    1914             : 
    1915     5449979 :         memalloc_nofs_restore(nofs_flag);
    1916             : }
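
The queue drained here is a lock-free llist: producers push inodes with
llist_add() from hot paths, and the worker detaches the entire list atomically
with llist_del_all() before walking it privately, with no lock held on either
side. The idiom in isolation (queue and item names hypothetical):

#include <linux/llist.h>

struct gc_item {
	struct llist_node	gclist;
	/* payload ... */
};

static LLIST_HEAD(gc_pending);		/* hypothetical global queue */

/* producer side: lock-free, usable from hot paths */
static void gc_queue(struct gc_item *item)
{
	llist_add(&item->gclist, &gc_pending);
}

/* consumer side: steal everything at once, then walk it privately */
static void gc_drain(void)
{
	struct llist_node	*node = llist_del_all(&gc_pending);
	struct gc_item		*item, *n;

	llist_for_each_entry_safe(item, n, node, gclist) {
		/* process item; the _safe form allows freeing it here */
	}
}
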
    1917             : 
    1918             : /*
    1919             :  * Expedite all pending inodegc work to run immediately. This does not wait for
    1920             :  * completion of the work.
    1921             :  */
    1922             : void
    1923    36288197 : xfs_inodegc_push(
    1924             :         struct xfs_mount        *mp)
    1925             : {
    1926    72576394 :         if (!xfs_is_inodegc_enabled(mp))
    1927             :                 return;
    1928    36052566 :         trace_xfs_inodegc_push(mp, __return_address);
    1929    36040884 :         xfs_inodegc_queue_all(mp);
    1930             : }
    1931             : 
    1932             : /*
    1933             :  * Force all currently queued inode inactivation work to run immediately and
    1934             :  * wait for the work to finish.
    1935             :  */
    1936             : int
    1937    14314238 : xfs_inodegc_flush(
    1938             :         struct xfs_mount        *mp)
    1939             : {
    1940    14314238 :         xfs_inodegc_push(mp);
    1941    14427754 :         trace_xfs_inodegc_flush(mp, __return_address);
    1942    14396712 :         return xfs_inodegc_wait_all(mp);
    1943             : }
    1944             : 
    1945             : /*
    1946             :  * Flush all the pending work and then disable the inode inactivation background
    1947             :  * workers and wait for them to stop.  Caller must hold sb->s_umount to
    1948             :  * coordinate changes in the inodegc_enabled state.
    1949             :  */
    1950             : void
    1951      122242 : xfs_inodegc_stop(
    1952             :         struct xfs_mount        *mp)
    1953             : {
    1954      122242 :         bool                    rerun;
    1955             : 
    1956      122242 :         if (!xfs_clear_inodegc_enabled(mp))
    1957             :                 return;
    1958             : 
    1959             :         /*
    1960             :          * Drain all pending inodegc work, including inodes that could be
    1961             :          * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
    1962             :          * threads that sample the inodegc state just prior to us clearing it.
    1963             :          * The inodegc flag state prevents new threads from queuing more
    1964             :          * inodes, so we queue pending work items and flush the workqueue until
    1965             :          * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
    1966             :          * here because it does not allow other unserialized mechanisms to
    1967             :          * reschedule inodegc work while this draining is in progress.
    1968             :          */
    1969      122170 :         xfs_inodegc_queue_all(mp);
    1970      122170 :         do {
    1971      122170 :                 flush_workqueue(mp->m_inodegc_wq);
    1972      122170 :                 rerun = xfs_inodegc_queue_all(mp);
    1973      122170 :         } while (rerun);
    1974             : 
    1975      122170 :         trace_xfs_inodegc_stop(mp, __return_address);
    1976             : }
    1977             : 
    1978             : /*
    1979             :  * Enable the inode inactivation background workers and schedule deferred inode
    1980             :  * inactivation work if there is any.  Caller must hold sb->s_umount to
    1981             :  * coordinate changes in the inodegc_enabled state.
    1982             :  */
    1983             : void
    1984      122205 : xfs_inodegc_start(
    1985             :         struct xfs_mount        *mp)
    1986             : {
    1987      122205 :         if (xfs_set_inodegc_enabled(mp))
    1988             :                 return;
    1989             : 
    1990      122183 :         trace_xfs_inodegc_start(mp, __return_address);
    1991      122183 :         xfs_inodegc_queue_all(mp);
    1992             : }
    1993             : 
    1994             : #ifdef CONFIG_XFS_RT
    1995             : static inline bool
    1996    42332368 : xfs_inodegc_want_queue_rt_file(
    1997             :         struct xfs_inode        *ip)
    1998             : {
    1999    42332368 :         struct xfs_mount        *mp = ip->i_mount;
    2000             : 
    2001    42332368 :         if (!XFS_IS_REALTIME_INODE(ip))
    2002             :                 return false;
    2003             : 
    2004     6794669 :         if (__percpu_counter_compare(&mp->m_frextents,
    2005     6794731 :                                 mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
    2006             :                                 XFS_FDBLOCKS_BATCH) < 0)
    2007      196412 :                 return true;
    2008             : 
    2009             :         return false;
    2010             : }
    2011             : #else
    2012             : # define xfs_inodegc_want_queue_rt_file(ip)     (false)
    2013             : #endif /* CONFIG_XFS_RT */
    2014             : 
    2015             : /*
    2016             :  * Schedule the inactivation worker when:
    2017             :  *
    2018             :  *  - We've accumulated more than one inode cluster buffer's worth of inodes.
    2019             :  *  - There is less than 5% free space left.
    2020             :  *  - Any of the quotas for this inode are near an enforcement limit.
    2021             :  */
    2022             : static inline bool
    2023    58073506 : xfs_inodegc_want_queue_work(
    2024             :         struct xfs_inode        *ip,
    2025             :         unsigned int            items)
    2026             : {
    2027    58073506 :         struct xfs_mount        *mp = ip->i_mount;
    2028             : 
    2029    58073506 :         if (items > mp->m_ino_geo.inodes_per_cluster)
    2030             :                 return true;
    2031             : 
    2032    42667012 :         if (__percpu_counter_compare(&mp->m_fdblocks,
    2033    42675311 :                                 mp->m_low_space[XFS_LOWSP_5_PCNT],
    2034             :                                 XFS_FDBLOCKS_BATCH) < 0)
    2035             :                 return true;
    2036             : 
    2037    42333079 :         if (xfs_inodegc_want_queue_rt_file(ip))
    2038             :                 return true;
    2039             : 
    2040    42130051 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
    2041             :                 return true;
    2042             : 
    2043    42135282 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
    2044             :                 return true;
    2045             : 
    2046    42135459 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
    2047         114 :                 return true;
    2048             : 
    2049             :         return false;
    2050             : }
    2051             : 
    2052             : /*
    2053             :  * Upper bound on the number of inodes in each AG that can be queued for
    2054             :  * Upper bound on the number of inodes in each per-CPU queue that can be
    2055             :  * awaiting inactivation at any time, to avoid monopolizing the workqueue.
    2056             : #define XFS_INODEGC_MAX_BACKLOG         (4 * XFS_INODES_PER_CHUNK)
    2057             : 
    2058             : /*
    2059             :  * Make the frontend wait for inactivations when:
    2060             :  *
    2061             :  *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
    2062             :  *  - The queue depth exceeds the maximum allowable percpu backlog.
    2063             :  *
    2064             :  * Note: If the current thread is running a transaction, we don't ever want to
    2065             :  * wait for other transactions because that could introduce a deadlock.
    2066             :  */
    2067             : static inline bool
    2068             : xfs_inodegc_want_flush_work(
    2069             :         struct xfs_inode        *ip,
    2070             :         unsigned int            items,
    2071             :         unsigned int            shrinker_hits)
    2072             : {
    2073    58070013 :         if (current->journal_info)
    2074             :                 return false;
    2075             : 
    2076    58008238 :         if (shrinker_hits > 0)
    2077             :                 return true;
    2078             : 
    2079    58008238 :         if (items > XFS_INODEGC_MAX_BACKLOG)
    2080             :                 return true;
    2081             : 
    2082             :         return false;
    2083             : }
    2084             : 
    2085             : /*
    2086             :  * Queue a background inactivation worker if there are inodes that need to be
    2087             :  * inactivated and higher level xfs code hasn't disabled the background
    2088             :  * workers.
    2089             :  */
    2090             : static void
    2091    58049066 : xfs_inodegc_queue(
    2092             :         struct xfs_inode        *ip)
    2093             : {
    2094    58049066 :         struct xfs_mount        *mp = ip->i_mount;
    2095    58049066 :         struct xfs_inodegc      *gc;
    2096    58049066 :         int                     items;
    2097    58049066 :         unsigned int            shrinker_hits;
    2098    58049066 :         unsigned long           queue_delay = 1;
    2099             : 
    2100    58049066 :         trace_xfs_inode_set_need_inactive(ip);
    2101    58029056 :         spin_lock(&ip->i_flags_lock);
    2102    58111945 :         ip->i_flags |= XFS_NEED_INACTIVE;
    2103    58111945 :         spin_unlock(&ip->i_flags_lock);
    2104             : 
    2105    58119529 :         gc = get_cpu_ptr(mp->m_inodegc);
    2106    58103406 :         llist_add(&ip->i_gclist, &gc->list);
    2107    58090904 :         items = READ_ONCE(gc->items);
    2108    58090904 :         WRITE_ONCE(gc->items, items + 1);
    2109    58090904 :         shrinker_hits = READ_ONCE(gc->shrinker_hits);
    2110             : 
    2111             :         /*
    2112             :          * We queue the work while pinned to the current CPU (preemption is
    2113             :          * disabled) so that the work is scheduled to run on this CPU.
    2114             :          */
    2115   116181808 :         if (!xfs_is_inodegc_enabled(mp)) {
    2116           0 :                 put_cpu_ptr(gc);
    2117           0 :                 return;
    2118             :         }
    2119             : 
    2120    58090904 :         if (xfs_inodegc_want_queue_work(ip, items))
    2121    15931018 :                 queue_delay = 0;
    2122             : 
    2123    58064694 :         trace_xfs_inodegc_queue(mp, __return_address);
    2124    58056167 :         mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2125             :                         queue_delay);
    2126    58033399 :         put_cpu_ptr(gc);
    2127             : 
    2128    58070013 :         if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
    2129     5638780 :                 trace_xfs_inodegc_throttle(mp, __return_address);
    2130     5631216 :                 flush_delayed_work(&gc->work);
    2131             :         }
    2132             : }
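
The fast path above is lock-free: llist_add() pushes the inode onto a per-CPU singly linked list with a single compare-and-swap, and mod_delayed_work_on() (re)arms the worker on the same CPU so the list is drained with cache-local accesses. Below is a minimal userspace sketch of that push using C11 atomics; gc_node, gc_list, and gc_push are invented stand-ins for the kernel's llist_node, llist_head, and llist_add, not real API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct gc_node {                                /* models struct llist_node */
        struct gc_node *next;
};

struct gc_list {                                /* models struct llist_head */
        _Atomic(struct gc_node *) first;
};

/*
 * Push one node onto the front of the list.  Returns true if the list
 * was empty beforehand, mirroring llist_add()'s return value.
 */
static bool
gc_push(struct gc_node *node, struct gc_list *list)
{
        struct gc_node *old = atomic_load(&list->first);

        do {
                node->next = old;       /* re-linked on every CAS failure */
        } while (!atomic_compare_exchange_weak(&list->first, &old, node));

        return old == NULL;
}

int
main(void)
{
        struct gc_list list = { .first = NULL };
        struct gc_node a, b;

        printf("list was empty: %d\n", gc_push(&a, &list));     /* 1 */
        printf("list was empty: %d\n", gc_push(&b, &list));     /* 0 */
        return 0;
}
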
    2133             : 
    2134             : /*
    2135             :  * Fold the dead CPU's inodegc queue into the current CPU's queue.
    2136             :  */
    2137             : void
    2138         140 : xfs_inodegc_cpu_dead(
    2139             :         struct xfs_mount        *mp,
    2140             :         unsigned int            dead_cpu)
    2141             : {
    2142         140 :         struct xfs_inodegc      *dead_gc, *gc;
    2143         140 :         struct llist_node       *first, *last;
    2144         140 :         unsigned int            count = 0;
    2145             : 
    2146         140 :         dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
    2147         140 :         cancel_delayed_work_sync(&dead_gc->work);
    2148             : 
    2149         140 :         if (llist_empty(&dead_gc->list))
    2150             :                 return;
    2151             : 
    2152           0 :         first = dead_gc->list.first;
    2153           0 :         last = first;
    2154           0 :         while (last->next) {
    2155           0 :                 last = last->next;
    2156           0 :                 count++;
    2157             :         }
    2158           0 :         dead_gc->list.first = NULL;
    2159           0 :         dead_gc->items = 0;
    2160             : 
    2161             :         /* Add pending work to current CPU */
    2162           0 :         gc = get_cpu_ptr(mp->m_inodegc);
    2163           0 :         llist_add_batch(first, last, &gc->list);
    2164           0 :         count += READ_ONCE(gc->items);
    2165           0 :         WRITE_ONCE(gc->items, count);
    2166             : 
    2167           0 :         if (xfs_is_inodegc_enabled(mp)) {
    2168           0 :                 trace_xfs_inodegc_queue(mp, __return_address);
    2169           0 :                 mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2170             :                                 0);
    2171             :         }
    2172           0 :         put_cpu_ptr(gc);
    2173             : }
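
Folding the dead CPU's queue is the batch form of the same push: walk the detached chain once to find its tail and size, then splice the whole [first..last] segment onto the destination with one compare-and-swap. Continuing the userspace sketch above (same invented gc_node/gc_list types; gc_push_batch models llist_add_batch):

/*
 * Count the nodes in a detached chain, much like the counting loop in
 * xfs_inodegc_cpu_dead() above.
 */
static unsigned int
gc_chain_len(struct gc_node *first)
{
        unsigned int count = 0;

        for (struct gc_node *p = first; p; p = p->next)
                count++;
        return count;
}

/*
 * Splice an entire chain [first..last] onto the front of the list with
 * a single compare-and-swap, modeling llist_add_batch().  Returns true
 * if the list was empty beforehand.
 */
static bool
gc_push_batch(struct gc_node *first, struct gc_node *last,
                struct gc_list *list)
{
        struct gc_node *old = atomic_load(&list->first);

        do {
                last->next = old;       /* re-linked on every CAS failure */
        } while (!atomic_compare_exchange_weak(&list->first, &old, first));

        return old == NULL;
}
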
    2174             : 
    2175             : /*
    2176             :  * We set the inode flag atomically with the radix tree tag.  Once we get tag
    2177             :  * lookups on the radix tree, this inode flag can go away.
    2178             :  *
    2179             :  * We always use background reclaim here because even if the inode is clean, it
    2180             :  * still may be under IO and hence we have to wait for IO completion to occur
    2181             :  * before we can reclaim the inode. The background reclaim path handles this
    2182             :  * more efficiently than we can here, so simply let background reclaim tear down
    2183             :  * all inodes.
    2184             :  */
    2185             : void
    2186  1087148287 : xfs_inode_mark_reclaimable(
    2187             :         struct xfs_inode        *ip)
    2188             : {
    2189  1087148287 :         struct xfs_mount        *mp = ip->i_mount;
    2190  1087148287 :         bool                    need_inactive;
    2191             : 
    2192  1087148287 :         XFS_STATS_INC(mp, vn_reclaim);
    2193             : 
    2194             :         /*
    2195             :          * We should never get here with any of the reclaim flags already set.
    2196             :          */
    2197  2173649712 :         ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
    2198             : 
    2199  1087427296 :         need_inactive = xfs_inode_needs_inactive(ip);
    2200  1086235480 :         if (need_inactive) {
    2201    58032369 :                 xfs_inodegc_queue(ip);
    2202    58032369 :                 return;
    2203             :         }
    2204             : 
    2205             :         /* Going straight to reclaim, so drop the dquots. */
    2206  1028203111 :         xfs_qm_dqdetach(ip);
    2207  1028202646 :         xfs_inodegc_set_reclaimable(ip);
    2208             : }
    2209             : 
    2210             : /*
    2211             :  * Register a phony shrinker so that we can run background inodegc sooner when
    2212             :  * there's memory pressure.  Inactivation does not itself free any memory but
    2213             :  * it does make inodes reclaimable, which eventually frees memory.
    2214             :  *
    2215             :  * The count function, seek value, and batch value are crafted to trigger the
    2216             :  * scan function during the second round of scanning.  Hopefully this means
    2217             :  * that we reclaimed enough memory that initiating metadata transactions won't
    2218             :  * make things worse.
    2219             :  */
    2220             : #define XFS_INODEGC_SHRINKER_COUNT      (1UL << DEF_PRIORITY)
    2221             : #define XFS_INODEGC_SHRINKER_BATCH      ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
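
Assuming the usual kernel value DEF_PRIORITY == 12, the arithmetic behind that "second round" claim works out as below; the description of the shrinker core's batching in the comment is a sketch from memory, not authoritative:

#include <stdio.h>

#define DEF_PRIORITY    12      /* assumed; matches common kernel configs */

int
main(void)
{
        unsigned long count = 1UL << DEF_PRIORITY;      /* 4096 */
        unsigned long batch = count / 2 + 1;            /* 2049 */

        /*
         * With seeks == 0 the shrinker core proposes roughly count/2
         * (2048) objects per invocation.  2048 < batch (2049), so the
         * first call defers its work; the second call's 2048 plus the
         * deferred 2048 crosses the batch threshold and scan_objects()
         * finally runs -- i.e. during the second round of scanning.
         */
        printf("count=%lu batch=%lu\n", count, batch);
        return 0;
}
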
    2222             : 
    2223             : static unsigned long
    2224        7034 : xfs_inodegc_shrinker_count(
    2225             :         struct shrinker         *shrink,
    2226             :         struct shrink_control   *sc)
    2227             : {
    2228        7034 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2229             :                                                    m_inodegc_shrinker);
    2230        7034 :         struct xfs_inodegc      *gc;
    2231        7034 :         int                     cpu;
    2232             : 
    2233       14068 :         if (!xfs_is_inodegc_enabled(mp))
    2234             :                 return 0;
    2235             : 
    2236       35014 :         for_each_online_cpu(cpu) {
    2237       28038 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2238       28038 :                 if (!llist_empty(&gc->list))
    2239             :                         return XFS_INODEGC_SHRINKER_COUNT;
    2240             :         }
    2241             : 
    2242             :         return 0;
    2243             : }
    2244             : 
    2245             : static unsigned long
    2246          22 : xfs_inodegc_shrinker_scan(
    2247             :         struct shrinker         *shrink,
    2248             :         struct shrink_control   *sc)
    2249             : {
    2250          22 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2251             :                                                    m_inodegc_shrinker);
    2252          22 :         struct xfs_inodegc      *gc;
    2253          22 :         int                     cpu;
    2254          22 :         bool                    no_items = true;
    2255             : 
    2256          44 :         if (!xfs_is_inodegc_enabled(mp))
    2257             :                 return SHRINK_STOP;
    2258             : 
    2259          22 :         trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
    2260             : 
    2261         132 :         for_each_online_cpu(cpu) {
    2262          88 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2263          88 :                 if (!llist_empty(&gc->list)) {
    2264          29 :                         unsigned int    h = READ_ONCE(gc->shrinker_hits);
    2265             : 
    2266          29 :                         WRITE_ONCE(gc->shrinker_hits, h + 1);
    2267          29 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
    2268          29 :                         no_items = false;
    2269             :                 }
    2270             :         }
    2271             : 
    2272             :         /*
    2273             :          * If there are no inodes to inactivate, we don't want the shrinker
    2274             :          * to think there's deferred work to call us back about.
    2275             :          */
    2276          22 :         if (no_items)
    2277           0 :                 return LONG_MAX;
    2278             : 
    2279             :         return SHRINK_STOP;
    2280             : }
    2281             : 
    2282             : /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
    2283             : int
    2284       59289 : xfs_inodegc_register_shrinker(
    2285             :         struct xfs_mount        *mp)
    2286             : {
    2287       59289 :         struct shrinker         *shrink = &mp->m_inodegc_shrinker;
    2288             : 
    2289       59289 :         shrink->count_objects = xfs_inodegc_shrinker_count;
    2290       59289 :         shrink->scan_objects = xfs_inodegc_shrinker_scan;
    2291       59289 :         shrink->seeks = 0;
    2292       59289 :         shrink->flags = SHRINKER_NONSLAB;
    2293       59289 :         shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
    2294             : 
    2295       59289 :         return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
    2296             : }
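
This registration must be paired with teardown at unmount; in kernels of this vintage that pairing amounts to something like the fragment below (a sketch of the pairing in kernel context, not the exact XFS teardown path):

        /* unmount-side pairing for the registration above (sketch) */
        unregister_shrinker(&mp->m_inodegc_shrinker);
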

Generated by: LCOV version 1.14