LCOV - code coverage report
Current view: top level - fs/xfs - xfs_icache.c (source / functions)
Test: fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023
Date: 2023-07-31 20:08:12

                     Hit     Total   Coverage
Lines:               824       899     91.7 %
Functions:            63        64     98.4 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_trans_priv.h"
      16             : #include "xfs_inode_item.h"
      17             : #include "xfs_quota.h"
      18             : #include "xfs_trace.h"
      19             : #include "xfs_icache.h"
      20             : #include "xfs_bmap_util.h"
      21             : #include "xfs_dquot_item.h"
      22             : #include "xfs_dquot.h"
      23             : #include "xfs_reflink.h"
      24             : #include "xfs_ialloc.h"
      25             : #include "xfs_ag.h"
      26             : #include "xfs_log_priv.h"
      27             : #include "xfs_health.h"
      28             : 
      29             : #include <linux/iversion.h>
      30             : 
      31             : /* Radix tree tags for incore inode tree. */
      32             : 
      33             : /* inode is to be reclaimed */
      34             : #define XFS_ICI_RECLAIM_TAG     0
      35             : /* Inode has speculative preallocations (posteof or cow) to clean. */
      36             : #define XFS_ICI_BLOCKGC_TAG     1
      37             : 
      38             : /*
      39             :  * The goal for walking incore inodes.  These can correspond with incore inode
      40             :  * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
      41             :  */
      42             : enum xfs_icwalk_goal {
      43             :         /* Goals directly associated with tagged inodes. */
      44             :         XFS_ICWALK_BLOCKGC      = XFS_ICI_BLOCKGC_TAG,
      45             :         XFS_ICWALK_RECLAIM      = XFS_ICI_RECLAIM_TAG,
      46             : };
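
A note on the enum above: because each walk goal is defined to equal its
radix tree tag, code that has a goal in hand can use it directly as a tag.
A minimal sketch of that correspondence, with a hypothetical helper name:

        /* Illustrative only: goals alias the tag values by construction. */
        static inline unsigned int
        xfs_icwalk_goal_to_tag(enum xfs_icwalk_goal goal)
        {
                return (unsigned int)goal;
        }
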
      47             : 
      48             : static int xfs_icwalk(struct xfs_mount *mp,
      49             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      50             : static int xfs_icwalk_ag(struct xfs_perag *pag,
      51             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      52             : 
      53             : /*
      54             :  * Private inode cache walk flags for struct xfs_icwalk.  Must not
      55             :  * coincide with XFS_ICWALK_FLAGS_VALID.
      56             :  */
      57             : 
      58             : /* Stop scanning after icw_scan_limit inodes. */
      59             : #define XFS_ICWALK_FLAG_SCAN_LIMIT      (1U << 28)
      60             : 
      61             : #define XFS_ICWALK_FLAG_RECLAIM_SICK    (1U << 27)
      62             : #define XFS_ICWALK_FLAG_UNION           (1U << 26) /* union filter algorithm */
      63             : 
      64             : #define XFS_ICWALK_PRIVATE_FLAGS        (XFS_ICWALK_FLAG_SCAN_LIMIT | \
      65             :                                          XFS_ICWALK_FLAG_RECLAIM_SICK | \
      66             :                                          XFS_ICWALK_FLAG_UNION)
      67             : 
      68             : /*
      69             :  * Allocate and initialise an xfs_inode.
      70             :  */
      71             : struct xfs_inode *
      72   479643093 : xfs_inode_alloc(
      73             :         struct xfs_mount        *mp,
      74             :         xfs_ino_t               ino)
      75             : {
      76   479643093 :         struct xfs_inode        *ip;
      77             : 
      78             :         /*
      79             :          * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
      80             :          * and return NULL here on ENOMEM.
      81             :          */
      82   479643093 :         ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
      83             : 
      84   480102414 :         if (inode_init_always(mp->m_super, VFS_I(ip))) {
      85           0 :                 kmem_cache_free(xfs_inode_cache, ip);
      86           0 :                 return NULL;
      87             :         }
      88             : 
      89             :         /* VFS doesn't initialise i_mode or i_state! */
      90   480027848 :         VFS_I(ip)->i_mode = 0;
      91   480027848 :         VFS_I(ip)->i_state = 0;
      92   480027848 :         mapping_set_large_folios(VFS_I(ip)->i_mapping);
      93             : 
      94   480051860 :         XFS_STATS_INC(mp, vn_active);
      95   479698640 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
      96   479698640 :         ASSERT(ip->i_ino == 0);
      97             : 
      98             :         /* initialise the xfs inode */
      99   479698640 :         ip->i_ino = ino;
     100   479698640 :         ip->i_mount = mp;
     101   479698640 :         memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
     102   479698640 :         ip->i_cowfp = NULL;
     103   479698640 :         memset(&ip->i_af, 0, sizeof(ip->i_af));
     104   479698640 :         ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
     105   479698640 :         memset(&ip->i_df, 0, sizeof(ip->i_df));
     106   479698640 :         ip->i_flags = 0;
     107   479698640 :         ip->i_delayed_blks = 0;
     108   479698640 :         ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
     109   479698640 :         ip->i_nblocks = 0;
     110   479698640 :         ip->i_forkoff = 0;
     111   479698640 :         ip->i_sick = 0;
     112   479698640 :         ip->i_checked = 0;
     113   479698640 :         INIT_WORK(&ip->i_ioend_work, xfs_end_io);
     114   479698640 :         INIT_LIST_HEAD(&ip->i_ioend_list);
     115   479698640 :         spin_lock_init(&ip->i_ioend_lock);
     116   480076304 :         ip->i_next_unlinked = NULLAGINO;
     117   480076304 :         ip->i_prev_unlinked = 0;
     118             : 
     119   480076304 :         return ip;
     120             : }
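
The XXX comment in xfs_inode_alloc() points at a possible future change: if
this allocation never happened inside a transaction, __GFP_NOFAIL could be
dropped and ENOMEM handled by the caller. A hedged sketch of that
hypothetical variant (not the current code):

        /* Hypothetical: only valid if never called in transaction context. */
        ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL);
        if (!ip)
                return NULL;    /* caller maps this to -ENOMEM */
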
     121             : 
     122             : STATIC void
     123   478635109 : xfs_inode_free_callback(
     124             :         struct rcu_head         *head)
     125             : {
     126   478635109 :         struct inode            *inode = container_of(head, struct inode, i_rcu);
     127   478635109 :         struct xfs_inode        *ip = XFS_I(inode);
     128             : 
     129   478635109 :         switch (VFS_I(ip)->i_mode & S_IFMT) {
     130   308047348 :         case S_IFREG:
     131             :         case S_IFDIR:
     132             :         case S_IFLNK:
     133   308047348 :                 xfs_idestroy_fork(&ip->i_df);
     134   308047348 :                 break;
     135             :         }
     136             : 
     137   478675614 :         xfs_ifork_zap_attr(ip);
     138             : 
     139   479120517 :         if (ip->i_cowfp) {
     140   103574722 :                 xfs_idestroy_fork(ip->i_cowfp);
     141   103563436 :                 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
     142             :         }
     143   478843286 :         if (ip->i_itemp) {
     144    79854909 :                 ASSERT(!test_bit(XFS_LI_IN_AIL,
     145             :                                  &ip->i_itemp->ili_item.li_flags));
     146    79854909 :                 xfs_inode_item_destroy(ip);
     147    79951133 :                 ip->i_itemp = NULL;
     148             :         }
     149             : 
     150   478939510 :         kmem_cache_free(xfs_inode_cache, ip);
     151   479008446 : }
     152             : 
     153             : static void
     154   480294272 : __xfs_inode_free(
     155             :         struct xfs_inode        *ip)
     156             : {
     157             :         /* asserts to verify all state is correct here */
     158   480294272 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
     159   480294272 :         ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
     160   480294272 :         XFS_STATS_DEC(ip->i_mount, vn_active);
     161             : 
     162   480294280 :         call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
     163   480294286 : }
     164             : 
     165             : void
     166      788795 : xfs_inode_free(
     167             :         struct xfs_inode        *ip)
     168             : {
     169     1577592 :         ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
     170             : 
     171             :         /*
     172             :          * Because we use RCU freeing we need to ensure the inode always
     173             :          * appears to be reclaimed with an invalid inode number when in the
     174             :          * free state. The ip->i_flags_lock provides the barrier against lookup
     175             :          * races.
     176             :          */
     177      788797 :         spin_lock(&ip->i_flags_lock);
     178      788796 :         ip->i_flags = XFS_IRECLAIM;
     179      788796 :         ip->i_ino = 0;
     180      788796 :         spin_unlock(&ip->i_flags_lock);
     181             : 
     182      788796 :         __xfs_inode_free(ip);
     183      788794 : }
     184             : 
     185             : /*
     186             :  * Queue background inode reclaim work if there are reclaimable inodes and there
     187             :  * isn't reclaim work already scheduled or in progress.
     188             :  */
     189             : static void
     190    14796389 : xfs_reclaim_work_queue(
     191             :         struct xfs_mount        *mp)
     192             : {
     193             : 
     194    14796389 :         rcu_read_lock();
     195    14796966 :         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
     196    14781879 :                 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
     197    14784779 :                         msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
     198             :         }
     199    14795577 :         rcu_read_unlock();
     200    14791126 : }
     201             : 
     202             : /*
     203             :  * Background scanning to trim preallocated space. This is queued based on the
     204             :  * 'speculative_prealloc_lifetime' tunable (5m by default).
     205             :  */
     206             : static inline void
     207     3851520 : xfs_blockgc_queue(
     208             :         struct xfs_perag        *pag)
     209             : {
     210     3851520 :         struct xfs_mount        *mp = pag->pag_mount;
     211             : 
     212     7703040 :         if (!xfs_is_blockgc_enabled(mp))
     213             :                 return;
     214             : 
     215     3851410 :         rcu_read_lock();
     216     3851344 :         if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
     217     3795744 :                 queue_delayed_work(pag->pag_mount->m_blockgc_wq,
     218             :                                    &pag->pag_blockgc_work,
     219     3795735 :                                    msecs_to_jiffies(xfs_blockgc_secs * 1000));
     220     3851497 :         rcu_read_unlock();
     221             : }
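
xfs_blockgc_queue() only arms the delayed work; the periodic behaviour comes
from the worker re-queueing itself after each pass. A sketch of that
self-rearming pattern, with an illustrative worker name (the real blockgc
worker is outside the excerpt shown here):

        /* Sketch: a delayed-work handler that re-arms itself each pass. */
        static void
        example_blockgc_worker(
                struct work_struct      *work)
        {
                struct xfs_perag        *pag = container_of(to_delayed_work(work),
                                                struct xfs_perag, pag_blockgc_work);

                /* ... trim posteof/cow preallocations for this AG ... */
                xfs_blockgc_queue(pag); /* re-arm while inodes remain tagged */
        }
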
     222             : 
     223             : /* Set a tag on both the AG incore inode tree and the AG radix tree. */
     224             : static void
     225  1121699424 : xfs_perag_set_inode_tag(
     226             :         struct xfs_perag        *pag,
     227             :         xfs_agino_t             agino,
     228             :         unsigned int            tag)
     229             : {
     230  1121699424 :         struct xfs_mount        *mp = pag->pag_mount;
     231  1121699424 :         bool                    was_tagged;
     232             : 
     233  1121699424 :         lockdep_assert_held(&pag->pag_ici_lock);
     234             : 
     235  1121699424 :         was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
     236  1121682620 :         radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
     237             : 
     238  1121703551 :         if (tag == XFS_ICI_RECLAIM_TAG)
     239  1115779083 :                 pag->pag_ici_reclaimable++;
     240             : 
     241  1121703551 :         if (was_tagged)
     242             :                 return;
     243             : 
     244             :         /* propagate the tag up into the perag radix tree */
     245    15167095 :         spin_lock(&mp->m_perag_lock);
     246    15187101 :         radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
     247    15187101 :         spin_unlock(&mp->m_perag_lock);
     248             : 
     249             :         /* start background work */
     250    15187077 :         switch (tag) {
     251    14694737 :         case XFS_ICI_RECLAIM_TAG:
     252    14694737 :                 xfs_reclaim_work_queue(mp);
     253    14694737 :                 break;
     254      492340 :         case XFS_ICI_BLOCKGC_TAG:
     255      492340 :                 xfs_blockgc_queue(pag);
     256      492340 :                 break;
     257             :         }
     258             : 
     259    15172555 :         trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
     260             : }
     261             : 
     262             : /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
     263             : static void
     264  1138009556 : xfs_perag_clear_inode_tag(
     265             :         struct xfs_perag        *pag,
     266             :         xfs_agino_t             agino,
     267             :         unsigned int            tag)
     268             : {
     269  1138009556 :         struct xfs_mount        *mp = pag->pag_mount;
     270             : 
     271  1138009556 :         lockdep_assert_held(&pag->pag_ici_lock);
     272             : 
     273             :         /*
     274             :          * Reclaim can signal (with a null agino) that it cleared its own tag
     275             :          * by removing the inode from the radix tree.
     276             :          */
     277  1138009556 :         if (agino != NULLAGINO)
     278   658503900 :                 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
     279             :         else
     280   479505656 :                 ASSERT(tag == XFS_ICI_RECLAIM_TAG);
     281             : 
     282  1137980278 :         if (tag == XFS_ICI_RECLAIM_TAG)
     283  1115918218 :                 pag->pag_ici_reclaimable--;
     284             : 
     285  1137980278 :         if (radix_tree_tagged(&pag->pag_ici_root, tag))
     286             :                 return;
     287             : 
     288             :         /* clear the tag from the perag radix tree */
     289    27595040 :         spin_lock(&mp->m_perag_lock);
     290    27695911 :         radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
     291    27695911 :         spin_unlock(&mp->m_perag_lock);
     292             : 
     293    27695336 :         trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
     294             : }
     295             : 
     296             : /*
     297             :  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
      298             :  * part of the structure. This is made more complex by the fact that we store
     299             :  * information about the on-disk values in the VFS inode and so we can't just
     300             :  * overwrite the values unconditionally. Hence we save the parameters we
     301             :  * need to retain across reinitialisation, and rewrite them into the VFS inode
     302             :  * after reinitialisation even if it fails.
     303             :  */
     304             : static int
     305   636249806 : xfs_reinit_inode(
     306             :         struct xfs_mount        *mp,
     307             :         struct inode            *inode)
     308             : {
     309   636249806 :         int                     error;
     310   636249806 :         uint32_t                nlink = inode->i_nlink;
     311   636249806 :         uint32_t                generation = inode->i_generation;
     312   636249806 :         uint64_t                version = inode_peek_iversion(inode);
     313   636249806 :         umode_t                 mode = inode->i_mode;
     314   636249806 :         dev_t                   dev = inode->i_rdev;
     315   636249806 :         kuid_t                  uid = inode->i_uid;
     316   636249806 :         kgid_t                  gid = inode->i_gid;
     317             : 
     318   636249806 :         error = inode_init_always(mp->m_super, inode);
     319             : 
     320   636173580 :         set_nlink(inode, nlink);
     321   636001626 :         inode->i_generation = generation;
     322   636001626 :         inode_set_iversion_queried(inode, version);
     323   636001626 :         inode->i_mode = mode;
     324   636001626 :         inode->i_rdev = dev;
     325   636001626 :         inode->i_uid = uid;
     326   636001626 :         inode->i_gid = gid;
     327   636001626 :         mapping_set_large_folios(inode->i_mapping);
     328   635972201 :         return error;
     329             : }
     330             : 
     331             : /*
     332             :  * Carefully nudge an inode whose VFS state has been torn down back into a
     333             :  * usable state.  Drops the i_flags_lock and the rcu read lock.
     334             :  */
     335             : static int
     336   635926778 : xfs_iget_recycle(
     337             :         struct xfs_perag        *pag,
     338             :         struct xfs_inode        *ip) __releases(&ip->i_flags_lock)
     339             : {
     340   635926778 :         struct xfs_mount        *mp = ip->i_mount;
     341   635926778 :         struct inode            *inode = VFS_I(ip);
     342   635926778 :         int                     error;
     343             : 
     344   635926778 :         trace_xfs_iget_recycle(ip);
     345             : 
     346   635834120 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     347             :                 return -EAGAIN;
     348             : 
     349             :         /*
     350             :          * We need to make it look like the inode is being reclaimed to prevent
     351             :          * the actual reclaim workers from stomping over us while we recycle
     352             :          * the inode.  We can't clear the radix tree tag yet as it requires
     353             :          * pag_ici_lock to be held exclusive.
     354             :          */
     355   636013193 :         ip->i_flags |= XFS_IRECLAIM;
     356             : 
     357   636013193 :         spin_unlock(&ip->i_flags_lock);
     358   636308707 :         rcu_read_unlock();
     359             : 
     360   636247411 :         ASSERT(!rwsem_is_locked(&inode->i_rwsem));
     361   636247411 :         error = xfs_reinit_inode(mp, inode);
     362   635983775 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     363   635998243 :         if (error) {
     364             :                 /*
     365             :                  * Re-initializing the inode failed, and we are in deep
     366             :                  * trouble.  Try to re-add it to the reclaim list.
     367             :                  */
     368           0 :                 rcu_read_lock();
     369           0 :                 spin_lock(&ip->i_flags_lock);
     370           0 :                 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
     371           0 :                 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
     372           0 :                 spin_unlock(&ip->i_flags_lock);
     373           0 :                 rcu_read_unlock();
     374             : 
     375           0 :                 trace_xfs_iget_recycle_fail(ip);
     376           0 :                 return error;
     377             :         }
     378             : 
     379   635998243 :         spin_lock(&pag->pag_ici_lock);
     380   636471198 :         spin_lock(&ip->i_flags_lock);
     381             : 
     382             :         /*
     383             :          * Clear the per-lifetime state in the inode as we are now effectively
     384             :          * a new inode and need to return to the initial state before reuse
     385             :          * occurs.
     386             :          */
     387   636519609 :         ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
     388   636519609 :         ip->i_flags |= XFS_INEW;
     389   636519609 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
     390             :                         XFS_ICI_RECLAIM_TAG);
     391   636342686 :         inode->i_state = I_NEW;
     392   636342686 :         spin_unlock(&ip->i_flags_lock);
     393   636514233 :         spin_unlock(&pag->pag_ici_lock);
     394             : 
     395   636514233 :         return 0;
     396             : }
     397             : 
     398             : /*
      399             :  * If we are allocating a new inode, then check that what was returned
      400             :  * is actually a free, empty inode. If we are not allocating an inode,
      401             :  * then check that we didn't find a free inode.
     402             :  *
     403             :  * Returns:
     404             :  *      0               if the inode free state matches the lookup context
     405             :  *      -ENOENT         if the inode is free and we are not allocating
     406             :  *      -EFSCORRUPTED   if there is any state mismatch at all
     407             :  */
     408             : static int
     409 >11424*10^7 : xfs_iget_check_free_state(
     410             :         struct xfs_inode        *ip,
     411             :         int                     flags)
     412             : {
     413 >11424*10^7 :         if (flags & XFS_IGET_CREATE) {
     414             :                 /* should be a free inode */
     415   127249482 :                 if (VFS_I(ip)->i_mode != 0) {
     416           0 :                         xfs_warn(ip->i_mount,
     417             : "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
     418             :                                 ip->i_ino, VFS_I(ip)->i_mode);
     419           0 :                         xfs_agno_mark_sick(ip->i_mount,
     420           0 :                                         XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
     421             :                                         XFS_SICK_AG_INOBT);
     422           0 :                         return -EFSCORRUPTED;
     423             :                 }
     424             : 
     425   127249482 :                 if (ip->i_nblocks != 0) {
     426           0 :                         xfs_warn(ip->i_mount,
     427             : "Corruption detected! Free inode 0x%llx has blocks allocated!",
     428             :                                 ip->i_ino);
     429           0 :                         xfs_agno_mark_sick(ip->i_mount,
     430           0 :                                         XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
     431             :                                         XFS_SICK_AG_INOBT);
     432           0 :                         return -EFSCORRUPTED;
     433             :                 }
     434             :                 return 0;
     435             :         }
     436             : 
     437             :         /* should be an allocated inode */
     438 >11412*10^7 :         if (VFS_I(ip)->i_mode == 0)
     439     2768542 :                 return -ENOENT;
     440             : 
     441             :         return 0;
     442             : }
     443             : 
     444             : /* Make all pending inactivation work start immediately. */
     445             : static bool
     446    34814650 : xfs_inodegc_queue_all(
     447             :         struct xfs_mount        *mp)
     448             : {
     449    34814650 :         struct xfs_inodegc      *gc;
     450    34814650 :         int                     cpu;
     451    34814650 :         bool                    ret = false;
     452             : 
     453   173931481 :         for_each_online_cpu(cpu) {
     454   139081044 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     455   139106276 :                 if (!llist_empty(&gc->list)) {
     456     3912409 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
     457     3912409 :                         ret = true;
     458             :                 }
     459             :         }
     460             : 
     461    34810308 :         return ret;
     462             : }
     463             : 
     464             : /* Wait for all queued work and collect errors */
     465             : static int
     466    11431440 : xfs_inodegc_wait_all(
     467             :         struct xfs_mount        *mp)
     468             : {
     469    11431440 :         int                     cpu;
     470    11431440 :         int                     error = 0;
     471             : 
     472    11431440 :         flush_workqueue(mp->m_inodegc_wq);
     473    68664741 :         for_each_online_cpu(cpu) {
     474    45785359 :                 struct xfs_inodegc      *gc;
     475             : 
     476    45785359 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     477    45785580 :                 if (gc->error && !error)
     478        2313 :                         error = gc->error;
     479    45785580 :                 gc->error = 0;
     480             :         }
     481             : 
     482    11447232 :         return error;
     483             : }
     484             : 
     485             : /*
      486             :  * Check the validity of the inode we just found in the cache
     487             :  */
     488             : static int
     489 >11320*10^7 : xfs_iget_cache_hit(
     490             :         struct xfs_perag        *pag,
     491             :         struct xfs_inode        *ip,
     492             :         xfs_ino_t               ino,
     493             :         int                     flags,
     494             :         int                     lock_flags) __releases(RCU)
     495             : {
     496 >11320*10^7 :         struct inode            *inode = VFS_I(ip);
     497 >11320*10^7 :         struct xfs_mount        *mp = ip->i_mount;
     498 >11320*10^7 :         int                     error;
     499             : 
     500             :         /*
     501             :          * check for re-use of an inode within an RCU grace period due to the
     502             :          * radix tree nodes not being updated yet. We monitor for this by
     503             :          * setting the inode number to zero before freeing the inode structure.
     504             :          * If the inode has been reallocated and set up, then the inode number
     505             :          * will not match, so check for that, too.
     506             :          */
     507 >11320*10^7 :         spin_lock(&ip->i_flags_lock);
     508 >11412*10^7 :         if (ip->i_ino != ino)
     509           9 :                 goto out_skip;
     510             : 
     511             :         /*
     512             :          * If we are racing with another cache hit that is currently
     513             :          * instantiating this inode or currently recycling it out of
     514             :          * reclaimable state, wait for the initialisation to complete
     515             :          * before continuing.
     516             :          *
     517             :          * If we're racing with the inactivation worker we also want to wait.
     518             :          * If we're creating a new file, it's possible that the worker
     519             :          * previously marked the inode as free on disk but hasn't finished
     520             :          * updating the incore state yet.  The AGI buffer will be dirty and
     521             :          * locked to the icreate transaction, so a synchronous push of the
     522             :          * inodegc workers would result in deadlock.  For a regular iget, the
     523             :          * worker is running already, so we might as well wait.
     524             :          *
     525             :          * XXX(hch): eventually we should do something equivalent to
     526             :          *           wait_on_inode to wait for these flags to be cleared
     527             :          *           instead of polling for it.
     528             :          */
     529 >11412*10^7 :         if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
     530      411215 :                 goto out_skip;
     531             : 
     532 >11412*10^7 :         if (ip->i_flags & XFS_NEED_INACTIVE) {
     533             :                 /* Unlinked inodes cannot be re-grabbed. */
     534     5768002 :                 if (VFS_I(ip)->i_nlink == 0) {
     535     5745312 :                         error = -ENOENT;
     536     5745312 :                         goto out_error;
     537             :                 }
     538       22690 :                 goto out_inodegc_flush;
     539             :         }
     540             : 
     541             :         /*
     542             :          * Check the inode free state is valid. This also detects lookup
     543             :          * racing with unlinks.
     544             :          */
     545 >11412*10^7 :         error = xfs_iget_check_free_state(ip, flags);
     546 >11360*10^7 :         if (error)
     547     2768500 :                 goto out_error;
     548             : 
     549             :         /* Skip inodes that have no vfs state. */
     550 >11360*10^7 :         if ((flags & XFS_IGET_INCORE) &&
     551           0 :             (ip->i_flags & XFS_IRECLAIMABLE))
     552           0 :                 goto out_skip;
     553             : 
     554             :         /* The inode fits the selection criteria; process it. */
     555 >11360*10^7 :         if (ip->i_flags & XFS_IRECLAIMABLE) {
     556             :                 /* Drops i_flags_lock and RCU read lock. */
     557   635948452 :                 error = xfs_iget_recycle(pag, ip);
     558   636509090 :                 if (error == -EAGAIN)
     559           4 :                         goto out_skip;
     560   636509086 :                 if (error)
     561             :                         return error;
     562             :         } else {
     563             :                 /* If the VFS inode is being torn down, pause and try again. */
     564 >11296*10^7 :                 if (!igrab(inode))
     565      378800 :                         goto out_skip;
     566             : 
     567             :                 /* We've got a live one. */
     568 >11319*10^7 :                 spin_unlock(&ip->i_flags_lock);
     569 >11332*10^7 :                 rcu_read_unlock();
     570 >11330*10^7 :                 trace_xfs_iget_hit(ip);
     571             :         }
     572             : 
     573 >11353*10^7 :         if (lock_flags != 0)
     574 72459523670 :                 xfs_ilock(ip, lock_flags);
     575             : 
     576 >11366*10^7 :         if (!(flags & XFS_IGET_INCORE))
     577 >11362*10^7 :                 xfs_iflags_clear(ip, XFS_ISTALE);
     578 >11402*10^7 :         XFS_STATS_INC(mp, xs_ig_found);
     579             : 
     580 >11367*10^7 :         return 0;
     581             : 
     582      790028 : out_skip:
     583      790028 :         trace_xfs_iget_skip(ip);
     584      789050 :         XFS_STATS_INC(mp, xs_ig_frecycle);
     585      788977 :         error = -EAGAIN;
     586     9302789 : out_error:
     587     9302789 :         spin_unlock(&ip->i_flags_lock);
     588     9303954 :         rcu_read_unlock();
     589     9303954 :         return error;
     590             : 
     591             : out_inodegc_flush:
     592       22690 :         spin_unlock(&ip->i_flags_lock);
     593       22690 :         rcu_read_unlock();
     594             :         /*
     595             :          * Do not wait for the workers, because the caller could hold an AGI
     596             :          * buffer lock.  We're just going to sleep in a loop anyway.
     597             :          */
     598       45380 :         if (xfs_is_inodegc_enabled(mp))
     599       22690 :                 xfs_inodegc_queue_all(mp);
     600             :         return -EAGAIN;
     601             : }
     602             : 
     603             : static int
     604   479663620 : xfs_iget_cache_miss(
     605             :         struct xfs_mount        *mp,
     606             :         struct xfs_perag        *pag,
     607             :         xfs_trans_t             *tp,
     608             :         xfs_ino_t               ino,
     609             :         struct xfs_inode        **ipp,
     610             :         int                     flags,
     611             :         int                     lock_flags)
     612             : {
     613   479663620 :         struct xfs_inode        *ip;
     614   479663620 :         int                     error;
     615   479663620 :         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
     616   479663620 :         int                     iflags;
     617             : 
     618   479663620 :         ip = xfs_inode_alloc(mp, ino);
     619   479914004 :         if (!ip)
     620             :                 return -ENOMEM;
     621             : 
     622   479914004 :         error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
     623   479875737 :         if (error)
     624      100472 :                 goto out_destroy;
     625             : 
     626             :         /*
     627             :          * For version 5 superblocks, if we are initialising a new inode and we
     628             :          * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
     629             :          * simply build the new inode core with a random generation number.
     630             :          *
     631             :          * For version 4 (and older) superblocks, log recovery is dependent on
     632             :          * the i_flushiter field being initialised from the current on-disk
     633             :          * value and hence we must also read the inode off disk even when
     634             :          * initializing new inodes.
     635             :          */
     636   479775265 :         if (xfs_has_v3inodes(mp) &&
     637   479773167 :             (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
     638    62235434 :                 VFS_I(ip)->i_generation = get_random_u32();
     639             :         } else {
     640   417539831 :                 struct xfs_buf          *bp;
     641             : 
     642   417539831 :                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
     643   417365419 :                 if (error)
     644        7044 :                         goto out_destroy;
     645             : 
     646   417349993 :                 error = xfs_inode_from_disk(ip,
     647   417359477 :                                 xfs_buf_offset(bp, ip->i_imap.im_boffset));
     648   417351238 :                 if (!error)
     649   417350136 :                         xfs_buf_set_ref(bp, XFS_INO_REF);
     650             :                 else
     651        1102 :                         xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
     652   417352046 :                 xfs_trans_brelse(tp, bp);
     653             : 
     654   417375235 :                 if (error)
     655        1102 :                         goto out_destroy;
     656             :         }
     657             : 
     658   479612589 :         trace_xfs_iget_miss(ip);
     659             : 
     660             :         /*
     661             :          * Check the inode free state is valid. This also detects lookup
     662             :          * racing with unlinks.
     663             :          */
     664   479402414 :         error = xfs_iget_check_free_state(ip, flags);
     665   479430614 :         if (error)
     666           1 :                 goto out_destroy;
     667             : 
     668             :         /*
     669             :          * Preload the radix tree so we can insert safely under the
     670             :          * write spinlock. Note that we cannot sleep inside the preload
     671             :          * region. Since we can be called from transaction context, don't
     672             :          * recurse into the file system.
     673             :          */
     674   479430613 :         if (radix_tree_preload(GFP_NOFS)) {
     675           0 :                 error = -EAGAIN;
     676           0 :                 goto out_destroy;
     677             :         }
     678             : 
     679             :         /*
     680             :          * Because the inode hasn't been added to the radix-tree yet it can't
     681             :          * be found by another thread, so we can do the non-sleeping lock here.
     682             :          */
     683   479453856 :         if (lock_flags) {
     684   458447551 :                 if (!xfs_ilock_nowait(ip, lock_flags))
     685           0 :                         BUG();
     686             :         }
     687             : 
     688             :         /*
     689             :          * These values must be set before inserting the inode into the radix
     690             :          * tree as the moment it is inserted a concurrent lookup (allowed by the
     691             :          * RCU locking mechanism) can find it and that lookup must see that this
     692             :          * is an inode currently under construction (i.e. that XFS_INEW is set).
     693             :          * The ip->i_flags_lock that protects the XFS_INEW flag forms the
     694             :          * memory barrier that ensures this detection works correctly at lookup
     695             :          * time.
     696             :          */
     697   479563678 :         iflags = XFS_INEW;
     698   479563678 :         if (flags & XFS_IGET_DONTCACHE)
     699   396365340 :                 d_mark_dontcache(VFS_I(ip));
     700   479569131 :         ip->i_udquot = NULL;
     701   479569131 :         ip->i_gdquot = NULL;
     702   479569131 :         ip->i_pdquot = NULL;
     703   479569131 :         xfs_iflags_set(ip, iflags);
     704             : 
     705             :         /* insert the new inode */
     706   480027340 :         spin_lock(&pag->pag_ici_lock);
     707   480073730 :         error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
     708   479619044 :         if (unlikely(error)) {
     709      681279 :                 WARN_ON(error != -EEXIST);
     710      681279 :                 XFS_STATS_INC(mp, xs_ig_dup);
     711      681279 :                 error = -EAGAIN;
     712      681279 :                 goto out_preload_end;
     713             :         }
     714   478937765 :         spin_unlock(&pag->pag_ici_lock);
     715   479150632 :         radix_tree_preload_end();
     716             : 
     717   478577489 :         *ipp = ip;
     718   478577489 :         return 0;
     719             : 
     720             : out_preload_end:
     721      681279 :         spin_unlock(&pag->pag_ici_lock);
     722      681279 :         radix_tree_preload_end();
     723      681279 :         if (lock_flags)
     724      676204 :                 xfs_iunlock(ip, lock_flags);
     725        5075 : out_destroy:
     726      788796 :         __destroy_inode(VFS_I(ip));
     727      788795 :         xfs_inode_free(ip);
     728      788795 :         return error;
     729             : }
     730             : 
     731             : /*
     732             :  * Look up an inode by number in the given file system.  The inode is looked up
     733             :  * in the cache held in each AG.  If the inode is found in the cache, initialise
     734             :  * the vfs inode if necessary.
     735             :  *
     736             :  * If it is not in core, read it in from the file system's device, add it to the
     737             :  * cache and initialise the vfs inode.
     738             :  *
     739             :  * The inode is locked according to the value of the lock_flags parameter.
     740             :  * Inode lookup is only done during metadata operations and not as part of the
     741             :  * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
     742             :  */
     743             : int
     744 >11406*10^7 : xfs_iget(
     745             :         struct xfs_mount        *mp,
     746             :         struct xfs_trans        *tp,
     747             :         xfs_ino_t               ino,
     748             :         uint                    flags,
     749             :         uint                    lock_flags,
     750             :         struct xfs_inode        **ipp)
     751             : {
     752 >11406*10^7 :         struct xfs_inode        *ip;
     753 >11406*10^7 :         struct xfs_perag        *pag;
     754 >11406*10^7 :         xfs_agino_t             agino;
     755 >11406*10^7 :         int                     error;
     756             : 
     757 >11406*10^7 :         ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
     758             : 
     759             :         /* reject inode numbers outside existing AGs */
     760 >11406*10^7 :         if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
     761      841362 :                 return -EINVAL;
     762             : 
     763 >11405*10^7 :         XFS_STATS_INC(mp, xs_ig_attempts);
     764             : 
     765             :         /* get the perag structure and ensure that it's inode capable */
     766 >11264*10^7 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
     767 >11426*10^7 :         agino = XFS_INO_TO_AGINO(mp, ino);
     768             : 
     769 >11427*10^7 : again:
     770 >11427*10^7 :         error = 0;
     771 >11427*10^7 :         rcu_read_lock();
     772 >11408*10^7 :         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
     773             : 
     774 >11420*10^7 :         if (ip) {
     775 >11372*10^7 :                 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
     776 >11336*10^7 :                 if (error)
     777     9326354 :                         goto out_error_or_again;
     778             :         } else {
     779   479923259 :                 rcu_read_unlock();
     780   479896128 :                 if (flags & XFS_IGET_INCORE) {
     781           0 :                         error = -ENODATA;
     782           0 :                         goto out_error_or_again;
     783             :                 }
     784   479896128 :                 XFS_STATS_INC(mp, xs_ig_missed);
     785             : 
     786   479630324 :                 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
     787             :                                                         flags, lock_flags);
     788   479315952 :                 if (error)
     789      788794 :                         goto out_error_or_again;
     790             :         }
     791 >11383*10^7 :         xfs_perag_put(pag);
     792             : 
     793 >11451*10^7 :         *ipp = ip;
     794             : 
     795             :         /*
     796             :          * If we have a real type for an on-disk inode, we can setup the inode
     797             :          * now.  If it's a new inode being created, xfs_init_new_inode will
     798             :          * handle it.
     799             :          */
     800 >22893*10^7 :         if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
     801   987651363 :                 xfs_setup_existing_inode(ip);
     802             :         return 0;
     803             : 
     804    10115148 : out_error_or_again:
     805    10115148 :         if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
     806             :             error == -EAGAIN) {
     807     1420621 :                 delay(1);
     808     1411098 :                 goto again;
     809             :         }
     810     8694527 :         xfs_perag_put(pag);
     811     8694527 :         return error;
     812             : }
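
A hedged caller sketch for xfs_iget() (not part of this file): a typical
metadata-path lookup passes a NULL transaction, takes the ILOCK through
lock_flags, and drops both the lock and the reference when done. The wrapper
below is illustrative; xfs_iunlock(), xfs_irele() and i_disk_size are
standard XFS names, but this exact helper does not exist:

        /* Illustrative caller: read an allocated inode's size. */
        static int
        example_inode_size(
                struct xfs_mount        *mp,
                xfs_ino_t               ino,
                xfs_fsize_t             *size)
        {
                struct xfs_inode        *ip;
                int                     error;

                error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
                if (error)
                        return error;   /* e.g. -ENOENT for a free inode */

                *size = ip->i_disk_size;
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                xfs_irele(ip);          /* drop the reference xfs_iget() took */
                return 0;
        }
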
     813             : 
     814             : /*
     815             :  * Grab the inode for reclaim exclusively.
     816             :  *
     817             :  * We have found this inode via a lookup under RCU, so the inode may have
     818             :  * already been freed, or it may be in the process of being recycled by
     819             :  * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
     820             :  * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
     821             :  * will not be set. Hence we need to check for both these flag conditions to
     822             :  * avoid inodes that are no longer reclaim candidates.
     823             :  *
     824             :  * Note: checking for other state flags here, under the i_flags_lock or not, is
     825             :  * racy and should be avoided. Those races should be resolved only after we have
     826             :  * ensured that we are able to reclaim this inode and the world can see that we
     827             :  * are going to reclaim it.
     828             :  *
     829             :  * Return true if we grabbed it, false otherwise.
     830             :  */
     831             : static bool
     832   496911629 : xfs_reclaim_igrab(
     833             :         struct xfs_inode        *ip,
     834             :         struct xfs_icwalk       *icw)
     835             : {
     836   496911629 :         ASSERT(rcu_read_lock_held());
     837             : 
     838   496911629 :         spin_lock(&ip->i_flags_lock);
     839   496912040 :         if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
     840             :             __xfs_iflags_test(ip, XFS_IRECLAIM)) {
     841             :                 /* not a reclaim candidate. */
     842        8309 :                 spin_unlock(&ip->i_flags_lock);
     843        8309 :                 return false;
     844             :         }
     845             : 
     846             :         /* Don't reclaim a sick inode unless the caller asked for it. */
     847   496903731 :         if (ip->i_sick &&
     848       52025 :             (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
     849           0 :                 spin_unlock(&ip->i_flags_lock);
     850           0 :                 return false;
     851             :         }
     852             : 
     853   496903731 :         __xfs_iflags_set(ip, XFS_IRECLAIM);
     854   496903731 :         spin_unlock(&ip->i_flags_lock);
     855   496903731 :         return true;
     856             : }
     857             : 
     858             : /*
     859             :  * Inode reclaim is non-blocking, so the default action if progress cannot be
     860             :  * made is to "requeue" the inode for reclaim by unlocking it and clearing the
     861             :  * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
      862             :  * blocking anymore, and hence we can wait on the inode until we are able
      863             :  * to reclaim it.
     864             :  *
     865             :  * We do no IO here - if callers require inodes to be cleaned they must push the
     866             :  * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
     867             :  * done in the background in a non-blocking manner, and enables memory reclaim
     868             :  * to make progress without blocking.
     869             :  */
     870             : static void
     871   496903701 : xfs_reclaim_inode(
     872             :         struct xfs_inode        *ip,
     873             :         struct xfs_perag        *pag)
     874             : {
     875   496903701 :         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
     876             : 
     877   496903701 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     878       14876 :                 goto out;
     879   496888555 :         if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
     880     9027171 :                 goto out_iunlock;
     881             : 
     882             :         /*
     883             :          * Check for log shutdown because aborting the inode can move the log
      884             :          * tail and corrupt in-memory state. This is fine if the log is shut
     885             :          * down, but if the log is still active and only the mount is shut down
     886             :          * then the in-memory log tail movement caused by the abort can be
     887             :          * incorrectly propagated to disk.
     888             :          */
     889   975723698 :         if (xlog_is_shutdown(ip->i_mount->m_log)) {
     890   361223827 :                 xfs_iunpin_wait(ip);
     891   361223827 :                 xfs_iflush_shutdown_abort(ip);
     892   361223828 :                 goto reclaim;
     893             :         }
     894   126638022 :         if (xfs_ipincount(ip))
     895     3517243 :                 goto out_clear_flush;
     896   123120779 :         if (!xfs_inode_clean(ip))
     897     4838941 :                 goto out_clear_flush;
     898             : 
     899   118281838 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
     900   479505579 : reclaim:
     901   479505579 :         trace_xfs_inode_reclaiming(ip);
     902             : 
     903             :         /*
     904             :          * Because we use RCU freeing we need to ensure the inode always appears
     905             :          * to be reclaimed with an invalid inode number when in the free state.
     906             :          * We do this as early as possible under the ILOCK so that
     907             :          * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
     908             :          * detect races with us here. By doing this, we guarantee that once
     909             :          * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
     910             :          * it will see either a valid inode that will serialise correctly, or it
     911             :          * will see an invalid inode that it can skip.
     912             :          */
     913   479505527 :         spin_lock(&ip->i_flags_lock);
     914   479505585 :         ip->i_flags = XFS_IRECLAIM;
     915   479505585 :         ip->i_ino = 0;
     916   479505585 :         ip->i_sick = 0;
     917   479505585 :         ip->i_checked = 0;
     918   479505585 :         spin_unlock(&ip->i_flags_lock);
     919             : 
     920   479505676 :         ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
     921   479505676 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     922             : 
     923   479505497 :         XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
     924             :         /*
     925             :          * Remove the inode from the per-AG radix tree.
     926             :          *
      927             :          * Because radix_tree_delete won't complain even if the item was never
      928             :          * added to the tree, assert that it's been there before to catch
      929             :          * problems with the inode lifetime early on.
     930             :          */
     931   479505511 :         spin_lock(&pag->pag_ici_lock);
     932   959011368 :         if (!xfs_is_shutdown(pag->pag_mount)) {
     933             :                 /* had better not be on any unlinked list! */
     934   118281828 :                 ASSERT(!xfs_inode_on_unlinked_list(ip));
     935   118281828 :                 if (xfs_inode_on_unlinked_list(ip))
     936           0 :                         xfs_emerg(pag->pag_mount, "IUNLINK ino 0x%llx nlink %u mode 0o%o prevun 0x%x nextun 0x%x", ino, VFS_I(ip)->i_nlink, VFS_I(ip)->i_mode, ip->i_prev_unlinked, ip->i_next_unlinked);
     937             :         }
     938   479505652 :         if (!radix_tree_delete(&pag->pag_ici_root,
     939   479505684 :                                 XFS_INO_TO_AGINO(ip->i_mount, ino)))
     940           0 :                 ASSERT(0);
     941   479505652 :         xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
     942   479505514 :         spin_unlock(&pag->pag_ici_lock);
     943             : 
     944             :         /*
     945             :          * Here we do an (almost) spurious inode lock in order to coordinate
     946             :          * with inode cache radix tree lookups.  This is because the lookup
     947             :          * can reference the inodes in the cache without taking references.
     948             :          *
     949             :          * We make that OK here by ensuring that we wait until the inode is
     950             :          * unlocked after the lookup before we go ahead and free it.
     951             :          */
     952   479505627 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     953   479505595 :         ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
     954   479505595 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     955   559591406 :         ASSERT(xfs_inode_clean(ip));
     956             : 
     957   479505611 :         __xfs_inode_free(ip);
     958   479505611 :         return;
     959             : 
     960     8356184 : out_clear_flush:
     961     8356184 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
     962    17383355 : out_iunlock:
     963    17383355 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     964    17398232 : out:
     965    17398232 :         xfs_iflags_clear(ip, XFS_IRECLAIM);
     966             : }
     967             : 
     968             : /* Reclaim sick inodes if we're unmounting or the fs went down. */
     969             : static inline bool
     970      115138 : xfs_want_reclaim_sick(
     971             :         struct xfs_mount        *mp)
     972             : {
     973      284702 :         return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
     974             :                xfs_is_shutdown(mp);
     975             : }
     976             : 
     977             : void
     978       60712 : xfs_reclaim_inodes(
     979             :         struct xfs_mount        *mp)
     980             : {
     981       60712 :         struct xfs_icwalk       icw = {
     982             :                 .icw_flags      = 0,
     983             :         };
     984             : 
     985       60712 :         if (xfs_want_reclaim_sick(mp))
     986       60712 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
     987             : 
     988      366875 :         while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
     989      306163 :                 xfs_ail_push_all_sync(mp->m_ail);
     990      306163 :                 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
     991             :         }
     992       60712 : }
     993             : 
     994             : /*
     995             :  * The shrinker infrastructure determines how many inodes we should scan for
     996             :  * reclaim. We want as many clean inodes ready to reclaim as possible, so we
     997             :  * push the AIL here. We also want to proactively free up memory if we can to
      998             :  * minimise the amount of work memory reclaim has to do, so we kick the
     999             :  * background reclaim if it isn't already scheduled.
    1000             :  */
    1001             : long
    1002       54426 : xfs_reclaim_inodes_nr(
    1003             :         struct xfs_mount        *mp,
    1004             :         unsigned long           nr_to_scan)
    1005             : {
    1006       54426 :         struct xfs_icwalk       icw = {
    1007             :                 .icw_flags      = XFS_ICWALK_FLAG_SCAN_LIMIT,
    1008       54426 :                 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
    1009             :         };
    1010             : 
    1011       54426 :         if (xfs_want_reclaim_sick(mp))
    1012           0 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
    1013             : 
    1014             :         /* kick background reclaimer and push the AIL */
    1015       54426 :         xfs_reclaim_work_queue(mp);
    1016       54426 :         xfs_ail_push_all(mp->m_ail);
    1017             : 
    1018       54426 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
    1019       54426 :         return 0;
    1020             : }
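
A userspace sketch of the scan-limit plumbing above: the shrinker's unsigned
count is clamped into a signed per-walk budget, and each processed batch is
charged against that budget until it runs out.  Everything here (toy_scan,
TOY_BATCH) is a hypothetical stand-in for illustration, not the kernel API.

    #include <limits.h>

    #define TOY_BATCH 32                    /* stands in for XFS_LOOKUP_BATCH */

    static long toy_scan(unsigned long nr_to_scan)
    {
            /* clamp, as min_t(unsigned long, LONG_MAX, nr_to_scan) does */
            long limit = nr_to_scan > LONG_MAX ? LONG_MAX : (long)nr_to_scan;
            long scanned = 0;

            while (limit > 0) {
                    /* ... process one batch of up to TOY_BATCH inodes ... */
                    scanned += TOY_BATCH;
                    limit -= TOY_BATCH;     /* charge the whole batch */
            }
            return scanned;
    }

    int main(void)
    {
            /* overshooting by part of a batch is fine, as with icw_scan_limit */
            return toy_scan(100) >= 100 ? 0 : 1;
    }
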
    1021             : 
    1022             : /*
    1023             :  * Return the number of reclaimable inodes in the filesystem for
    1024             :  * the shrinker to determine how much to reclaim.
    1025             :  */
    1026             : long
    1027      352633 : xfs_reclaim_inodes_count(
    1028             :         struct xfs_mount        *mp)
    1029             : {
    1030      352633 :         struct xfs_perag        *pag;
    1031      352633 :         xfs_agnumber_t          ag = 0;
    1032      352633 :         long                    reclaimable = 0;
    1033             : 
    1034      824168 :         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
    1035      471535 :                 ag = pag->pag_agno + 1;
    1036      471535 :                 reclaimable += pag->pag_ici_reclaimable;
    1037      471535 :                 xfs_perag_put(pag);
    1038             :         }
    1039      352633 :         return reclaimable;
    1040             : }
    1041             : 
    1042             : STATIC bool
    1043     2904968 : xfs_icwalk_match_id(
    1044             :         struct xfs_inode        *ip,
    1045             :         struct xfs_icwalk       *icw)
    1046             : {
    1047     2904968 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1048             :             !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1049             :                 return false;
    1050             : 
    1051     2904968 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1052             :             !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1053             :                 return false;
    1054             : 
    1055     2904968 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1056           0 :             ip->i_projid != icw->icw_prid)
    1057           0 :                 return false;
    1058             : 
    1059             :         return true;
    1060             : }
    1061             : 
    1062             : /*
    1063             :  * A union-based inode filtering algorithm. Process the inode if any of the
    1064             :  * criteria match. This is for global/internal scans only.
    1065             :  */
    1066             : STATIC bool
    1067        3660 : xfs_icwalk_match_id_union(
    1068             :         struct xfs_inode        *ip,
    1069             :         struct xfs_icwalk       *icw)
    1070             : {
    1071        3660 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1072             :             uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1073             :                 return true;
    1074             : 
    1075         884 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1076             :             gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1077             :                 return true;
    1078             : 
    1079           0 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1080           0 :             ip->i_projid == icw->icw_prid)
    1081           0 :                 return true;
    1082             : 
    1083             :         return false;
    1084             : }
    1085             : 
    1086             : /*
    1087             :  * Is this inode @ip eligible for eof/cow block reclamation, given some
    1088             :  * filtering parameters @icw?  The inode is eligible if @icw is null or
    1089             :  * if the predicate functions match.
    1090             :  */
    1091             : static bool
    1092     4647781 : xfs_icwalk_match(
    1093             :         struct xfs_inode        *ip,
    1094             :         struct xfs_icwalk       *icw)
    1095             : {
    1096     4647781 :         bool                    match;
    1097             : 
    1098     4647781 :         if (!icw)
    1099             :                 return true;
    1100             : 
    1101     2899275 :         if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
    1102        3660 :                 match = xfs_icwalk_match_id_union(ip, icw);
    1103             :         else
    1104     2895615 :                 match = xfs_icwalk_match_id(ip, icw);
    1105     2899275 :         if (!match)
    1106             :                 return false;
    1107             : 
    1108             :         /* skip the inode if the file size is too small */
    1109     2896017 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
    1110           0 :             XFS_ISIZE(ip) < icw->icw_min_file_size)
    1111           0 :                 return false;
    1112             : 
    1113             :         return true;
    1114             : }
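
The difference between the two predicates above is easy to miss: the default
filter requires every requested criterion to match, while the union filter
fires if any single criterion matches.  A minimal userspace sketch, with
hypothetical toy_* names in place of the xfs_icwalk structures:

    #include <stdbool.h>

    #define MATCH_UID (1U << 0)
    #define MATCH_GID (1U << 1)

    struct toy_filter { unsigned flags; unsigned uid, gid; };
    struct toy_inode  { unsigned uid, gid; };

    /* default mode: reject on the first requested criterion that differs */
    static bool match_all(const struct toy_inode *ip, const struct toy_filter *f)
    {
            if ((f->flags & MATCH_UID) && ip->uid != f->uid)
                    return false;
            if ((f->flags & MATCH_GID) && ip->gid != f->gid)
                    return false;
            return true;
    }

    /* union mode: accept on the first requested criterion that matches */
    static bool match_union(const struct toy_inode *ip, const struct toy_filter *f)
    {
            if ((f->flags & MATCH_UID) && ip->uid == f->uid)
                    return true;
            if ((f->flags & MATCH_GID) && ip->gid == f->gid)
                    return true;
            return false;
    }

    int main(void)
    {
            struct toy_filter f = { MATCH_UID | MATCH_GID, 100, 100 };
            struct toy_inode ip = { 100, 0 };       /* uid matches, gid doesn't */

            return (!match_all(&ip, &f) && match_union(&ip, &f)) ? 0 : 1;
    }
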
    1115             : 
    1116             : /*
    1117             :  * This is a fast pass over the inode cache to try to get reclaim moving on as
    1118             :  * many inodes as possible in a short period of time. It kicks itself every few
    1119             :  * seconds, as well as being kicked by the inode cache shrinker when memory
    1120             :  * goes low.
    1121             :  */
    1122             : void
    1123       51670 : xfs_reclaim_worker(
    1124             :         struct work_struct *work)
    1125             : {
    1126       51670 :         struct xfs_mount *mp = container_of(to_delayed_work(work),
    1127             :                                         struct xfs_mount, m_reclaim_work);
    1128             : 
    1129       51670 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
    1130       51671 :         xfs_reclaim_work_queue(mp);
    1131       51671 : }
    1132             : 
    1133             : STATIC int
    1134    36857794 : xfs_inode_free_eofblocks(
    1135             :         struct xfs_inode        *ip,
    1136             :         struct xfs_icwalk       *icw,
    1137             :         unsigned int            *lockflags)
    1138             : {
    1139    36857794 :         bool                    wait;
    1140             : 
    1141    36857794 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1142             : 
    1143    74104163 :         if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
    1144             :                 return 0;
    1145             : 
    1146             :         /*
     1147             :          * If the mapping is dirty, the operation can block and wait for some
    1148             :          * time. Unless we are waiting, skip it.
    1149             :          */
    1150     7274944 :         if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
    1151             :                 return 0;
    1152             : 
    1153     3647631 :         if (!xfs_icwalk_match(ip, icw))
    1154             :                 return 0;
    1155             : 
    1156             :         /*
    1157             :          * If the caller is waiting, return -EAGAIN to keep the background
    1158             :          * scanner moving and revisit the inode in a subsequent pass.
    1159             :          */
    1160     3647631 :         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1161     2271279 :                 if (wait)
    1162             :                         return -EAGAIN;
    1163      162522 :                 return 0;
    1164             :         }
    1165     1386110 :         *lockflags |= XFS_IOLOCK_EXCL;
    1166             : 
    1167     1386110 :         if (xfs_can_free_eofblocks(ip, false))
    1168      565001 :                 return xfs_free_eofblocks(ip);
    1169             : 
    1170             :         /* inode could be preallocated or append-only */
    1171      821093 :         trace_xfs_inode_free_eofblocks_invalid(ip);
    1172      821068 :         xfs_inode_clear_eofblocks_tag(ip);
    1173      821068 :         return 0;
    1174             : }
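
A sketch of the trylock-or-skip convention above, assuming a pthread mutex in
place of the inode IOLOCK (toy_scan_one is hypothetical).  A background scan
simply skips a busy inode; a synchronous scan reports -EAGAIN so the walker
revisits the inode on a later pass instead of blocking in the middle of a
batch:

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t toy_iolock = PTHREAD_MUTEX_INITIALIZER;

    static int toy_scan_one(bool wait)
    {
            if (pthread_mutex_trylock(&toy_iolock) != 0) {
                    if (wait)
                            return -EAGAIN; /* sync caller: revisit later */
                    return 0;               /* background: just skip it */
            }

            /* ... free post-EOF blocks here ... */

            pthread_mutex_unlock(&toy_iolock);
            return 0;
    }

    int main(void)
    {
            return toy_scan_one(false);
    }
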
    1175             : 
    1176             : static void
    1177    12037762 : xfs_blockgc_set_iflag(
    1178             :         struct xfs_inode        *ip,
    1179             :         unsigned long           iflag)
    1180             : {
    1181    12037762 :         struct xfs_mount        *mp = ip->i_mount;
    1182    12037762 :         struct xfs_perag        *pag;
    1183             : 
    1184    12037762 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1185             : 
    1186             :         /*
    1187             :          * Don't bother locking the AG and looking up in the radix trees
    1188             :          * if we already know that we have the tag set.
    1189             :          */
    1190    12037762 :         if (ip->i_flags & iflag)
    1191             :                 return;
    1192     5861644 :         spin_lock(&ip->i_flags_lock);
    1193     5872753 :         ip->i_flags |= iflag;
    1194     5872753 :         spin_unlock(&ip->i_flags_lock);
    1195             : 
    1196     5871286 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1197     5881541 :         spin_lock(&pag->pag_ici_lock);
    1198             : 
    1199     5882656 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1200             :                         XFS_ICI_BLOCKGC_TAG);
    1201             : 
    1202     5872430 :         spin_unlock(&pag->pag_ici_lock);
    1203     5868504 :         xfs_perag_put(pag);
    1204             : }
    1205             : 
    1206             : void
    1207     6140946 : xfs_inode_set_eofblocks_tag(
    1208             :         xfs_inode_t     *ip)
    1209             : {
    1210     6140946 :         trace_xfs_inode_set_eofblocks_tag(ip);
    1211     6140616 :         return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
    1212             : }
    1213             : 
    1214             : static void
    1215    28529382 : xfs_blockgc_clear_iflag(
    1216             :         struct xfs_inode        *ip,
    1217             :         unsigned long           iflag)
    1218             : {
    1219    28529382 :         struct xfs_mount        *mp = ip->i_mount;
    1220    28529382 :         struct xfs_perag        *pag;
    1221    28529382 :         bool                    clear_tag;
    1222             : 
    1223    28529382 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1224             : 
    1225    28529382 :         spin_lock(&ip->i_flags_lock);
    1226    28597156 :         ip->i_flags &= ~iflag;
    1227    28597156 :         clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
    1228    28597156 :         spin_unlock(&ip->i_flags_lock);
    1229             : 
    1230    28588491 :         if (!clear_tag)
    1231             :                 return;
    1232             : 
    1233    22083492 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1234    22090968 :         spin_lock(&pag->pag_ici_lock);
    1235             : 
    1236    22097239 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1237             :                         XFS_ICI_BLOCKGC_TAG);
    1238             : 
    1239    22082696 :         spin_unlock(&pag->pag_ici_lock);
    1240    22104418 :         xfs_perag_put(pag);
    1241             : }
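
The two helpers above pair a per-inode flag with a shared per-AG radix tree
tag, and the tag may only be dropped once *both* blockgc flags are gone, so
clearing the eofblocks reason never knocks away the tag a cowblocks reason
still needs.  A userspace sketch of that invariant, with a pthread mutex
standing in for i_flags_lock and a plain bool for the tag (toy_* names are
hypothetical):

    #include <pthread.h>
    #include <stdbool.h>

    #define TOY_EOFBLOCKS (1U << 0)
    #define TOY_COWBLOCKS (1U << 1)

    static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned iflags;
    static bool ag_tagged;          /* stands in for XFS_ICI_BLOCKGC_TAG */

    static void toy_clear_iflag(unsigned iflag)
    {
            bool clear_tag;

            pthread_mutex_lock(&flags_lock);
            iflags &= ~iflag;
            clear_tag = (iflags & (TOY_EOFBLOCKS | TOY_COWBLOCKS)) == 0;
            pthread_mutex_unlock(&flags_lock);

            if (clear_tag)
                    ag_tagged = false;      /* last blockgc reason is gone */
    }

    int main(void)
    {
            iflags = TOY_EOFBLOCKS | TOY_COWBLOCKS;
            ag_tagged = true;
            toy_clear_iflag(TOY_EOFBLOCKS); /* tag stays: cowblocks remain */
            toy_clear_iflag(TOY_COWBLOCKS); /* now the tag drops too */
            return ag_tagged ? 1 : 0;
    }
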
    1242             : 
    1243             : void
    1244    20812597 : xfs_inode_clear_eofblocks_tag(
    1245             :         xfs_inode_t     *ip)
    1246             : {
    1247    20812597 :         trace_xfs_inode_clear_eofblocks_tag(ip);
    1248    20761080 :         return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
    1249             : }
    1250             : 
    1251             : /*
    1252             :  * Set ourselves up to free CoW blocks from this file.  If it's already clean
    1253             :  * then we can bail out quickly, but otherwise we must back off if the file
    1254             :  * is undergoing some kind of write.
    1255             :  */
    1256             : static bool
    1257    30198104 : xfs_prep_free_cowblocks(
    1258             :         struct xfs_inode        *ip)
    1259             : {
    1260             :         /*
    1261             :          * Just clear the tag if we have an empty cow fork or none at all. It's
    1262             :          * possible the inode was fully unshared since it was originally tagged.
    1263             :          */
    1264    60396208 :         if (!xfs_inode_has_cow_data(ip)) {
    1265      281261 :                 trace_xfs_inode_free_cowblocks_invalid(ip);
    1266      280952 :                 xfs_inode_clear_cowblocks_tag(ip);
    1267      280952 :                 return false;
    1268             :         }
    1269             : 
    1270             :         /*
    1271             :          * If the mapping is dirty or under writeback we cannot touch the
    1272             :          * CoW fork.  Leave it alone if we're in the midst of a directio.
    1273             :          */
    1274    31886520 :         if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
    1275     3938509 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
    1276     3218951 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
    1277             :             atomic_read(&VFS_I(ip)->i_dio_count))
    1278    28668242 :                 return false;
    1279             : 
    1280             :         return true;
    1281             : }
    1282             : 
    1283             : /*
    1284             :  * Automatic CoW Reservation Freeing
    1285             :  *
    1286             :  * These functions automatically garbage collect leftover CoW reservations
    1287             :  * that were made on behalf of a cowextsize hint when we start to run out
    1288             :  * of quota or when the reservations sit around for too long.  If the file
    1289             :  * has dirty pages or is undergoing writeback, its CoW reservations will
    1290             :  * be retained.
    1291             :  *
    1292             :  * The actual garbage collection piggybacks off the same code that runs
    1293             :  * the speculative EOF preallocation garbage collector.
    1294             :  */
    1295             : STATIC int
    1296    34955332 : xfs_inode_free_cowblocks(
    1297             :         struct xfs_inode        *ip,
    1298             :         struct xfs_icwalk       *icw,
    1299             :         unsigned int            *lockflags)
    1300             : {
    1301    34955332 :         bool                    wait;
    1302    34955332 :         int                     ret = 0;
    1303             : 
    1304    34955332 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1305             : 
    1306    70199669 :         if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
    1307             :                 return 0;
    1308             : 
    1309    30012958 :         if (!xfs_prep_free_cowblocks(ip))
    1310             :                 return 0;
    1311             : 
    1312      992397 :         if (!xfs_icwalk_match(ip, icw))
    1313             :                 return 0;
    1314             : 
    1315             :         /*
    1316             :          * If the caller is waiting, return -EAGAIN to keep the background
    1317             :          * scanner moving and revisit the inode in a subsequent pass.
    1318             :          */
    1319     1934774 :         if (!(*lockflags & XFS_IOLOCK_EXCL) &&
    1320      940664 :             !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1321      737818 :                 if (wait)
    1322             :                         return -EAGAIN;
    1323       87637 :                 return 0;
    1324             :         }
    1325      256292 :         *lockflags |= XFS_IOLOCK_EXCL;
    1326             : 
    1327      256292 :         if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
    1328          74 :                 if (wait)
    1329             :                         return -EAGAIN;
    1330          46 :                 return 0;
    1331             :         }
    1332      256575 :         *lockflags |= XFS_MMAPLOCK_EXCL;
    1333             : 
    1334             :         /*
     1335             :          * Check again; nobody else should be able to dirty blocks or change
     1336             :          * the reflink iflag now that we hold the first two locks.
    1337             :          */
    1338      256575 :         if (xfs_prep_free_cowblocks(ip))
    1339      256561 :                 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
    1340             :         return ret;
    1341             : }
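
Note the shape of the function above: one unlocked xfs_prep_free_cowblocks()
call as a cheap, racy early-out, then the same check again once the locks are
held and the answer can no longer change underneath us.  A userspace sketch of
that check/lock/re-check pattern (toy_* names hypothetical; one pthread mutex
plays the part of the IOLOCK/MMAPLOCK pair):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool has_cow_data = true;

    static bool toy_prep(void)
    {
            return has_cow_data;    /* racy hint until the lock is held */
    }

    static int toy_free_cowblocks(void)
    {
            if (!toy_prep())                /* cheap unlocked early-out */
                    return 0;
            if (pthread_mutex_trylock(&toy_lock) != 0)
                    return 0;               /* busy: leave it for the next scan */
            if (toy_prep())                 /* authoritative under the lock */
                    has_cow_data = false;   /* ... cancel the CoW range ... */
            pthread_mutex_unlock(&toy_lock);
            return 0;
    }

    int main(void)
    {
            return toy_free_cowblocks();
    }
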
    1342             : 
    1343             : void
    1344     5901610 : xfs_inode_set_cowblocks_tag(
    1345             :         xfs_inode_t     *ip)
    1346             : {
    1347     5901610 :         trace_xfs_inode_set_cowblocks_tag(ip);
    1348     5897391 :         return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
    1349             : }
    1350             : 
    1351             : void
    1352     7753945 : xfs_inode_clear_cowblocks_tag(
    1353             :         xfs_inode_t     *ip)
    1354             : {
    1355     7753945 :         trace_xfs_inode_clear_cowblocks_tag(ip);
    1356     7751253 :         return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
    1357             : }
    1358             : 
    1359             : /* Disable post-EOF and CoW block auto-reclamation. */
    1360             : void
    1361      117645 : xfs_blockgc_stop(
    1362             :         struct xfs_mount        *mp)
    1363             : {
    1364      117645 :         struct xfs_perag        *pag;
    1365      117645 :         xfs_agnumber_t          agno;
    1366             : 
    1367      117645 :         if (!xfs_clear_blockgc_enabled(mp))
    1368          71 :                 return;
    1369             : 
    1370      822918 :         for_each_perag(mp, agno, pag)
    1371      705344 :                 cancel_delayed_work_sync(&pag->pag_blockgc_work);
    1372      117574 :         trace_xfs_blockgc_stop(mp, __return_address);
    1373             : }
    1374             : 
    1375             : /* Enable post-EOF and CoW block auto-reclamation. */
    1376             : void
    1377      117735 : xfs_blockgc_start(
    1378             :         struct xfs_mount        *mp)
    1379             : {
    1380      117735 :         struct xfs_perag        *pag;
    1381      117735 :         xfs_agnumber_t          agno;
    1382             : 
    1383      117735 :         if (xfs_set_blockgc_enabled(mp))
    1384             :                 return;
    1385             : 
    1386      117713 :         trace_xfs_blockgc_start(mp, __return_address);
    1387      230367 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1388      112654 :                 xfs_blockgc_queue(pag);
    1389             : }
    1390             : 
    1391             : /* Don't try to run block gc on an inode that's in any of these states. */
    1392             : #define XFS_BLOCKGC_NOGRAB_IFLAGS       (XFS_INEW | \
    1393             :                                          XFS_NEED_INACTIVE | \
    1394             :                                          XFS_INACTIVATING | \
    1395             :                                          XFS_IRECLAIMABLE | \
    1396             :                                          XFS_IRECLAIM)
    1397             : /*
    1398             :  * Decide if the given @ip is eligible for garbage collection of speculative
    1399             :  * preallocations, and grab it if so.  Returns true if it's ready to go or
    1400             :  * false if we should just ignore it.
    1401             :  */
    1402             : static bool
    1403    37251879 : xfs_blockgc_igrab(
    1404             :         struct xfs_inode        *ip)
    1405             : {
    1406    37251879 :         struct inode            *inode = VFS_I(ip);
    1407             : 
    1408    37251879 :         ASSERT(rcu_read_lock_held());
    1409             : 
    1410             :         /* Check for stale RCU freed inode */
    1411    37251879 :         spin_lock(&ip->i_flags_lock);
    1412    37671780 :         if (!ip->i_ino)
    1413           0 :                 goto out_unlock_noent;
    1414             : 
    1415    37671780 :         if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
    1416      408920 :                 goto out_unlock_noent;
    1417    37262860 :         spin_unlock(&ip->i_flags_lock);
    1418             : 
    1419             :         /* nothing to sync during shutdown */
    1420    74302764 :         if (xfs_is_shutdown(ip->i_mount))
    1421             :                 return false;
    1422             : 
     1423             :         /* If we can't grab the inode, it must be on its way to reclaim. */
    1424    37132872 :         if (!igrab(inode))
    1425        3897 :                 return false;
    1426             : 
    1427             :         /* inode is valid */
    1428             :         return true;
    1429             : 
    1430      408920 : out_unlock_noent:
    1431      408920 :         spin_unlock(&ip->i_flags_lock);
    1432      408920 :         return false;
    1433             : }
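
The igrab() step above succeeds only while the VFS still holds a live
reference; once the count has reached zero the inode belongs to reclaim and
must never be resurrected.  A sketch of that increment-if-not-zero rule with a
C11 atomic (toy_igrab is a hypothetical stand-in, not the VFS helper):

    #include <stdatomic.h>
    #include <stdbool.h>

    static bool toy_igrab(atomic_uint *refcount)
    {
            unsigned old = atomic_load(refcount);

            do {
                    if (old == 0)
                            return false;   /* already on its way to reclaim */
            } while (!atomic_compare_exchange_weak(refcount, &old, old + 1));
            return true;                    /* we now hold a reference */
    }

    int main(void)
    {
            atomic_uint live = 1, dying = 0;

            return (toy_igrab(&live) && !toy_igrab(&dying)) ? 0 : 1;
    }
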
    1434             : 
    1435             : /* Scan one incore inode for block preallocations that we can remove. */
    1436             : static int
    1437    36906778 : xfs_blockgc_scan_inode(
    1438             :         struct xfs_inode        *ip,
    1439             :         struct xfs_icwalk       *icw)
    1440             : {
    1441    36906778 :         unsigned int            lockflags = 0;
    1442    36906778 :         int                     error;
    1443             : 
    1444    36906778 :         error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
    1445    37156626 :         if (error)
    1446     2103719 :                 goto unlock;
    1447             : 
    1448    35052907 :         error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
    1449    37126915 : unlock:
    1450    37126915 :         if (lockflags)
    1451     1591206 :                 xfs_iunlock(ip, lockflags);
    1452    37126610 :         xfs_irele(ip);
    1453    37128487 :         return error;
    1454             : }
    1455             : 
    1456             : /* Background worker that trims preallocated space. */
    1457             : void
    1458     3247245 : xfs_blockgc_worker(
    1459             :         struct work_struct      *work)
    1460             : {
    1461     3247245 :         struct xfs_perag        *pag = container_of(to_delayed_work(work),
    1462             :                                         struct xfs_perag, pag_blockgc_work);
    1463     3247245 :         struct xfs_mount        *mp = pag->pag_mount;
    1464     3247245 :         int                     error;
    1465             : 
    1466     3247245 :         trace_xfs_blockgc_worker(mp, __return_address);
    1467             : 
    1468     3247238 :         error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
    1469     3246569 :         if (error)
    1470           0 :                 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
    1471             :                                 pag->pag_agno, error);
    1472     3246569 :         xfs_blockgc_queue(pag);
    1473     3246327 : }
    1474             : 
    1475             : /*
    1476             :  * Try to free space in the filesystem by purging inactive inodes, eofblocks
    1477             :  * and cowblocks.
    1478             :  */
    1479             : int
    1480     1141384 : xfs_blockgc_free_space(
    1481             :         struct xfs_mount        *mp,
    1482             :         struct xfs_icwalk       *icw)
    1483             : {
    1484     1141384 :         int                     error;
    1485             : 
    1486     1141384 :         trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
    1487             : 
    1488     1140984 :         error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
    1489     1140398 :         if (error)
    1490             :                 return error;
    1491             : 
    1492     1140348 :         return xfs_inodegc_flush(mp);
    1493             : }
    1494             : 
    1495             : /*
    1496             :  * Reclaim all the free space that we can by scheduling the background blockgc
    1497             :  * and inodegc workers immediately and waiting for them all to clear.
    1498             :  */
    1499             : int
    1500     7564071 : xfs_blockgc_flush_all(
    1501             :         struct xfs_mount        *mp)
    1502             : {
    1503     7564071 :         struct xfs_perag        *pag;
    1504     7564071 :         xfs_agnumber_t          agno;
    1505             : 
    1506     7564071 :         trace_xfs_blockgc_flush_all(mp, __return_address);
    1507             : 
    1508             :         /*
    1509             :          * For each blockgc worker, move its queue time up to now.  If it
    1510             :          * wasn't queued, it will not be requeued.  Then flush whatever's
    1511             :          * left.
    1512             :          */
    1513    10448442 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1514     2896256 :                 mod_delayed_work(pag->pag_mount->m_blockgc_wq,
    1515             :                                 &pag->pag_blockgc_work, 0);
    1516             : 
    1517    10428546 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1518     2869024 :                 flush_delayed_work(&pag->pag_blockgc_work);
    1519             : 
    1520     7562779 :         return xfs_inodegc_flush(mp);
    1521             : }
    1522             : 
    1523             : /*
    1524             :  * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
    1525             :  * quota caused an allocation failure, so we make a best effort by including
    1526             :  * each quota under low free space conditions (less than 1% free space) in the
    1527             :  * scan.
    1528             :  *
    1529             :  * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
    1530             :  * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
    1531             :  * MMAPLOCK.
    1532             :  */
    1533             : int
    1534       22614 : xfs_blockgc_free_dquots(
    1535             :         struct xfs_mount        *mp,
    1536             :         struct xfs_dquot        *udqp,
    1537             :         struct xfs_dquot        *gdqp,
    1538             :         struct xfs_dquot        *pdqp,
    1539             :         unsigned int            iwalk_flags)
    1540             : {
    1541       22614 :         struct xfs_icwalk       icw = {0};
    1542       22614 :         bool                    do_work = false;
    1543             : 
    1544       22614 :         if (!udqp && !gdqp && !pdqp)
    1545             :                 return 0;
    1546             : 
    1547             :         /*
    1548             :          * Run a scan to free blocks using the union filter to cover all
    1549             :          * applicable quotas in a single scan.
    1550             :          */
    1551       22614 :         icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
    1552             : 
    1553       22614 :         if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
    1554       19419 :                 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
    1555       19419 :                 icw.icw_flags |= XFS_ICWALK_FLAG_UID;
    1556       19419 :                 do_work = true;
    1557             :         }
    1558             : 
     1559       22614 :         if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
    1560       21881 :                 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
    1561       21881 :                 icw.icw_flags |= XFS_ICWALK_FLAG_GID;
    1562       21881 :                 do_work = true;
    1563             :         }
    1564             : 
    1565       22614 :         if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
    1566       19041 :                 icw.icw_prid = pdqp->q_id;
    1567       19041 :                 icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
    1568       19041 :                 do_work = true;
    1569             :         }
    1570             : 
    1571       22614 :         if (!do_work)
    1572             :                 return 0;
    1573             : 
    1574       22199 :         return xfs_blockgc_free_space(mp, &icw);
    1575             : }
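
The function above arms one filter criterion per low-space quota and skips the
walk entirely when nothing is near a limit.  A compressed userspace sketch of
that construction, with hypothetical toy_* names and flag bits in place of the
XFS_ICWALK_FLAG_* values:

    #include <stdbool.h>

    #define F_UNION (1U << 0)
    #define F_UID   (1U << 1)
    #define F_GID   (1U << 2)

    struct toy_filter { unsigned flags; unsigned uid, gid; };

    static int toy_free_dquots(bool uid_low, unsigned uid,
                               bool gid_low, unsigned gid)
    {
            struct toy_filter f = { .flags = F_UNION };
            bool do_work = false;

            if (uid_low) {                  /* user quota near its limit */
                    f.uid = uid;
                    f.flags |= F_UID;
                    do_work = true;
            }
            if (gid_low) {                  /* group quota near its limit */
                    f.gid = gid;
                    f.flags |= F_GID;
                    do_work = true;
            }
            if (!do_work)
                    return 0;               /* no quota is low: skip the scan */
            /* ... run the blockgc walk with &f ... */
            return 1;
    }

    int main(void)
    {
            return toy_free_dquots(true, 100, false, 0) == 1 ? 0 : 1;
    }
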
    1576             : 
    1577             : /* Run cow/eofblocks scans on the quotas attached to the inode. */
    1578             : int
    1579       15371 : xfs_blockgc_free_quota(
    1580             :         struct xfs_inode        *ip,
    1581             :         unsigned int            iwalk_flags)
    1582             : {
    1583       15371 :         return xfs_blockgc_free_dquots(ip->i_mount,
    1584             :                         xfs_inode_dquot(ip, XFS_DQTYPE_USER),
    1585             :                         xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
    1586             :                         xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
    1587             : }
    1588             : 
    1589             : /* XFS Inode Cache Walking Code */
    1590             : 
    1591             : /*
    1592             :  * The inode lookup is done in batches to keep the amount of lock traffic and
     1593             :  * radix tree lookups to a minimum. The batch size is a trade-off between
    1594             :  * lookup reduction and stack usage. This is in the reclaim path, so we can't
    1595             :  * be too greedy.
    1596             :  */
    1597             : #define XFS_LOOKUP_BATCH        32
    1598             : 
    1599             : 
    1600             : /*
    1601             :  * Decide if we want to grab this inode in anticipation of doing work towards
    1602             :  * the goal.
    1603             :  */
    1604             : static inline bool
    1605   534215600 : xfs_icwalk_igrab(
    1606             :         enum xfs_icwalk_goal    goal,
    1607             :         struct xfs_inode        *ip,
    1608             :         struct xfs_icwalk       *icw)
    1609             : {
    1610   534215600 :         switch (goal) {
    1611    37303961 :         case XFS_ICWALK_BLOCKGC:
    1612    37303961 :                 return xfs_blockgc_igrab(ip);
    1613   496911639 :         case XFS_ICWALK_RECLAIM:
    1614   496911639 :                 return xfs_reclaim_igrab(ip, icw);
    1615             :         default:
    1616             :                 return false;
    1617             :         }
    1618             : }
    1619             : 
    1620             : /*
    1621             :  * Process an inode.  Each processing function must handle any state changes
    1622             :  * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
    1623             :  */
    1624             : static inline int
    1625   533859907 : xfs_icwalk_process_inode(
    1626             :         enum xfs_icwalk_goal    goal,
    1627             :         struct xfs_inode        *ip,
    1628             :         struct xfs_perag        *pag,
    1629             :         struct xfs_icwalk       *icw)
    1630             : {
    1631   533859907 :         int                     error = 0;
    1632             : 
    1633   533859907 :         switch (goal) {
    1634    36956191 :         case XFS_ICWALK_BLOCKGC:
    1635    36956191 :                 error = xfs_blockgc_scan_inode(ip, icw);
    1636    36956191 :                 break;
    1637   496903716 :         case XFS_ICWALK_RECLAIM:
    1638   496903716 :                 xfs_reclaim_inode(ip, pag);
    1639   496903716 :                 break;
    1640             :         }
    1641   533976351 :         return error;
    1642             : }
    1643             : 
    1644             : /*
    1645             :  * For a given per-AG structure @pag and a goal, grab qualifying inodes and
    1646             :  * process them in some manner.
    1647             :  */
    1648             : static int
    1649     4400442 : xfs_icwalk_ag(
    1650             :         struct xfs_perag        *pag,
    1651             :         enum xfs_icwalk_goal    goal,
    1652             :         struct xfs_icwalk       *icw)
    1653             : {
    1654     4400442 :         struct xfs_mount        *mp = pag->pag_mount;
    1655     4400442 :         uint32_t                first_index;
    1656     4400442 :         int                     last_error = 0;
    1657     6227538 :         int                     skipped;
    1658     6227538 :         bool                    done;
    1659     6227538 :         int                     nr_found;
    1660             : 
    1661     6227538 : restart:
    1662     6227538 :         done = false;
    1663     6227538 :         skipped = 0;
    1664     6227538 :         if (goal == XFS_ICWALK_RECLAIM)
    1665      663498 :                 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
    1666             :         else
    1667             :                 first_index = 0;
    1668             :         nr_found = 0;
    1669    28119261 :         do {
    1670    28119261 :                 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
    1671    28119261 :                 int             error = 0;
    1672    28119261 :                 int             i;
    1673             : 
    1674    28119261 :                 rcu_read_lock();
    1675             : 
    1676    28071364 :                 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
    1677             :                                 (void **) batch, first_index,
    1678             :                                 XFS_LOOKUP_BATCH, goal);
    1679    28113039 :                 if (!nr_found) {
    1680     6072007 :                         done = true;
    1681     6072007 :                         rcu_read_unlock();
    1682     6239181 :                         break;
    1683             :                 }
    1684             : 
    1685             :                 /*
     1686             :                  * Grab the inodes before we drop the lock. If we found
     1687             :                  * nothing, nr_found == 0 and the loop will be skipped.
    1688             :                  */
    1689   556396445 :                 for (i = 0; i < nr_found; i++) {
    1690   534329497 :                         struct xfs_inode *ip = batch[i];
    1691             : 
    1692   534292811 :                         if (done || !xfs_icwalk_igrab(goal, ip, icw))
    1693      426385 :                                 batch[i] = NULL;
    1694             : 
    1695             :                         /*
    1696             :                          * Update the index for the next lookup. Catch
    1697             :                          * overflows into the next AG range which can occur if
    1698             :                          * we have inodes in the last block of the AG and we
    1699             :                          * are currently pointing to the last inode.
    1700             :                          *
    1701             :                          * Because we may see inodes that are from the wrong AG
    1702             :                          * due to RCU freeing and reallocation, only update the
     1703             :                          * index if it lies in this AG. It was a race that led
    1704             :                          * us to see this inode, so another lookup from the
    1705             :                          * same index will not find it again.
    1706             :                          */
    1707   534355413 :                         if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
    1708         266 :                                 continue;
    1709   534355147 :                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
    1710   534355147 :                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
    1711           5 :                                 done = true;
    1712             :                 }
    1713             : 
    1714             :                 /* unlock now we've grabbed the inodes. */
    1715    22066948 :                 rcu_read_unlock();
    1716             : 
    1717   578547800 :                 for (i = 0; i < nr_found; i++) {
    1718   534439295 :                         if (!batch[i])
    1719      422676 :                                 continue;
    1720   533952888 :                         error = xfs_icwalk_process_inode(goal, batch[i], pag,
    1721             :                                         icw);
    1722   533992086 :                         if (error == -EAGAIN) {
    1723     2725259 :                                 skipped++;
    1724     2725259 :                                 continue;
    1725             :                         }
    1726   531266827 :                         if (error && last_error != -EFSCORRUPTED)
    1727           0 :                                 last_error = error;
    1728             :                 }
    1729             : 
    1730             :                 /* bail out if the filesystem is corrupted.  */
    1731    22041557 :                 if (error == -EFSCORRUPTED)
    1732             :                         break;
    1733             : 
    1734    22041557 :                 cond_resched();
    1735             : 
    1736    22046373 :                 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
    1737      504518 :                         icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
    1738      504518 :                         if (icw->icw_scan_limit <= 0)
    1739             :                                 break;
    1740             :                 }
    1741    21891728 :         } while (nr_found && !done);
    1742             : 
    1743     6239186 :         if (goal == XFS_ICWALK_RECLAIM) {
    1744      663500 :                 if (done)
    1745      508855 :                         first_index = 0;
    1746      663500 :                 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
    1747             :         }
    1748             : 
    1749     6239186 :         if (skipped) {
    1750     1840168 :                 delay(1);
    1751     1827096 :                 goto restart;
    1752             :         }
    1753     4399020 :         return last_error;
    1754             : }
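
The core of the walk above is a cursor-driven gang lookup: collect up to one
batch of tagged entries, advance the cursor past each entry seen, and stop when
a lookup comes back empty.  A userspace sketch with a plain array standing in
for the per-AG radix tree (toy_* names are hypothetical; RCU, the AG-overflow
check, and the skipped/restart logic are deliberately omitted):

    #include <stdbool.h>

    #define NR_SLOTS     256
    #define LOOKUP_BATCH 32

    static bool tagged[NR_SLOTS];   /* stands in for the radix tree tag */

    /* Collect up to 'batch' tagged indices at or after 'first'. */
    static int toy_gang_lookup_tag(unsigned first, unsigned *out, int batch)
    {
            int nr = 0;

            for (unsigned i = first; i < NR_SLOTS && nr < batch; i++)
                    if (tagged[i])
                            out[nr++] = i;
            return nr;
    }

    static void toy_walk(void)
    {
            unsigned batch[LOOKUP_BATCH];
            unsigned first = 0;
            int nr;

            do {
                    nr = toy_gang_lookup_tag(first, batch, LOOKUP_BATCH);
                    for (int i = 0; i < nr; i++) {
                            first = batch[i] + 1;     /* advance the cursor */
                            tagged[batch[i]] = false; /* "process" the entry */
                    }
            } while (nr);           /* an empty lookup means we're done */
    }

    int main(void)
    {
            tagged[3] = tagged[40] = tagged[200] = true;
            toy_walk();
            return tagged[200] ? 1 : 0;
    }
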
    1755             : 
    1756             : /* Walk all incore inodes to achieve a given goal. */
    1757             : static int
    1758     1553257 : xfs_icwalk(
    1759             :         struct xfs_mount        *mp,
    1760             :         enum xfs_icwalk_goal    goal,
    1761             :         struct xfs_icwalk       *icw)
    1762             : {
    1763     1553257 :         struct xfs_perag        *pag;
    1764     1553257 :         int                     error = 0;
    1765     1553257 :         int                     last_error = 0;
    1766     1553257 :         xfs_agnumber_t          agno;
    1767             : 
    1768     2706121 :         for_each_perag_tag(mp, agno, pag, goal) {
    1769     1153638 :                 error = xfs_icwalk_ag(pag, goal, icw);
    1770     1152864 :                 if (error) {
    1771           0 :                         last_error = error;
    1772           0 :                         if (error == -EFSCORRUPTED) {
    1773           0 :                                 xfs_perag_rele(pag);
    1774           0 :                                 break;
    1775             :                         }
    1776             :                 }
    1777             :         }
    1778     1553137 :         return last_error;
     1779             :         BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); /* compile-time check; intentionally after the return */
    1780             : }
    1781             : 
    1782             : #ifdef DEBUG
    1783             : static void
    1784           0 : xfs_check_delalloc(
    1785             :         struct xfs_inode        *ip,
    1786             :         int                     whichfork)
    1787             : {
    1788           0 :         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
    1789           0 :         struct xfs_bmbt_irec    got;
    1790           0 :         struct xfs_iext_cursor  icur;
    1791             : 
    1792           0 :         if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
    1793           0 :                 return;
    1794           0 :         do {
    1795           0 :                 if (isnullstartblock(got.br_startblock)) {
    1796           0 :                         xfs_warn(ip->i_mount,
    1797             :         "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
    1798             :                                 ip->i_ino,
    1799             :                                 whichfork == XFS_DATA_FORK ? "data" : "cow",
    1800             :                                 got.br_startoff, got.br_blockcount);
    1801             :                 }
    1802           0 :         } while (xfs_iext_next_extent(ifp, &icur, &got));
    1803             : }
    1804             : #else
    1805             : #define xfs_check_delalloc(ip, whichfork)       do { } while (0)
    1806             : #endif
    1807             : 
    1808             : /* Schedule the inode for reclaim. */
    1809             : static void
    1810  1115532990 : xfs_inodegc_set_reclaimable(
    1811             :         struct xfs_inode        *ip)
    1812             : {
    1813  1115532990 :         struct xfs_mount        *mp = ip->i_mount;
    1814  1115532990 :         struct xfs_perag        *pag;
    1815             : 
    1816  2231065980 :         if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
    1817           0 :                 xfs_check_delalloc(ip, XFS_DATA_FORK);
    1818           0 :                 xfs_check_delalloc(ip, XFS_COW_FORK);
    1819           0 :                 ASSERT(0);
    1820             :         }
    1821             : 
    1822  1115532990 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1823  1115896415 :         spin_lock(&pag->pag_ici_lock);
    1824  1116009202 :         spin_lock(&ip->i_flags_lock);
    1825             : 
    1826  2232058758 :         if (!xfs_is_shutdown(pag->pag_mount)) {
    1827             :                 /* had better not be on any unlinked list! */
    1828   756464847 :                 ASSERT(!xfs_inode_on_unlinked_list(ip));
    1829   756464847 :                 if (xfs_inode_on_unlinked_list(ip))
    1830           6 :                         xfs_emerg(pag->pag_mount, "IUNLINK mark reclaim ino 0x%llx nlink %u mode 0o%o prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, VFS_I(ip)->i_mode, ip->i_prev_unlinked, ip->i_next_unlinked);
    1831             :         }
    1832             : 
    1833  1116029379 :         trace_xfs_inode_set_reclaimable(ip);
    1834  1115792658 :         ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
    1835  1115792658 :         ip->i_flags |= XFS_IRECLAIMABLE;
    1836  1115792658 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1837             :                         XFS_ICI_RECLAIM_TAG);
    1838             : 
    1839  1115773620 :         spin_unlock(&ip->i_flags_lock);
    1840  1115984705 :         spin_unlock(&pag->pag_ici_lock);
    1841  1115975608 :         xfs_perag_put(pag);
    1842  1115979505 : }
    1843             : 
    1844             : /*
    1845             :  * Free all speculative preallocations and possibly even the inode itself.
    1846             :  * This is the last chance to make changes to an otherwise unreferenced file
    1847             :  * before incore reclamation happens.
    1848             :  */
    1849             : static int
    1850    86064500 : xfs_inodegc_inactivate(
    1851             :         struct xfs_inode        *ip)
    1852             : {
    1853    86064500 :         int                     error;
    1854             : 
    1855    86064500 :         trace_xfs_inode_inactivating(ip);
    1856    85968784 :         error = xfs_inactive(ip);
    1857    86150680 :         xfs_inodegc_set_reclaimable(ip);
    1858    86156144 :         return error;
    1859             : 
    1860             : }
    1861             : 
    1862             : void
    1863    11088406 : xfs_inodegc_worker(
    1864             :         struct work_struct      *work)
    1865             : {
    1866    11088406 :         struct xfs_inodegc      *gc = container_of(to_delayed_work(work),
    1867             :                                                 struct xfs_inodegc, work);
    1868    11088406 :         struct llist_node       *node = llist_del_all(&gc->list);
    1869    11111766 :         struct xfs_inode        *ip, *n;
    1870    11111766 :         unsigned int            nofs_flag;
    1871             : 
    1872    11111766 :         ASSERT(gc->cpu == smp_processor_id());
    1873             : 
    1874    11109034 :         WRITE_ONCE(gc->items, 0);
    1875             : 
    1876    11109034 :         if (!node)
    1877             :                 return;
    1878             : 
    1879             :         /*
    1880             :          * We can allocate memory here while doing writeback on behalf of
     1881             :          * memory reclaim.  To avoid memory allocation deadlocks, set the
    1882             :          * task-wide nofs context for the following operations.
    1883             :          */
    1884    11106108 :         nofs_flag = memalloc_nofs_save();
    1885             : 
    1886    11106108 :         ip = llist_entry(node, struct xfs_inode, i_gclist);
    1887    11106108 :         trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
    1888             : 
    1889    11098687 :         WRITE_ONCE(gc->shrinker_hits, 0);
    1890    97244270 :         llist_for_each_entry_safe(ip, n, node, i_gclist) {
    1891    86109890 :                 int     error;
    1892             : 
    1893    86109890 :                 xfs_iflags_set(ip, XFS_INACTIVATING);
    1894    86066061 :                 error = xfs_inodegc_inactivate(ip);
    1895    86145583 :                 if (error && !gc->error)
    1896        2505 :                         gc->error = error;
    1897             :         }
    1898             : 
    1899    11134380 :         memalloc_nofs_restore(nofs_flag);
    1900             : }
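
The llist_add()/llist_del_all() handoff feeding this worker can be modeled
with one C11 atomic head pointer: producers push with a compare-and-swap, and
the worker detaches the entire list in a single exchange, after which it walks
its private snapshot with no lock held.  A hypothetical toy_* sketch, not the
kernel's llist implementation:

    #include <stdatomic.h>
    #include <stddef.h>

    struct toy_node { struct toy_node *next; };

    static _Atomic(struct toy_node *) toy_head;

    static void toy_llist_add(struct toy_node *n)
    {
            struct toy_node *old = atomic_load(&toy_head);

            do {
                    n->next = old;          /* link ahead of the current head */
            } while (!atomic_compare_exchange_weak(&toy_head, &old, n));
    }

    static struct toy_node *toy_llist_del_all(void)
    {
            /* one atomic swap empties the list for everyone else */
            return atomic_exchange(&toy_head, NULL);
    }

    int main(void)
    {
            struct toy_node a = { NULL }, b = { NULL };
            int seen = 0;

            toy_llist_add(&a);
            toy_llist_add(&b);
            for (struct toy_node *n = toy_llist_del_all(); n; n = n->next)
                    seen++;                 /* LIFO order: b, then a */
            return seen == 2 ? 0 : 1;
    }
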
    1901             : 
    1902             : /*
    1903             :  * Expedite all pending inodegc work to run immediately. This does not wait for
    1904             :  * completion of the work.
    1905             :  */
    1906             : void
    1907    34532464 : xfs_inodegc_push(
    1908             :         struct xfs_mount        *mp)
    1909             : {
    1910    69064928 :         if (!xfs_is_inodegc_enabled(mp))
    1911             :                 return;
    1912    34441950 :         trace_xfs_inodegc_push(mp, __return_address);
    1913    34415726 :         xfs_inodegc_queue_all(mp);
    1914             : }
    1915             : 
    1916             : /*
    1917             :  * Force all currently queued inode inactivation work to run immediately and
    1918             :  * wait for the work to finish.
    1919             :  */
    1920             : int
    1921    11429681 : xfs_inodegc_flush(
    1922             :         struct xfs_mount        *mp)
    1923             : {
    1924    11429681 :         xfs_inodegc_push(mp);
    1925    11437626 :         trace_xfs_inodegc_flush(mp, __return_address);
    1926    11430346 :         return xfs_inodegc_wait_all(mp);
    1927             : }
    1928             : 
    1929             : /*
    1930             :  * Flush all the pending work and then disable the inode inactivation background
    1931             :  * workers and wait for them to stop.  Caller must hold sb->s_umount to
    1932             :  * coordinate changes in the inodegc_enabled state.
    1933             :  */
    1934             : void
    1935      117774 : xfs_inodegc_stop(
    1936             :         struct xfs_mount        *mp)
    1937             : {
    1938      117774 :         bool                    rerun;
    1939             : 
    1940      117774 :         if (!xfs_clear_inodegc_enabled(mp))
    1941             :                 return;
    1942             : 
    1943             :         /*
    1944             :          * Drain all pending inodegc work, including inodes that could be
    1945             :          * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
    1946             :          * threads that sample the inodegc state just prior to us clearing it.
    1947             :          * The inodegc flag state prevents new threads from queuing more
    1948             :          * inodes, so we queue pending work items and flush the workqueue until
    1949             :          * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
    1950             :          * here because it does not allow other unserialized mechanisms to
    1951             :          * reschedule inodegc work while this draining is in progress.
    1952             :          */
    1953      117703 :         xfs_inodegc_queue_all(mp);
    1954      117703 :         do {
    1955      117703 :                 flush_workqueue(mp->m_inodegc_wq);
    1956      117703 :                 rerun = xfs_inodegc_queue_all(mp);
    1957      117703 :         } while (rerun);
    1958             : 
    1959      117703 :         trace_xfs_inodegc_stop(mp, __return_address);
    1960             : }
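
The drain loop above keeps flushing until a requeue pass finds nothing left,
which is what catches enqueuers that sampled the enabled flag just before it
was cleared.  A userspace sketch of the same loop with hypothetical toy_*
helpers (a counter plays the part of the pending per-cpu inodegc lists):

    #include <stdbool.h>

    static int pending = 3;
    static int racing = 1;          /* one late enqueue slips in */

    static void toy_flush_workqueue(void)
    {
            pending = 0;
            if (racing) {           /* an unserialized enqueuer requeues work */
                    racing = 0;
                    pending = 1;
            }
    }

    /* returns true if anything was still queued and had to be kicked */
    static bool toy_queue_all(void)
    {
            return pending > 0;
    }

    int main(void)
    {
            bool rerun;

            do {
                    toy_flush_workqueue();
                    rerun = toy_queue_all();
            } while (rerun);        /* loop until a pass finds nothing */
            return pending;
    }
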
    1961             : 
    1962             : /*
    1963             :  * Enable the inode inactivation background workers and schedule deferred inode
    1964             :  * inactivation work if there is any.  Caller must hold sb->s_umount to
    1965             :  * coordinate changes in the inodegc_enabled state.
    1966             :  */
    1967             : void
    1968      117735 : xfs_inodegc_start(
    1969             :         struct xfs_mount        *mp)
    1970             : {
    1971      117735 :         if (xfs_set_inodegc_enabled(mp))
    1972             :                 return;
    1973             : 
    1974      117713 :         trace_xfs_inodegc_start(mp, __return_address);
    1975      117713 :         xfs_inodegc_queue_all(mp);
    1976             : }
    1977             : 
    1978             : #ifdef CONFIG_XFS_RT
    1979             : static inline bool
    1980    69333715 : xfs_inodegc_want_queue_rt_file(
    1981             :         struct xfs_inode        *ip)
    1982             : {
    1983    69333715 :         struct xfs_mount        *mp = ip->i_mount;
    1984             : 
    1985    69333715 :         if (!XFS_IS_REALTIME_INODE(ip))
    1986             :                 return false;
    1987             : 
    1988     7313685 :         if (__percpu_counter_compare(&mp->m_frextents,
    1989     7313691 :                                 mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
    1990             :                                 XFS_FDBLOCKS_BATCH) < 0)
    1991      200433 :                 return true;
    1992             : 
    1993             :         return false;
    1994             : }
    1995             : #else
    1996             : # define xfs_inodegc_want_queue_rt_file(ip)     (false)
    1997             : #endif /* CONFIG_XFS_RT */
    1998             : 
    1999             : /*
    2000             :  * Schedule the inactivation worker when:
    2001             :  *
    2002             :  *  - We've accumulated more than one inode cluster buffer's worth of inodes.
    2003             :  *  - There is less than 5% free space left.
    2004             :  *  - Any of the quotas for this inode are near an enforcement limit.
    2005             :  */
    2006             : static inline bool
    2007    86132148 : xfs_inodegc_want_queue_work(
    2008             :         struct xfs_inode        *ip,
    2009             :         unsigned int            items)
    2010             : {
    2011    86132148 :         struct xfs_mount        *mp = ip->i_mount;
    2012             : 
    2013    86132148 :         if (items > mp->m_ino_geo.inodes_per_cluster)
    2014             :                 return true;
    2015             : 
    2016    69798929 :         if (__percpu_counter_compare(&mp->m_fdblocks,
    2017    69803711 :                                 mp->m_low_space[XFS_LOWSP_5_PCNT],
    2018             :                                 XFS_FDBLOCKS_BATCH) < 0)
    2019             :                 return true;
    2020             : 
    2021    69345737 :         if (xfs_inodegc_want_queue_rt_file(ip))
    2022             :                 return true;
    2023             : 
    2024    69121947 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
    2025             :                 return true;
    2026             : 
    2027    69097000 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
    2028             :                 return true;
    2029             : 
    2030    69102316 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
    2031         116 :                 return true;
    2032             : 
    2033             :         return false;
    2034             : }
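
A userspace sketch of the decision above, reduced to its two cheapest
triggers; toy_want_queue_work, the threshold constant, and the free-space
arguments are hypothetical simplifications of the per-cpu counters the kernel
actually consults:

    #include <stdbool.h>

    #define TOY_INODES_PER_CLUSTER 32       /* hypothetical cluster size */

    static bool toy_want_queue_work(unsigned items, long fdblocks,
                                    long low_space)
    {
            if (items > TOY_INODES_PER_CLUSTER)
                    return true;    /* a cluster's worth is already pending */
            if (fdblocks < low_space)
                    return true;    /* under the ~5% free space threshold */
            return false;           /* otherwise let the batch keep growing */
    }

    int main(void)
    {
            /* small backlog, plenty of space: stay on the delayed timer */
            return toy_want_queue_work(4, 1000000, 50000) ? 1 : 0;
    }
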
    2035             : 
    2036             : /*
    2037             :  * Upper bound on the number of inodes in each AG that can be queued for
    2038             :  * inactivation at any given time, to avoid monopolizing the workqueue.
    2039             :  */
    2040             : #define XFS_INODEGC_MAX_BACKLOG         (4 * XFS_INODES_PER_CHUNK)
    2041             : 
    2042             : /*
    2043             :  * Make the frontend wait for inactivations when:
    2044             :  *
    2045             :  *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
    2046             :  *  - The queue depth exceeds the maximum allowable percpu backlog.
    2047             :  *
    2048             :  * Note: If the current thread is running a transaction, we don't ever want to
    2049             :  * wait for other transactions because that could introduce a deadlock.
    2050             :  */
    2051             : static inline bool
    2052             : xfs_inodegc_want_flush_work(
    2053             :         struct xfs_inode        *ip,
    2054             :         unsigned int            items,
    2055             :         unsigned int            shrinker_hits)
    2056             : {
    2057    86064215 :         if (current->journal_info)
    2058             :                 return false;
    2059             : 
    2060    85983821 :         if (shrinker_hits > 0)
    2061             :                 return true;
    2062             : 
    2063    85983820 :         if (items > XFS_INODEGC_MAX_BACKLOG)
    2064             :                 return true;
    2065             : 
    2066             :         return false;
    2067             : }
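
For scale: XFS_INODES_PER_CHUNK is fixed at 64 by the on-disk format, so XFS_INODEGC_MAX_BACKLOG works out to 4 * 64 = 256 queued inodes per CPU before callers are throttled. The current->journal_info test is how XFS detects a running transaction: xfs_trans_context_set() stores the transaction pointer there, so a non-NULL value means the caller is inside a transaction and must not block on inodegc.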
    2068             : 
    2069             : /*
    2070             :  * Queue a background inactivation worker if there are inodes that need to be
    2071             :  * inactivated and higher level xfs code hasn't disabled the background
    2072             :  * workers.
    2073             :  */
    2074             : static void
    2075    86067455 : xfs_inodegc_queue(
    2076             :         struct xfs_inode        *ip)
    2077             : {
    2078    86067455 :         struct xfs_mount        *mp = ip->i_mount;
    2079    86067455 :         struct xfs_inodegc      *gc;
    2080    86067455 :         int                     items;
    2081    86067455 :         unsigned int            shrinker_hits;
    2082    86067455 :         unsigned long           queue_delay = 1;
    2083             : 
    2084    86067455 :         trace_xfs_inode_set_need_inactive(ip);
    2085    86045870 :         spin_lock(&ip->i_flags_lock);
    2086    86160353 :         ip->i_flags |= XFS_NEED_INACTIVE;
    2087    86160353 :         spin_unlock(&ip->i_flags_lock);
    2088             : 
    2089    86172030 :         gc = get_cpu_ptr(mp->m_inodegc);
    2090    86145338 :         llist_add(&ip->i_gclist, &gc->list);
    2091    86146412 :         items = READ_ONCE(gc->items);
    2092    86146412 :         WRITE_ONCE(gc->items, items + 1);
    2093    86146412 :         shrinker_hits = READ_ONCE(gc->shrinker_hits);
    2094             : 
    2095             :         /*
    2096             :          * We queue the work while still pinned to this CPU (get_cpu_ptr
    2097             :          * has disabled preemption) so that the work runs on the same CPU.
    2098             :          */
    2099   172292824 :         if (!xfs_is_inodegc_enabled(mp)) {
    2100           0 :                 put_cpu_ptr(gc);
    2101           0 :                 return;
    2102             :         }
    2103             : 
    2104    86146412 :         if (xfs_inodegc_want_queue_work(ip, items))
    2105    16985459 :                 queue_delay = 0;
    2106             : 
    2107    86081315 :         trace_xfs_inodegc_queue(mp, __return_address);
    2108    86062711 :         mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2109             :                         queue_delay);
    2110    86018087 :         put_cpu_ptr(gc);
    2111             : 
    2112    86064215 :         if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
    2113     5347843 :                 trace_xfs_inodegc_throttle(mp, __return_address);
    2114     5344041 :                 flush_delayed_work(&gc->work);
    2115             :         }
    2116             : }
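
The shape of xfs_inodegc_queue() (a lockless per-CPU list fed by producers, drained by a per-CPU delayed work item) is worth isolating. The sketch below is illustrative only: the struct and function names are invented, setup code (alloc_percpu(), INIT_DELAYED_WORK(), alloc_workqueue()) is omitted, and only the llist, percpu, and workqueue calls are the real kernel APIs:

#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

struct demo_gc {
	struct llist_head	list;	/* lockless producer/consumer list */
	struct delayed_work	work;	/* drains this CPU's list */
};

struct demo_item {
	struct llist_node	gclist;
};

static struct demo_gc __percpu *demo_gc_pcpu;
static struct workqueue_struct *demo_wq;

/* Producer: runs on any CPU, never takes a lock. */
static void demo_queue(struct demo_item *item)
{
	struct demo_gc *gc;

	/* get_cpu_ptr() disables preemption, pinning us to this CPU. */
	gc = get_cpu_ptr(demo_gc_pcpu);
	llist_add(&item->gclist, &gc->list);
	mod_delayed_work_on(smp_processor_id(), demo_wq, &gc->work, 1);
	put_cpu_ptr(demo_gc_pcpu);
}

/* Worker: detaches the whole list in one atomic op, then processes it. */
static void demo_worker(struct work_struct *work)
{
	struct demo_gc *gc = container_of(to_delayed_work(work),
					  struct demo_gc, work);
	struct llist_node *node = llist_del_all(&gc->list);
	struct demo_item *item, *n;

	llist_for_each_entry_safe(item, n, node, gclist)
		kfree(item);	/* stand-in for the real inactivation work */
}

Note that mod_delayed_work_on() both arms an idle timer and re-arms an already-pending one, which is how xfs_inodegc_queue() above upgrades the 1-jiffy delay to 0 once the backlog turns urgent.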
    2117             : 
    2118             : /*
    2119             :  * Fold the dead CPU's inodegc queue into the current CPU's queue.
    2120             :  */
    2121             : void
    2122         160 : xfs_inodegc_cpu_dead(
    2123             :         struct xfs_mount        *mp,
    2124             :         unsigned int            dead_cpu)
    2125             : {
    2126         160 :         struct xfs_inodegc      *dead_gc, *gc;
    2127         160 :         struct llist_node       *first, *last;
    2128         160 :         unsigned int            count = 0;
    2129             : 
    2130         160 :         dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
    2131         160 :         cancel_delayed_work_sync(&dead_gc->work);
    2132             : 
    2133         160 :         if (llist_empty(&dead_gc->list))
    2134             :                 return;
    2135             : 
    2136           0 :         first = dead_gc->list.first;
    2137           0 :         last = first;
    2138           0 :         while (last->next) {
    2139           0 :                 last = last->next;
    2140           0 :                 count++;
    2141             :         }
    2142           0 :         dead_gc->list.first = NULL;
    2143           0 :         dead_gc->items = 0;
    2144             : 
    2145             :         /* Add pending work to current CPU */
    2146           0 :         gc = get_cpu_ptr(mp->m_inodegc);
    2147           0 :         llist_add_batch(first, last, &gc->list);
    2148           0 :         count += READ_ONCE(gc->items);
    2149           0 :         WRITE_ONCE(gc->items, count);
    2150             : 
    2151           0 :         if (xfs_is_inodegc_enabled(mp)) {
    2152           0 :                 trace_xfs_inodegc_queue(mp, __return_address);
    2153           0 :                 mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2154             :                                 0);
    2155             :         }
    2156           0 :         put_cpu_ptr(gc);
    2157             : }
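
Two details worth noting in the fold above: an llist_head carries no length, which is why the chain must be walked just to count the dead CPU's items, and llist_add_batch(first, last, &gc->list) splices the already-linked chain onto the destination atomically, so the move itself is O(1) regardless of how long the backlog is.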
    2158             : 
    2159             : /*
    2160             :  * We set the inode flag atomically with the radix tree tag.  Once lookups are
    2161             :  * done by radix tree tag, this inode flag can go away.
    2162             :  *
    2163             :  * We always use background reclaim here because even if the inode is clean, it
    2164             :  * may still be under IO and hence we have to wait for IO completion to occur
    2165             :  * before we can reclaim the inode. The background reclaim path handles this
    2166             :  * more efficiently than we can here, so simply let background reclaim tear down
    2167             :  * all inodes.
    2168             :  */
    2169             : void
    2170  1115800591 : xfs_inode_mark_reclaimable(
    2171             :         struct xfs_inode        *ip)
    2172             : {
    2173  1115800591 :         struct xfs_mount        *mp = ip->i_mount;
    2174  1115800591 :         bool                    need_inactive;
    2175             : 
    2176  1115800591 :         XFS_STATS_INC(mp, vn_reclaim);
    2177             : 
    2178             :         /*
    2179             :          * We should never get here with any of the reclaim flags already set.
    2180             :          */
    2181  2231530150 :         ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
    2182             : 
    2183  1115996798 :         need_inactive = xfs_inode_needs_inactive(ip);
    2184  1115445852 :         if (need_inactive) {
    2185    86036303 :                 xfs_inodegc_queue(ip);
    2186    86036303 :                 return;
    2187             :         }
    2188             : 
    2189             :         /* Going straight to reclaim, so drop the dquots. */
    2190  1029409549 :         xfs_qm_dqdetach(ip);
    2191  1029392903 :         xfs_inodegc_set_reclaimable(ip);
    2192             : }
    2193             : 
    2194             : /*
    2195             :  * Register a phony shrinker so that we can run background inodegc sooner when
    2196             :  * there's memory pressure.  Inactivation does not itself free any memory but
    2197             :  * it does make inodes reclaimable, which eventually frees memory.
    2198             :  *
    2199             :  * The count function, seek value, and batch value are crafted to trigger the
    2200             :  * scan function during the second round of scanning.  Hopefully this means
    2201             :  * that we reclaimed enough memory that initiating metadata transactions won't
    2202             :  * make things worse.
    2203             :  */
    2204             : #define XFS_INODEGC_SHRINKER_COUNT      (1UL << DEF_PRIORITY)
    2205             : #define XFS_INODEGC_SHRINKER_BATCH      ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
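
Worked through with the kernel constants of this vintage: DEF_PRIORITY is 12, so the count callback below advertises 1UL << 12 = 4096 objects and the batch is 4096 / 2 + 1 = 2049. Because ->seeks is 0, the shrinker core in mm/vmscan.c treats the objects as costing no IO to recreate and scans roughly freeable / 2 = 2048 per pass. The first pass accumulates 2048 < 2049 and never invokes scan_objects(); the second pass reaches 4096 >= 2049 and does, which is the "second round of scanning" the comment above promises.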
    2206             : 
    2207             : static unsigned long
    2208        7304 : xfs_inodegc_shrinker_count(
    2209             :         struct shrinker         *shrink,
    2210             :         struct shrink_control   *sc)
    2211             : {
    2212        7304 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2213             :                                                    m_inodegc_shrinker);
    2214        7304 :         struct xfs_inodegc      *gc;
    2215        7304 :         int                     cpu;
    2216             : 
    2217       14608 :         if (!xfs_is_inodegc_enabled(mp))
    2218             :                 return 0;
    2219             : 
    2220       36341 :         for_each_online_cpu(cpu) {
    2221       29110 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2222       29110 :                 if (!llist_empty(&gc->list))
    2223             :                         return XFS_INODEGC_SHRINKER_COUNT;
    2224             :         }
    2225             : 
    2226             :         return 0;
    2227             : }
    2228             : 
    2229             : static unsigned long
    2230          35 : xfs_inodegc_shrinker_scan(
    2231             :         struct shrinker         *shrink,
    2232             :         struct shrink_control   *sc)
    2233             : {
    2234          35 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2235             :                                                    m_inodegc_shrinker);
    2236          35 :         struct xfs_inodegc      *gc;
    2237          35 :         int                     cpu;
    2238          35 :         bool                    no_items = true;
    2239             : 
    2240          70 :         if (!xfs_is_inodegc_enabled(mp))
    2241             :                 return SHRINK_STOP;
    2242             : 
    2243          35 :         trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
    2244             : 
    2245         210 :         for_each_online_cpu(cpu) {
    2246         140 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2247         140 :                 if (!llist_empty(&gc->list)) {
    2248          36 :                         unsigned int    h = READ_ONCE(gc->shrinker_hits);
    2249             : 
    2250          36 :                         WRITE_ONCE(gc->shrinker_hits, h + 1);
    2251          36 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
    2252          36 :                         no_items = false;
    2253             :                 }
    2254             :         }
    2255             : 
    2256             :         /*
    2257             :          * If there are no inodes to inactivate, we don't want the shrinker
    2258             :          * to think there's deferred work to call us back about.
    2259             :          */
    2260          35 :         if (no_items)
    2261           0 :                 return LONG_MAX;
    2262             : 
    2263             :         return SHRINK_STOP;
    2264             : }
    2265             : 
    2266             : /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
    2267             : int
    2268       60764 : xfs_inodegc_register_shrinker(
    2269             :         struct xfs_mount        *mp)
    2270             : {
    2271       60764 :         struct shrinker         *shrink = &mp->m_inodegc_shrinker;
    2272             : 
    2273       60764 :         shrink->count_objects = xfs_inodegc_shrinker_count;
    2274       60764 :         shrink->scan_objects = xfs_inodegc_shrinker_scan;
    2275       60764 :         shrink->seeks = 0;
    2276       60764 :         shrink->flags = SHRINKER_NONSLAB;
    2277       60764 :         shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
    2278             : 
    2279       60764 :         return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
    2280             : }
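
A registered shrinker must be torn down before the mount structure is freed; the matching unregister_shrinker() call lives in the unmount path outside this file. As a self-contained illustration of the same register/unregister pairing (all demo_* names are invented and the callback bodies are omitted; register_shrinker()'s printf-style name argument is the real post-5.19 signature):

#include <linux/init.h>
#include <linux/shrinker.h>

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc);

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= 0,	/* objects cost no IO to recreate */
	.batch		= 2049,
};

static int __init demo_init(void)
{
	/* The name appears in shrinker debugfs when CONFIG_SHRINKER_DEBUG=y. */
	return register_shrinker(&demo_shrinker, "demo-gc");
}

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}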

Generated by: LCOV version 1.14