LCOV - code coverage report
Current view: top level - fs/xfs - xfs_icache.c
Test:         fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023
Date:         2023-07-31 20:08:34

                   Hit   Total   Coverage
  Lines:           844     921     91.6 %
  Functions:        64      65     98.5 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_trans_priv.h"
      16             : #include "xfs_inode_item.h"
      17             : #include "xfs_quota.h"
      18             : #include "xfs_trace.h"
      19             : #include "xfs_icache.h"
      20             : #include "xfs_bmap_util.h"
      21             : #include "xfs_dquot_item.h"
      22             : #include "xfs_dquot.h"
      23             : #include "xfs_reflink.h"
      24             : #include "xfs_ialloc.h"
      25             : #include "xfs_ag.h"
      26             : #include "xfs_log_priv.h"
      27             : #include "xfs_health.h"
      28             : #include "xfs_da_format.h"
      29             : #include "xfs_dir2.h"
      30             : #include "xfs_imeta.h"
      31             : 
      32             : #include <linux/iversion.h>
      33             : 
      34             : /* Radix tree tags for incore inode tree. */
      35             : 
      36             : /* inode is to be reclaimed */
      37             : #define XFS_ICI_RECLAIM_TAG     0
      38             : /* Inode has speculative preallocations (posteof or cow) to clean. */
      39             : #define XFS_ICI_BLOCKGC_TAG     1
      40             : 
      41             : /*
      42             :  * The goal for walking incore inodes.  These can correspond with incore inode
      43             :  * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
      44             :  */
      45             : enum xfs_icwalk_goal {
      46             :         /* Goals directly associated with tagged inodes. */
      47             :         XFS_ICWALK_BLOCKGC      = XFS_ICI_BLOCKGC_TAG,
      48             :         XFS_ICWALK_RECLAIM      = XFS_ICI_RECLAIM_TAG,
      49             : };
      50             : 
      51             : static int xfs_icwalk(struct xfs_mount *mp,
      52             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      53             : static int xfs_icwalk_ag(struct xfs_perag *pag,
      54             :                 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
      55             : 
      56             : /*
      57             :  * Private inode cache walk flags for struct xfs_icwalk.  Must not
      58             :  * coincide with XFS_ICWALK_FLAGS_VALID.
      59             :  */
      60             : 
      61             : /* Stop scanning after icw_scan_limit inodes. */
      62             : #define XFS_ICWALK_FLAG_SCAN_LIMIT      (1U << 28)
      63             : 
      64             : #define XFS_ICWALK_FLAG_RECLAIM_SICK    (1U << 27)
      65             : #define XFS_ICWALK_FLAG_UNION           (1U << 26) /* union filter algorithm */
      66             : 
      67             : #define XFS_ICWALK_PRIVATE_FLAGS        (XFS_ICWALK_FLAG_SCAN_LIMIT | \
      68             :                                          XFS_ICWALK_FLAG_RECLAIM_SICK | \
      69             :                                          XFS_ICWALK_FLAG_UNION)
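
The public XFS_ICWALK_FLAGS_VALID mask is defined in xfs_icache.h rather than in this file. As a minimal sketch of a compile-time guard for the "must not coincide" rule above (the public flag value here is assumed for illustration, not the real definition):

    /* Sketch only: the public flag below is hypothetical, not the real mask. */
    #define XFS_ICWALK_FLAG_SYNC    (1U << 0)       /* assumed public flag */
    #define XFS_ICWALK_FLAGS_VALID  (XFS_ICWALK_FLAG_SYNC)

    _Static_assert((XFS_ICWALK_FLAGS_VALID & XFS_ICWALK_PRIVATE_FLAGS) == 0,
                   "private icwalk flags must not overlap XFS_ICWALK_FLAGS_VALID");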
      70             : 
      71             : /*
      72             :  * Allocate and initialise an xfs_inode.
      73             :  */
      74             : struct xfs_inode *
      75   444998433 : xfs_inode_alloc(
      76             :         struct xfs_mount        *mp,
      77             :         xfs_ino_t               ino)
      78             : {
      79   444998433 :         struct xfs_inode        *ip;
      80             : 
      81             :         /*
      82             :          * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
      83             :          * and return NULL here on ENOMEM.
      84             :          */
      85   444998433 :         ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
      86             : 
      87   445710232 :         if (inode_init_always(mp->m_super, VFS_I(ip))) {
      88           0 :                 kmem_cache_free(xfs_inode_cache, ip);
      89           0 :                 return NULL;
      90             :         }
      91             : 
      92             :         /* VFS doesn't initialise i_mode or i_state! */
      93   445620923 :         VFS_I(ip)->i_mode = 0;
      94   445620923 :         VFS_I(ip)->i_state = 0;
      95   445620923 :         mapping_set_large_folios(VFS_I(ip)->i_mapping);
      96             : 
      97   445638967 :         XFS_STATS_INC(mp, vn_active);
      98   445322733 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
      99   445322733 :         ASSERT(ip->i_ino == 0);
     100             : 
     101             :         /* initialise the xfs inode */
     102   445322733 :         ip->i_ino = ino;
     103   445322733 :         ip->i_mount = mp;
     104   445322733 :         memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
     105   445322733 :         ip->i_cowfp = NULL;
     106   445322733 :         memset(&ip->i_af, 0, sizeof(ip->i_af));
     107   445322733 :         ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
     108   445322733 :         memset(&ip->i_df, 0, sizeof(ip->i_df));
     109   445322733 :         ip->i_flags = 0;
     110   445322733 :         ip->i_delayed_blks = 0;
     111   445322733 :         ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
     112   445322733 :         ip->i_nblocks = 0;
     113   445322733 :         ip->i_forkoff = 0;
     114   445322733 :         ip->i_sick = 0;
     115   445322733 :         ip->i_checked = 0;
     116   445322733 :         INIT_WORK(&ip->i_ioend_work, xfs_end_io);
     117   445322733 :         INIT_LIST_HEAD(&ip->i_ioend_list);
     118   445322733 :         spin_lock_init(&ip->i_ioend_lock);
     119   445686716 :         ip->i_next_unlinked = NULLAGINO;
     120   445686716 :         ip->i_prev_unlinked = 0;
     121             : 
     122   445686716 :         return ip;
     123             : }
     124             : 
     125             : STATIC void
     126   444709486 : xfs_inode_free_callback(
     127             :         struct rcu_head         *head)
     128             : {
     129   444709486 :         struct inode            *inode = container_of(head, struct inode, i_rcu);
     130   444709486 :         struct xfs_inode        *ip = XFS_I(inode);
     131             : 
     132   444709486 :         switch (VFS_I(ip)->i_mode & S_IFMT) {
     133   287198690 :         case S_IFREG:
     134             :         case S_IFDIR:
     135             :         case S_IFLNK:
     136   287198690 :                 xfs_idestroy_fork(&ip->i_df);
     137   287198690 :                 break;
     138             :         }
     139             : 
     140   444726567 :         xfs_ifork_zap_attr(ip);
     141             : 
     142   445240087 :         if (ip->i_cowfp) {
     143    96025217 :                 xfs_idestroy_fork(ip->i_cowfp);
     144    96011894 :                 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
     145             :         }
     146   445092811 :         if (ip->i_itemp) {
     147    83950298 :                 ASSERT(!test_bit(XFS_LI_IN_AIL,
     148             :                                  &ip->i_itemp->ili_item.li_flags));
     149    83950298 :                 xfs_inode_item_destroy(ip);
     150    83986216 :                 ip->i_itemp = NULL;
     151             :         }
     152             : 
     153   445128729 :         kmem_cache_free(xfs_inode_cache, ip);
     154   444948469 : }
     155             : 
     156             : static void
     157   445921946 : __xfs_inode_free(
     158             :         struct xfs_inode        *ip)
     159             : {
     160             :         /* asserts to verify all state is correct here */
     161   445921946 :         ASSERT(atomic_read(&ip->i_pincount) == 0);
     162   445921946 :         ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
     163   445921946 :         XFS_STATS_DEC(ip->i_mount, vn_active);
     164             : 
     165   445921941 :         call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
     166   445922057 : }
     167             : 
     168             : void
     169      816345 : xfs_inode_free(
     170             :         struct xfs_inode        *ip)
     171             : {
     172     1632691 :         ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
     173             : 
     174             :         /*
     175             :          * Because we use RCU freeing we need to ensure the inode always
     176             :          * appears to be reclaimed with an invalid inode number when in the
     177             :          * free state. The ip->i_flags_lock provides the barrier against lookup
     178             :          * races.
     179             :          */
     180      816346 :         spin_lock(&ip->i_flags_lock);
     181      816347 :         ip->i_flags = XFS_IRECLAIM;
     182      816347 :         ip->i_ino = 0;
     183      816347 :         spin_unlock(&ip->i_flags_lock);
     184             : 
     185      816347 :         __xfs_inode_free(ip);
     186      816344 : }
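
The matching reader-side check is in xfs_iget_cache_hit() below: a lockless RCU lookup takes ip->i_flags_lock and verifies that ip->i_ino still equals the number it searched for, so an inode poisoned here (i_ino zeroed under the same lock) is skipped rather than resurrected.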
     187             : 
     188             : /*
     189             :  * Queue background inode reclaim work if there are reclaimable inodes and there
     190             :  * isn't reclaim work already scheduled or in progress.
     191             :  */
     192             : static void
     193     9297240 : xfs_reclaim_work_queue(
     194             :         struct xfs_mount        *mp)
     195             : {
     196             : 
     197     9297240 :         rcu_read_lock();
     198     9295309 :         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
     199     9271618 :                 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
     200     9273397 :                         msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
     201             :         }
     202     9298380 :         rcu_read_unlock();
     203     9295682 : }
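
As a worked example, with the default fs.xfs.xfssyncd_centisecs of 3000 (30 seconds, assuming the tunable is unchanged), the delay above is 3000 / 6 * 10 = 5000 ms, so background reclaim is requeued roughly every 5 seconds.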
     204             : 
     205             : /*
     206             :  * Background scanning to trim preallocated space. This is queued based on the
     207             :  * 'speculative_prealloc_lifetime' tunable (5m by default).
     208             :  */
     209             : static inline void
     210     5823881 : xfs_blockgc_queue(
     211             :         struct xfs_perag        *pag)
     212             : {
     213     5823881 :         struct xfs_mount        *mp = pag->pag_mount;
     214             : 
     215    11647762 :         if (!xfs_is_blockgc_enabled(mp))
     216             :                 return;
     217             : 
     218     5823565 :         rcu_read_lock();
     219     5823503 :         if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
     220     5748337 :                 queue_delayed_work(pag->pag_mount->m_blockgc_wq,
     221             :                                    &pag->pag_blockgc_work,
     222     5748298 :                                    msecs_to_jiffies(xfs_blockgc_secs * 1000));
     223     5823671 :         rcu_read_unlock();
     224             : }
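
Here the delay is just the tunable scaled to milliseconds: with the default xfs_blockgc_secs of 300 seconds (the "5m by default" noted above), the worker runs msecs_to_jiffies(300 * 1000) -- five minutes -- after it is queued.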
     225             : 
     226             : /* Set a tag on both the AG incore inode tree and the AG radix tree. */
     227             : static void
     228   886078071 : xfs_perag_set_inode_tag(
     229             :         struct xfs_perag        *pag,
     230             :         xfs_agino_t             agino,
     231             :         unsigned int            tag)
     232             : {
     233   886078071 :         struct xfs_mount        *mp = pag->pag_mount;
     234   886078071 :         bool                    was_tagged;
     235             : 
     236   886078071 :         lockdep_assert_held(&pag->pag_ici_lock);
     237             : 
     238   886078071 :         was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
     239   886056534 :         radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
     240             : 
     241   886116844 :         if (tag == XFS_ICI_RECLAIM_TAG)
     242   879623973 :                 pag->pag_ici_reclaimable++;
     243             : 
     244   886116844 :         if (was_tagged)
     245             :                 return;
     246             : 
     247             :         /* propagate the tag up into the perag radix tree */
     248     9816852 :         spin_lock(&mp->m_perag_lock);
     249     9830850 :         radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
     250     9830850 :         spin_unlock(&mp->m_perag_lock);
     251             : 
     252             :         /* start background work */
     253     9830832 :         switch (tag) {
     254     9182324 :         case XFS_ICI_RECLAIM_TAG:
     255     9182324 :                 xfs_reclaim_work_queue(mp);
     256     9182324 :                 break;
     257      648508 :         case XFS_ICI_BLOCKGC_TAG:
     258      648508 :                 xfs_blockgc_queue(pag);
     259      648508 :                 break;
     260             :         }
     261             : 
     262     9814714 :         trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
     263             : }
     264             : 
     265             : /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
     266             : static void
     267   904934145 : xfs_perag_clear_inode_tag(
     268             :         struct xfs_perag        *pag,
     269             :         xfs_agino_t             agino,
     270             :         unsigned int            tag)
     271             : {
     272   904934145 :         struct xfs_mount        *mp = pag->pag_mount;
     273             : 
     274   904934145 :         lockdep_assert_held(&pag->pag_ici_lock);
     275             : 
     276             :         /*
     277             :          * Reclaim can signal (with a null agino) that it cleared its own tag
     278             :          * by removing the inode from the radix tree.
     279             :          */
     280   904934145 :         if (agino != NULLAGINO)
     281   459828436 :                 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
     282             :         else
     283   445105709 :                 ASSERT(tag == XFS_ICI_RECLAIM_TAG);
     284             : 
     285   904926282 :         if (tag == XFS_ICI_RECLAIM_TAG)
     286   879740589 :                 pag->pag_ici_reclaimable--;
     287             : 
     288   904926282 :         if (radix_tree_tagged(&pag->pag_ici_root, tag))
     289             :                 return;
     290             : 
     291             :         /* clear the tag from the perag radix tree */
     292    23002925 :         spin_lock(&mp->m_perag_lock);
     293    23115517 :         radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
     294    23115517 :         spin_unlock(&mp->m_perag_lock);
     295             : 
     296    23115242 :         trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
     297             : }
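
Together, xfs_perag_set_inode_tag() and xfs_perag_clear_inode_tag() maintain a two-level summary: per-inode tag bits in pag_ici_root, mirrored by a single per-AG bit in m_perag_tree, which makes "does any AG have tagged inodes?" a cheap query (see xfs_reclaim_work_queue() above). A minimal user-space sketch of the pattern, with plain bitmasks standing in for the tagged radix trees:

    /* Sketch only: bitmasks stand in for the kernel's tagged radix trees. */
    #include <stdbool.h>

    #define NR_AGS  8   /* assumes AG and inode indices both fit in one word */

    static unsigned long ag_inode_tags[NR_AGS]; /* per-AG inode bits (pag_ici_root) */
    static unsigned long perag_tags;            /* one summary bit per AG (m_perag_tree) */

    static void set_inode_tag(unsigned int ag, unsigned int ino)
    {
            bool was_tagged = ag_inode_tags[ag] != 0;

            ag_inode_tags[ag] |= 1UL << ino;
            if (!was_tagged)                 /* first tagged inode in this AG: */
                    perag_tags |= 1UL << ag; /* propagate the tag upwards */
    }

    static void clear_inode_tag(unsigned int ag, unsigned int ino)
    {
            ag_inode_tags[ag] &= ~(1UL << ino);
            if (ag_inode_tags[ag] == 0)      /* last tagged inode went away: */
                    perag_tags &= ~(1UL << ag);
    }

    static bool any_ag_tagged(void)          /* O(1) "any work pending?" query */
    {
            return perag_tags != 0;
    }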
     298             : 
     299             : /*
     300             :  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
     301             :  * part of the structure. This is made more complex by the fact we store
     302             :  * information about the on-disk values in the VFS inode and so we can't just
     303             :  * overwrite the values unconditionally. Hence we save the parameters we
     304             :  * need to retain across reinitialisation, and rewrite them into the VFS inode
     305             :  * after reinitialisation even if it fails.
     306             :  */
     307             : static int
     308   434660510 : xfs_reinit_inode(
     309             :         struct xfs_mount        *mp,
     310             :         struct inode            *inode)
     311             : {
     312   434660510 :         int                     error;
     313   434660510 :         uint32_t                nlink = inode->i_nlink;
     314   434660510 :         uint32_t                generation = inode->i_generation;
     315   434660510 :         uint64_t                version = inode_peek_iversion(inode);
     316   434660510 :         umode_t                 mode = inode->i_mode;
     317   434660510 :         dev_t                   dev = inode->i_rdev;
     318   434660510 :         kuid_t                  uid = inode->i_uid;
     319   434660510 :         kgid_t                  gid = inode->i_gid;
     320             : 
     321   434660510 :         error = inode_init_always(mp->m_super, inode);
     322             : 
     323   434645066 :         set_nlink(inode, nlink);
     324   434627604 :         inode->i_generation = generation;
     325   434627604 :         inode_set_iversion_queried(inode, version);
     326   434627604 :         inode->i_mode = mode;
     327   434627604 :         inode->i_rdev = dev;
     328   434627604 :         inode->i_uid = uid;
     329   434627604 :         inode->i_gid = gid;
     330   434627604 :         mapping_set_large_folios(inode->i_mapping);
     331   434629126 :         return error;
     332             : }
     333             : 
     334             : /*
     335             :  * Carefully nudge an inode whose VFS state has been torn down back into a
     336             :  * usable state.  Drops the i_flags_lock and the rcu read lock.
     337             :  */
     338             : static int
     339   434637674 : xfs_iget_recycle(
     340             :         struct xfs_perag        *pag,
     341             :         struct xfs_inode        *ip) __releases(&ip->i_flags_lock)
     342             : {
     343   434637674 :         struct xfs_mount        *mp = ip->i_mount;
     344   434637674 :         struct inode            *inode = VFS_I(ip);
     345   434637674 :         int                     error;
     346             : 
     347   434637674 :         trace_xfs_iget_recycle(ip);
     348             : 
     349   434629245 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     350             :                 return -EAGAIN;
     351             : 
     352             :         /*
     353             :          * We need to make it look like the inode is being reclaimed to prevent
     354             :          * the actual reclaim workers from stomping over us while we recycle
     355             :          * the inode.  We can't clear the radix tree tag yet as it requires
     356             :          * pag_ici_lock to be held exclusive.
     357             :          */
     358   434638504 :         ip->i_flags |= XFS_IRECLAIM;
     359             : 
     360   434638504 :         spin_unlock(&ip->i_flags_lock);
     361   434666683 :         rcu_read_unlock();
     362             : 
     363   434663512 :         ASSERT(!rwsem_is_locked(&inode->i_rwsem));
     364   434663512 :         error = xfs_reinit_inode(mp, inode);
     365   434630666 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     366   434625597 :         if (error) {
     367             :                 /*
     368             :                  * Re-initializing the inode failed, and we are in deep
     369             :                  * trouble.  Try to re-add it to the reclaim list.
     370             :                  */
     371           0 :                 rcu_read_lock();
     372           0 :                 spin_lock(&ip->i_flags_lock);
     373           0 :                 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
     374           0 :                 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
     375           0 :                 spin_unlock(&ip->i_flags_lock);
     376           0 :                 rcu_read_unlock();
     377             : 
     378           0 :                 trace_xfs_iget_recycle_fail(ip);
     379           0 :                 return error;
     380             :         }
     381             : 
     382   434625597 :         spin_lock(&pag->pag_ici_lock);
     383   434673936 :         spin_lock(&ip->i_flags_lock);
     384             : 
     385             :         /*
     386             :          * Clear the per-lifetime state in the inode as we are now effectively
     387             :          * a new inode and need to return to the initial state before reuse
     388             :          * occurs.
     389             :          */
     390   434681908 :         ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
     391   434681908 :         ip->i_flags |= XFS_INEW;
     392   434681908 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
     393             :                         XFS_ICI_RECLAIM_TAG);
     394   434627787 :         inode->i_state = I_NEW;
     395   434627787 :         spin_unlock(&ip->i_flags_lock);
     396   434662040 :         spin_unlock(&pag->pag_ici_lock);
     397             : 
     398   434662040 :         return 0;
     399             : }
     400             : 
     401             : /*
     402             :  * If we are allocating a new inode, then check what was returned is
     403             :  * actually a free, empty inode. If we are not allocating an inode,
     404             :  * then check we didn't find a free inode.
     405             :  *
     406             :  * Returns:
     407             :  *      0               if the inode free state matches the lookup context
     408             :  *      -ENOENT         if the inode is free and we are not allocating
     409             :  *      -EFSCORRUPTED   if there is any state mismatch at all
     410             :  */
     411             : static int
     412 >14093*10^7 : xfs_iget_check_free_state(
     413             :         struct xfs_inode        *ip,
     414             :         int                     flags)
     415             : {
     416 >14093*10^7 :         if (flags & XFS_IGET_CREATE) {
     417             :                 /* should be a free inode */
     418   123720568 :                 if (VFS_I(ip)->i_mode != 0) {
     419           0 :                         xfs_warn(ip->i_mount,
     420             : "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
     421             :                                 ip->i_ino, VFS_I(ip)->i_mode);
     422           0 :                         xfs_agno_mark_sick(ip->i_mount,
     423           0 :                                         XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
     424             :                                         XFS_SICK_AG_INOBT);
     425           0 :                         return -EFSCORRUPTED;
     426             :                 }
     427             : 
     428   123720568 :                 if (ip->i_nblocks != 0) {
     429           0 :                         xfs_warn(ip->i_mount,
     430             : "Corruption detected! Free inode 0x%llx has blocks allocated!",
     431             :                                 ip->i_ino);
     432           0 :                         xfs_agno_mark_sick(ip->i_mount,
     433           0 :                                         XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
     434             :                                         XFS_SICK_AG_INOBT);
     435           0 :                         return -EFSCORRUPTED;
     436             :                 }
     437             :                 return 0;
     438             :         }
     439             : 
     440             :         /* should be an allocated inode */
     441 >14081*10^7 :         if (VFS_I(ip)->i_mode == 0)
     442     2563920 :                 return -ENOENT;
     443             : 
     444             :         return 0;
     445             : }
     446             : 
     447             : /* Make all pending inactivation work start immediately. */
     448             : static bool
     449    47051756 : xfs_inodegc_queue_all(
     450             :         struct xfs_mount        *mp)
     451             : {
     452    47051756 :         struct xfs_inodegc      *gc;
     453    47051756 :         int                     cpu;
     454    47051756 :         bool                    ret = false;
     455             : 
     456   235033760 :         for_each_online_cpu(cpu) {
     457   187931501 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     458   187970961 :                 if (!llist_empty(&gc->list)) {
     459     4008420 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
     460     4008420 :                         ret = true;
     461             :                 }
     462             :         }
     463             : 
     464    47032650 :         return ret;
     465             : }
     466             : 
     467             : /* Wait for all queued work and collect errors */
     468             : static int
     469    16950883 : xfs_inodegc_wait_all(
     470             :         struct xfs_mount        *mp)
     471             : {
     472    16950883 :         int                     cpu;
     473    16950883 :         int                     error = 0;
     474             : 
     475    16950883 :         flush_workqueue(mp->m_inodegc_wq);
     476   101776772 :         for_each_online_cpu(cpu) {
     477    67858395 :                 struct xfs_inodegc      *gc;
     478             : 
     479    67858395 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
     480    67858773 :                 if (gc->error && !error)
     481        2106 :                         error = gc->error;
     482    67858773 :                 gc->error = 0;
     483             :         }
     484             : 
     485    16966262 :         return error;
     486             : }
     487             : 
     488             : /*
      489             :  * Check the validity of the inode we just found in the cache
     490             :  */
     491             : static int
     492 >13975*10^7 : xfs_iget_cache_hit(
     493             :         struct xfs_perag        *pag,
     494             :         struct xfs_inode        *ip,
     495             :         xfs_ino_t               ino,
     496             :         int                     flags,
     497             :         int                     lock_flags) __releases(RCU)
     498             : {
     499 >13975*10^7 :         struct inode            *inode = VFS_I(ip);
     500 >13975*10^7 :         struct xfs_mount        *mp = ip->i_mount;
     501 >13975*10^7 :         int                     error;
     502             : 
     503             :         /*
     504             :          * check for re-use of an inode within an RCU grace period due to the
     505             :          * radix tree nodes not being updated yet. We monitor for this by
     506             :          * setting the inode number to zero before freeing the inode structure.
     507             :          * If the inode has been reallocated and set up, then the inode number
     508             :          * will not match, so check for that, too.
     509             :          */
     510 >13975*10^7 :         spin_lock(&ip->i_flags_lock);
     511 >14113*10^7 :         if (ip->i_ino != ino)
     512          10 :                 goto out_skip;
     513             : 
     514             :         /*
     515             :          * If we are racing with another cache hit that is currently
     516             :          * instantiating this inode or currently recycling it out of
     517             :          * reclaimable state, wait for the initialisation to complete
     518             :          * before continuing.
     519             :          *
     520             :          * If we're racing with the inactivation worker we also want to wait.
     521             :          * If we're creating a new file, it's possible that the worker
     522             :          * previously marked the inode as free on disk but hasn't finished
     523             :          * updating the incore state yet.  The AGI buffer will be dirty and
     524             :          * locked to the icreate transaction, so a synchronous push of the
     525             :          * inodegc workers would result in deadlock.  For a regular iget, the
     526             :          * worker is running already, so we might as well wait.
     527             :          *
     528             :          * XXX(hch): eventually we should do something equivalent to
     529             :          *           wait_on_inode to wait for these flags to be cleared
     530             :          *           instead of polling for it.
     531             :          */
     532 >14113*10^7 :         if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
     533      493465 :                 goto out_skip;
     534             : 
     535 >14113*10^7 :         if (ip->i_flags & XFS_NEED_INACTIVE) {
     536             :                 /* Unlinked inodes cannot be re-grabbed. */
     537     5038589 :                 if (VFS_I(ip)->i_nlink == 0) {
     538     5016905 :                         error = -ENOENT;
     539     5016905 :                         goto out_error;
     540             :                 }
     541       21684 :                 goto out_inodegc_flush;
     542             :         }
     543             : 
     544             :         /*
     545             :          * Check the inode free state is valid. This also detects lookup
     546             :          * racing with unlinks.
     547             :          */
     548 >14113*10^7 :         error = xfs_iget_check_free_state(ip, flags);
     549 >14003*10^7 :         if (error)
     550     2563911 :                 goto out_error;
     551             : 
     552             :         /* Skip inodes that have no vfs state. */
     553 >14003*10^7 :         if ((flags & XFS_IGET_INCORE) &&
     554           0 :             (ip->i_flags & XFS_IRECLAIMABLE))
     555           0 :                 goto out_skip;
     556             : 
     557             :         /* The inode fits the selection criteria; process it. */
     558 >14003*10^7 :         if (ip->i_flags & XFS_IRECLAIMABLE) {
     559             :                 /* Drops i_flags_lock and RCU read lock. */
     560   434641832 :                 error = xfs_iget_recycle(pag, ip);
     561   434667835 :                 if (error == -EAGAIN)
     562           4 :                         goto out_skip;
     563   434667831 :                 if (error)
     564             :                         return error;
     565             :         } else {
     566             :                 /* If the VFS inode is being torn down, pause and try again. */
     567 >13959*10^7 :                 if (!igrab(inode))
     568      167759 :                         goto out_skip;
     569             : 
     570             :                 /* We've got a live one. */
     571 >14026*10^7 :                 spin_unlock(&ip->i_flags_lock);
     572 >14045*10^7 :                 rcu_read_unlock();
     573 >14044*10^7 :                 trace_xfs_iget_hit(ip);
     574             :         }
     575             : 
     576 >13961*10^7 :         if (lock_flags != 0)
     577 81301242018 :                 xfs_ilock(ip, lock_flags);
     578             : 
     579 >14041*10^7 :         if (!(flags & XFS_IGET_INCORE))
     580 >14040*10^7 :                 xfs_iflags_clear(ip, XFS_ISTALE);
     581 >14106*10^7 :         XFS_STATS_INC(mp, xs_ig_found);
     582             : 
     583 >14053*10^7 :         return 0;
     584             : 
     585      661238 : out_skip:
     586      661238 :         trace_xfs_iget_skip(ip);
     587      661133 :         XFS_STATS_INC(mp, xs_ig_frecycle);
     588      661158 :         error = -EAGAIN;
     589     8241974 : out_error:
     590     8241974 :         spin_unlock(&ip->i_flags_lock);
     591     8242034 :         rcu_read_unlock();
     592     8242034 :         return error;
     593             : 
     594             : out_inodegc_flush:
     595       21684 :         spin_unlock(&ip->i_flags_lock);
     596       21684 :         rcu_read_unlock();
     597             :         /*
     598             :          * Do not wait for the workers, because the caller could hold an AGI
     599             :          * buffer lock.  We're just going to sleep in a loop anyway.
     600             :          */
     601       43364 :         if (xfs_is_inodegc_enabled(mp))
     602       21681 :                 xfs_inodegc_queue_all(mp);
     603             :         return -EAGAIN;
     604             : }
     605             : 
     606             : static int
     607   445077311 : xfs_iget_cache_miss(
     608             :         struct xfs_mount        *mp,
     609             :         struct xfs_perag        *pag,
     610             :         xfs_trans_t             *tp,
     611             :         xfs_ino_t               ino,
     612             :         struct xfs_inode        **ipp,
     613             :         int                     flags,
     614             :         int                     lock_flags)
     615             : {
     616   445077311 :         struct xfs_inode        *ip;
     617   445077311 :         int                     error;
     618   445077311 :         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
     619   445077311 :         int                     iflags;
     620             : 
     621   445077311 :         ip = xfs_inode_alloc(mp, ino);
     622   445508052 :         if (!ip)
     623             :                 return -ENOMEM;
     624             : 
     625   445508052 :         error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
     626   445512442 :         if (error)
     627      114707 :                 goto out_destroy;
     628             : 
     629             :         /*
     630             :          * For version 5 superblocks, if we are initialising a new inode and we
     631             :          * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
     632             :          * simply build the new inode core with a random generation number.
     633             :          *
     634             :          * For version 4 (and older) superblocks, log recovery is dependent on
     635             :          * the i_flushiter field being initialised from the current on-disk
     636             :          * value and hence we must also read the inode off disk even when
     637             :          * initializing new inodes.
     638             :          */
     639   445397735 :         if (xfs_has_v3inodes(mp) &&
     640   445395462 :             (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
     641    69036331 :                 VFS_I(ip)->i_generation = get_random_u32();
     642             :         } else {
     643   376361404 :                 struct xfs_buf          *bp;
     644             : 
     645   376361404 :                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
     646   376243895 :                 if (error)
     647        6433 :                         goto out_destroy;
     648             : 
     649   376225925 :                 error = xfs_inode_from_disk(ip,
     650   376238566 :                                 xfs_buf_offset(bp, ip->i_imap.im_boffset));
     651   376219736 :                 if (!error)
     652   376218632 :                         xfs_buf_set_ref(bp, XFS_INO_REF);
     653             :                 else
     654        1104 :                         xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
     655   376225861 :                 xfs_trans_brelse(tp, bp);
     656             : 
     657   376248386 :                 if (error)
     658        1104 :                         goto out_destroy;
     659             :         }
     660             : 
     661   445354252 :         trace_xfs_iget_miss(ip);
     662             : 
     663             :         /*
     664             :          * Check the inode free state is valid. This also detects lookup
     665             :          * racing with unlinks.
     666             :          */
     667   445116378 :         error = xfs_iget_check_free_state(ip, flags);
     668   445030023 :         if (error)
     669           0 :                 goto out_destroy;
     670             : 
     671             :         /*
     672             :          * Preload the radix tree so we can insert safely under the
     673             :          * write spinlock. Note that we cannot sleep inside the preload
     674             :          * region. Since we can be called from transaction context, don't
     675             :          * recurse into the file system.
     676             :          */
     677   445030023 :         if (radix_tree_preload(GFP_NOFS)) {
     678           0 :                 error = -EAGAIN;
     679           0 :                 goto out_destroy;
     680             :         }
     681             : 
     682             :         /*
     683             :          * Because the inode hasn't been added to the radix-tree yet it can't
     684             :          * be found by another thread, so we can do the non-sleeping lock here.
     685             :          */
     686   445259179 :         if (lock_flags) {
     687   419191816 :                 if (!xfs_ilock_nowait(ip, lock_flags))
     688           0 :                         BUG();
     689             :         }
     690             : 
     691             :         /*
     692             :          * These values must be set before inserting the inode into the radix
     693             :          * tree as the moment it is inserted a concurrent lookup (allowed by the
     694             :          * RCU locking mechanism) can find it and that lookup must see that this
     695             :          * is an inode currently under construction (i.e. that XFS_INEW is set).
     696             :          * The ip->i_flags_lock that protects the XFS_INEW flag forms the
     697             :          * memory barrier that ensures this detection works correctly at lookup
     698             :          * time.
     699             :          */
     700   445290059 :         iflags = XFS_INEW;
     701   445290059 :         if (flags & XFS_IGET_DONTCACHE)
     702   350317232 :                 d_mark_dontcache(VFS_I(ip));
     703   445292888 :         ip->i_udquot = NULL;
     704   445292888 :         ip->i_gdquot = NULL;
     705   445292888 :         ip->i_pdquot = NULL;
     706   445292888 :         xfs_iflags_set(ip, iflags);
     707             : 
     708             :         /* insert the new inode */
     709   445588571 :         spin_lock(&pag->pag_ici_lock);
     710   445632400 :         error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
     711   445506420 :         if (unlikely(error)) {
     712      695207 :                 WARN_ON(error != -EEXIST);
     713      695207 :                 XFS_STATS_INC(mp, xs_ig_dup);
     714      695207 :                 error = -EAGAIN;
     715      695207 :                 goto out_preload_end;
     716             :         }
     717   444811213 :         spin_unlock(&pag->pag_ici_lock);
     718   444918808 :         radix_tree_preload_end();
     719             : 
     720   444300664 :         *ipp = ip;
     721   444300664 :         return 0;
     722             : 
     723             : out_preload_end:
     724      695207 :         spin_unlock(&pag->pag_ici_lock);
     725      695207 :         radix_tree_preload_end();
     726      695207 :         if (lock_flags)
     727      689937 :                 xfs_iunlock(ip, lock_flags);
     728        5270 : out_destroy:
     729      816345 :         __destroy_inode(VFS_I(ip));
     730      816344 :         xfs_inode_free(ip);
     731      816344 :         return error;
     732             : }
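
Note the -EEXIST handling at out_preload_end: if radix_tree_insert() finds the slot already populated, another thread won the race to instantiate the same inode, so the miss path converts the error to -EAGAIN and xfs_iget() retries the lookup, which will then find the winner's inode in the cache.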
     733             : 
     734             : /*
     735             :  * Look up an inode by number in the given file system.  The inode is looked up
     736             :  * in the cache held in each AG.  If the inode is found in the cache, initialise
     737             :  * the vfs inode if necessary.
     738             :  *
     739             :  * If it is not in core, read it in from the file system's device, add it to the
     740             :  * cache and initialise the vfs inode.
     741             :  *
     742             :  * The inode is locked according to the value of the lock_flags parameter.
     743             :  * Inode lookup is only done during metadata operations and not as part of the
     744             :  * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
     745             :  */
     746             : int
     747 >14131*10^7 : xfs_iget(
     748             :         struct xfs_mount        *mp,
     749             :         struct xfs_trans        *tp,
     750             :         xfs_ino_t               ino,
     751             :         uint                    flags,
     752             :         uint                    lock_flags,
     753             :         struct xfs_inode        **ipp)
     754             : {
     755 >14131*10^7 :         struct xfs_inode        *ip;
     756 >14131*10^7 :         struct xfs_perag        *pag;
     757 >14131*10^7 :         xfs_agino_t             agino;
     758 >14131*10^7 :         int                     error;
     759             : 
     760 >14131*10^7 :         ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
     761             : 
     762             :         /* reject inode numbers outside existing AGs */
     763 >14131*10^7 :         if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
     764     1603482 :                 return -EINVAL;
     765             : 
     766 >14131*10^7 :         XFS_STATS_INC(mp, xs_ig_attempts);
     767             : 
     768             :         /* get the perag structure and ensure that it's inode capable */
     769 >13925*10^7 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
     770 >14121*10^7 :         agino = XFS_INO_TO_AGINO(mp, ino);
     771             : 
     772 >14121*10^7 : again:
     773 >14121*10^7 :         error = 0;
     774 >14121*10^7 :         rcu_read_lock();
     775 >14091*10^7 :         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
     776             : 
     777 >14085*10^7 :         if (ip) {
     778 >14041*10^7 :                 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
     779 >13983*10^7 :                 if (error)
     780     8263559 :                         goto out_error_or_again;
     781             :         } else {
     782   445429412 :                 rcu_read_unlock();
     783   445442774 :                 if (flags & XFS_IGET_INCORE) {
     784           0 :                         error = -ENODATA;
     785           0 :                         goto out_error_or_again;
     786             :                 }
     787   445442774 :                 XFS_STATS_INC(mp, xs_ig_missed);
     788             : 
     789   445064706 :                 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
     790             :                                                         flags, lock_flags);
     791   445059621 :                 if (error)
     792      816343 :                         goto out_error_or_again;
     793             :         }
     794 >14027*10^7 :         xfs_perag_put(pag);
     795             : 
     796 >14133*10^7 :         *ipp = ip;
     797             : 
     798             :         /*
     799             :          * If we have a real type for an on-disk inode, we can setup the inode
     800             :          * now.  If it's a new inode being created, xfs_init_new_inode will
     801             :          * handle it.
     802             :          */
     803 >28259*10^7 :         if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
     804   755186052 :                 xfs_setup_existing_inode(ip);
     805             :         return 0;
     806             : 
     807     9079902 : out_error_or_again:
     808     9079902 :         if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
     809             :             error == -EAGAIN) {
     810     1272362 :                 delay(1);
     811     1265566 :                 goto again;
     812             :         }
     813     7807540 :         xfs_perag_put(pag);
     814     7807540 :         return error;
     815             : }
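
A sketch of a typical caller, inferred from the signature and locking rules above (illustrative only, not lifted from XFS):

    struct xfs_inode        *ip;
    int                     error;

    error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
    if (error)
            return error;   /* e.g. -EINVAL, -ENOENT, -EFSCORRUPTED */

    /* ... ip is referenced and holds XFS_ILOCK_SHARED here ... */

    xfs_iunlock(ip, XFS_ILOCK_SHARED);
    xfs_irele(ip);          /* drop the reference xfs_iget() took */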
     816             : 
     817             : /* Get a metadata inode.  The ftype must match exactly. */
     818             : int
     819     1578493 : xfs_imeta_iget(
     820             :         struct xfs_mount        *mp,
     821             :         xfs_ino_t               ino,
     822             :         unsigned char           ftype,
     823             :         struct xfs_inode        **ipp)
     824             : {
     825     1578493 :         struct xfs_inode        *ip;
     826     1578493 :         int                     error;
     827             : 
     828     1578493 :         ASSERT(ftype != XFS_DIR3_FT_UNKNOWN);
     829             : 
     830     1578493 :         error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
     831     1578493 :         if (error == -EFSCORRUPTED)
     832          21 :                 goto whine;
     833     1578472 :         if (error)
     834             :                 return error;
     835             : 
     836     1578472 :         if (VFS_I(ip)->i_nlink == 0)
     837           0 :                 goto bad_rele;
     838     1578472 :         if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
     839           0 :                 goto bad_rele;
     840     1578472 :         if (xfs_has_metadir(mp) && !xfs_is_metadir_inode(ip))
     841           4 :                 goto bad_rele;
     842             : 
     843     1578468 :         *ipp = ip;
     844     1578468 :         return 0;
     845           4 : bad_rele:
     846           4 :         xfs_irele(ip);
     847          25 : whine:
     848          25 :         xfs_err(mp, "metadata inode 0x%llx is corrupt", ino);
     849          25 :         xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
     850          25 :         return -EFSCORRUPTED;
     851             : }
     852             : 
     853             : /*
     854             :  * Grab the inode for reclaim exclusively.
     855             :  *
     856             :  * We have found this inode via a lookup under RCU, so the inode may have
     857             :  * already been freed, or it may be in the process of being recycled by
     858             :  * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
     859             :  * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
     860             :  * will not be set. Hence we need to check for both these flag conditions to
     861             :  * avoid inodes that are no longer reclaim candidates.
     862             :  *
     863             :  * Note: checking for other state flags here, under the i_flags_lock or not, is
     864             :  * racy and should be avoided. Those races should be resolved only after we have
     865             :  * ensured that we are able to reclaim this inode and the world can see that we
     866             :  * are going to reclaim it.
     867             :  *
     868             :  * Return true if we grabbed it, false otherwise.
     869             :  */
     870             : static bool
     871   474328903 : xfs_reclaim_igrab(
     872             :         struct xfs_inode        *ip,
     873             :         struct xfs_icwalk       *icw)
     874             : {
     875   474328903 :         ASSERT(rcu_read_lock_held());
     876             : 
     877   474328903 :         spin_lock(&ip->i_flags_lock);
     878   474329080 :         if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
     879             :             __xfs_iflags_test(ip, XFS_IRECLAIM)) {
     880             :                 /* not a reclaim candidate. */
     881        2281 :                 spin_unlock(&ip->i_flags_lock);
     882        2281 :                 return false;
     883             :         }
     884             : 
     885             :         /* Don't reclaim a sick inode unless the caller asked for it. */
     886   474326799 :         if (ip->i_sick &&
     887       27908 :             (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
     888           0 :                 spin_unlock(&ip->i_flags_lock);
     889           0 :                 return false;
     890             :         }
     891             : 
     892   474326799 :         __xfs_iflags_set(ip, XFS_IRECLAIM);
     893   474326799 :         spin_unlock(&ip->i_flags_lock);
     894   474326799 :         return true;
     895             : }
     896             : 
     897             : /*
     898             :  * Inode reclaim is non-blocking, so the default action if progress cannot be
     899             :  * made is to "requeue" the inode for reclaim by unlocking it and clearing the
     900             :  * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
     901             :  * blocking anymore and hence we can wait for the inode to be able to reclaim
     902             :  * it.
     903             :  *
     904             :  * We do no IO here - if callers require inodes to be cleaned they must push the
     905             :  * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
     906             :  * done in the background in a non-blocking manner, and enables memory reclaim
     907             :  * to make progress without blocking.
     908             :  */
     909             : static void
     910   474326811 : xfs_reclaim_inode(
     911             :         struct xfs_inode        *ip,
     912             :         struct xfs_perag        *pag)
     913             : {
     914   474326811 :         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
     915             : 
     916   474326811 :         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
     917        7107 :                 goto out;
     918   474319583 :         if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
     919     8731308 :                 goto out_iunlock;
     920             : 
     921             :         /*
     922             :          * Check for log shutdown because aborting the inode can move the log
     923             :          * tail and corrupt in memory state. This is fine if the log is shut
     924             :          * down, but if the log is still active and only the mount is shut down
     925             :          * then the in-memory log tail movement caused by the abort can be
     926             :          * incorrectly propagated to disk.
     927             :          */
     928   931176724 :         if (xlog_is_shutdown(ip->i_mount->m_log)) {
     929   325411208 :                 xfs_iunpin_wait(ip);
     930   325411207 :                 xfs_iflush_shutdown_abort(ip);
     931   325411208 :                 goto reclaim;
     932             :         }
     933   140177154 :         if (xfs_ipincount(ip))
     934     7355521 :                 goto out_clear_flush;
     935   132821633 :         if (!xfs_inode_clean(ip))
     936    13127200 :                 goto out_clear_flush;
     937             : 
     938   119694433 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
     939   445105660 : reclaim:
     940   445105660 :         trace_xfs_inode_reclaiming(ip);
     941             : 
     942             :         /*
     943             :          * Because we use RCU freeing we need to ensure the inode always appears
     944             :          * to be reclaimed with an invalid inode number when in the free state.
     945             :          * We do this as early as possible under the ILOCK so that
     946             :          * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
     947             :          * detect races with us here. By doing this, we guarantee that once
     948             :          * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
     949             :          * it will see either a valid inode that will serialise correctly, or it
     950             :          * will see an invalid inode that it can skip.
     951             :          */
     952   445105627 :         spin_lock(&ip->i_flags_lock);
     953   445105693 :         ip->i_flags = XFS_IRECLAIM;
     954   445105693 :         ip->i_ino = 0;
     955   445105693 :         ip->i_sick = 0;
     956   445105693 :         ip->i_checked = 0;
     957   445105693 :         spin_unlock(&ip->i_flags_lock);
     958             : 
     959   445105720 :         ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
     960   445105720 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     961             : 
     962   445105676 :         XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
     963             :         /*
     964             :          * Remove the inode from the per-AG radix tree.
     965             :          *
     966             :          * Because radix_tree_delete won't complain even if the item was never
     967             :          * added to the tree assert that it's been there before to catch
     968             :          * problems with the inode life time early on.
     969             :          */
     970   445105675 :         spin_lock(&pag->pag_ici_lock);
     971   890211468 :         if (!xfs_is_shutdown(pag->pag_mount)) {
     972             :                 /* had better not be on any unlinked list! */
     973   119694527 :                 ASSERT(!xfs_inode_on_unlinked_list(ip));
     974   119694527 :                 if (xfs_inode_on_unlinked_list(ip))
     975           0 :                         xfs_emerg(pag->pag_mount, "IUNLINK ino 0x%llx nlink %u mode 0o%o prevun 0x%x nextun 0x%x", ino, VFS_I(ip)->i_nlink, VFS_I(ip)->i_mode, ip->i_prev_unlinked, ip->i_next_unlinked);
     976             :         }
     977   445105714 :         if (!radix_tree_delete(&pag->pag_ici_root,
     978   445105734 :                                 XFS_INO_TO_AGINO(ip->i_mount, ino)))
     979           0 :                 ASSERT(0);
     980   445105714 :         xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
     981   445105623 :         spin_unlock(&pag->pag_ici_lock);
     982             : 
     983             :         /*
     984             :          * Here we do an (almost) spurious inode lock in order to coordinate
     985             :          * with inode cache radix tree lookups.  This is because the lookup
     986             :          * can reference the inodes in the cache without taking references.
     987             :          *
     988             :          * We make that OK here by ensuring that we wait until the inode is
     989             :          * unlocked after the lookup before we go ahead and free it.
     990             :          */
     991   445105684 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     992   445105708 :         ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
     993   445105708 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     994   529262387 :         ASSERT(xfs_inode_clean(ip));
     995             : 
     996   445105653 :         __xfs_inode_free(ip);
     997   445105653 :         return;
     998             : 
     999    20482721 : out_clear_flush:
    1000    20482721 :         xfs_iflags_clear(ip, XFS_IFLUSHING);
    1001    29214029 : out_iunlock:
    1002    29214029 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    1003    29221135 : out:
    1004    29221135 :         xfs_iflags_clear(ip, XFS_IRECLAIM);
    1005             : }
    1006             : 
    1007             : /* Reclaim sick inodes if we're unmounting or the fs went down. */
    1008             : static inline bool
    1009      106910 : xfs_want_reclaim_sick(
    1010             :         struct xfs_mount        *mp)
    1011             : {
    1012      253905 :         return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
    1013             :                xfs_is_shutdown(mp);
    1014             : }
    1015             : 
    1016             : void
    1017       66825 : xfs_reclaim_inodes(
    1018             :         struct xfs_mount        *mp)
    1019             : {
    1020       66825 :         struct xfs_icwalk       icw = {
    1021             :                 .icw_flags      = 0,
    1022             :         };
    1023             : 
    1024       66825 :         if (xfs_want_reclaim_sick(mp))
    1025       66825 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
    1026             : 
    1027      397328 :         while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
    1028      330503 :                 xfs_ail_push_all_sync(mp->m_ail);
    1029      330503 :                 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
    1030             :         }
    1031       66825 : }
    1032             : 
    1033             : /*
    1034             :  * The shrinker infrastructure determines how many inodes we should scan for
    1035             :  * reclaim. We want as many clean inodes ready to reclaim as possible, so we
    1036             :  * push the AIL here. We also want to proactively free up memory if we can to
    1037             :  * minimise the amount of work memory reclaim has to do so we kick the
    1038             :  * background reclaim if it isn't already scheduled.
    1039             :  */
    1040             : long
    1041       40085 : xfs_reclaim_inodes_nr(
    1042             :         struct xfs_mount        *mp,
    1043             :         unsigned long           nr_to_scan)
    1044             : {
    1045       40085 :         struct xfs_icwalk       icw = {
    1046             :                 .icw_flags      = XFS_ICWALK_FLAG_SCAN_LIMIT,
    1047       40085 :                 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
    1048             :         };
    1049             : 
    1050       40085 :         if (xfs_want_reclaim_sick(mp))
    1051           9 :                 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
    1052             : 
    1053             :         /* kick background reclaimer and push the AIL */
    1054       40085 :         xfs_reclaim_work_queue(mp);
    1055       40085 :         xfs_ail_push_all(mp->m_ail);
    1056             : 
    1057       40085 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
    1058       40085 :         return 0;
    1059             : }
    1060             : 
    1061             : /*
    1062             :  * Return the number of reclaimable inodes in the filesystem for
    1063             :  * the shrinker to determine how much to reclaim.
    1064             :  */
    1065             : long
    1066      386465 : xfs_reclaim_inodes_count(
    1067             :         struct xfs_mount        *mp)
    1068             : {
    1069      386465 :         struct xfs_perag        *pag;
    1070      386465 :         xfs_agnumber_t          ag = 0;
    1071      386465 :         long                    reclaimable = 0;
    1072             : 
    1073      807449 :         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
    1074      420984 :                 ag = pag->pag_agno + 1;
    1075      420984 :                 reclaimable += pag->pag_ici_reclaimable;
    1076      420984 :                 xfs_perag_put(pag);
    1077             :         }
    1078      386465 :         return reclaimable;
    1079             : }
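             : 
             : /*
             :  * Illustrative sketch, not part of xfs_icache.c: the two entry points
             :  * above are typically wired into the VFS superblock shrinker.  This is
             :  * modelled on the xfs_fs_nr_cached_objects()/xfs_fs_free_cached_objects()
             :  * hooks in fs/xfs/xfs_super.c; treat the details as assumptions rather
             :  * than a quote of that file.
             :  */
             : static long
             : example_nr_cached_objects(
             :         struct super_block      *sb,
             :         struct shrink_control   *sc)
             : {
             :         /* tell the shrinker how many inodes are waiting for reclaim */
             :         return xfs_reclaim_inodes_count(XFS_M(sb));
             : }
             : 
             : static long
             : example_free_cached_objects(
             :         struct super_block      *sb,
             :         struct shrink_control   *sc)
             : {
             :         /* sc->nr_to_scan is derived from the count reported above */
             :         return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
             : }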
    1080             : 
    1081             : STATIC bool
    1082     1908811 : xfs_icwalk_match_id(
    1083             :         struct xfs_inode        *ip,
    1084             :         struct xfs_icwalk       *icw)
    1085             : {
    1086     1908811 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1087             :             !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1088             :                 return false;
    1089             : 
    1090     1908811 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1091             :             !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1092             :                 return false;
    1093             : 
    1094     1908811 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1095           0 :             ip->i_projid != icw->icw_prid)
    1096           0 :                 return false;
    1097             : 
    1098             :         return true;
    1099             : }
    1100             : 
    1101             : /*
    1102             :  * A union-based inode filtering algorithm. Process the inode if any of the
    1103             :  * criteria match. This is for global/internal scans only.
    1104             :  */
    1105             : STATIC bool
    1106        3613 : xfs_icwalk_match_id_union(
    1107             :         struct xfs_inode        *ip,
    1108             :         struct xfs_icwalk       *icw)
    1109             : {
    1110        3613 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
    1111             :             uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
    1112             :                 return true;
    1113             : 
    1114         804 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
    1115             :             gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
    1116             :                 return true;
    1117             : 
    1118           0 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
    1119           0 :             ip->i_projid == icw->icw_prid)
    1120           0 :                 return true;
    1121             : 
    1122             :         return false;
    1123             : }
    1124             : 
    1125             : /*
    1126             :  * Is this inode @ip eligible for eof/cow block reclamation, given some
    1127             :  * filtering parameters @icw?  The inode is eligible if @icw is null or
    1128             :  * if the predicate functions match.
    1129             :  */
    1130             : static bool
    1131     4401598 : xfs_icwalk_match(
    1132             :         struct xfs_inode        *ip,
    1133             :         struct xfs_icwalk       *icw)
    1134             : {
    1135     4401598 :         bool                    match;
    1136             : 
    1137     4401598 :         if (!icw)
    1138             :                 return true;
    1139             : 
    1140     1901001 :         if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
    1141        3612 :                 match = xfs_icwalk_match_id_union(ip, icw);
    1142             :         else
    1143     1897389 :                 match = xfs_icwalk_match_id(ip, icw);
    1144     1901001 :         if (!match)
    1145             :                 return false;
    1146             : 
    1147             :         /* skip the inode if the file size is too small */
    1148     1896569 :         if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
    1149           0 :             XFS_ISIZE(ip) < icw->icw_min_file_size)
    1150           0 :                 return false;
    1151             : 
    1152             :         return true;
    1153             : }
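             : 
             : /*
             :  * Usage sketch (hypothetical helper, not in this file): callers restrict
             :  * a scan by filling out a struct xfs_icwalk.  Here, a synchronous
             :  * blockgc pass over a single user's inodes; xfs_icwalk_match() then
             :  * skips every inode not owned by @uid.
             :  */
             : static int
             : example_blockgc_one_user(
             :         struct xfs_mount        *mp,
             :         kuid_t                  uid)
             : {
             :         struct xfs_icwalk       icw = {
             :                 .icw_flags      = XFS_ICWALK_FLAG_UID | XFS_ICWALK_FLAG_SYNC,
             :                 .icw_uid        = uid,
             :         };
             : 
             :         return xfs_blockgc_free_space(mp, &icw);
             : }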
    1154             : 
    1155             : /*
    1156             :  * This is a fast pass over the inode cache to try to get reclaim moving on as
    1157             :  * many inodes as possible in a short period of time. It kicks itself every few
    1158             :  * seconds, as well as being kicked by the inode cache shrinker when memory
    1159             :  * goes low.
    1160             :  */
    1161             : void
    1162       83906 : xfs_reclaim_worker(
    1163             :         struct work_struct *work)
    1164             : {
    1165       83906 :         struct xfs_mount *mp = container_of(to_delayed_work(work),
    1166             :                                         struct xfs_mount, m_reclaim_work);
    1167             : 
    1168       83906 :         xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
    1169       83906 :         xfs_reclaim_work_queue(mp);
    1170       83906 : }
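             : 
             : /*
             :  * The worker above is an instance of the common self-rearming
             :  * delayed-work idiom.  A generic sketch with hypothetical names; setup
             :  * would be INIT_DELAYED_WORK(&s->work, example_scan_worker) followed by
             :  * one initial schedule_delayed_work() call.
             :  */
             : #include <linux/workqueue.h>
             : 
             : struct example_scanner {
             :         struct delayed_work     work;
             :         unsigned long           interval;       /* jiffies between passes */
             : };
             : 
             : static void
             : example_scan_worker(
             :         struct work_struct      *work)
             : {
             :         struct example_scanner  *s = container_of(to_delayed_work(work),
             :                                         struct example_scanner, work);
             : 
             :         /* ... one short scan pass goes here ... */
             : 
             :         /* rearm so the scan keeps ticking even if nobody kicks it */
             :         schedule_delayed_work(&s->work, s->interval);
             : }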
    1171             : 
    1172             : STATIC int
    1173    35882413 : xfs_inode_free_eofblocks(
    1174             :         struct xfs_inode        *ip,
    1175             :         struct xfs_icwalk       *icw,
    1176             :         unsigned int            *lockflags)
    1177             : {
    1178    35882413 :         bool                    wait;
    1179             : 
    1180    35882413 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1181             : 
    1182    72075148 :         if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
    1183             :                 return 0;
    1184             : 
    1185             :         /*
     1186             :          * If the mapping is dirty, the operation can block and wait for some
    1187             :          * time. Unless we are waiting, skip it.
    1188             :          */
    1189     7619008 :         if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
    1190             :                 return 0;
    1191             : 
    1192     2821475 :         if (!xfs_icwalk_match(ip, icw))
    1193             :                 return 0;
    1194             : 
    1195             :         /*
    1196             :          * If the caller is waiting, return -EAGAIN to keep the background
    1197             :          * scanner moving and revisit the inode in a subsequent pass.
    1198             :          */
    1199     2821475 :         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1200     1405016 :                 if (wait)
    1201             :                         return -EAGAIN;
    1202      190954 :                 return 0;
    1203             :         }
    1204     1445445 :         *lockflags |= XFS_IOLOCK_EXCL;
    1205             : 
    1206     1445445 :         if (xfs_can_free_eofblocks(ip, false))
    1207      593709 :                 return xfs_free_eofblocks(ip);
    1208             : 
    1209             :         /* inode could be preallocated or append-only */
    1210      851631 :         trace_xfs_inode_free_eofblocks_invalid(ip);
    1211      851582 :         xfs_inode_clear_eofblocks_tag(ip);
    1212      851582 :         return 0;
    1213             : }
    1214             : 
    1215             : static void
    1216    13934396 : xfs_blockgc_set_iflag(
    1217             :         struct xfs_inode        *ip,
    1218             :         unsigned long           iflag)
    1219             : {
    1220    13934396 :         struct xfs_mount        *mp = ip->i_mount;
    1221    13934396 :         struct xfs_perag        *pag;
    1222             : 
    1223    13934396 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1224             : 
    1225             :         /*
    1226             :          * Don't bother locking the AG and looking up in the radix trees
    1227             :          * if we already know that we have the tag set.
    1228             :          */
    1229    13934396 :         if (ip->i_flags & iflag)
    1230             :                 return;
    1231     6455625 :         spin_lock(&ip->i_flags_lock);
    1232     6466738 :         ip->i_flags |= iflag;
    1233     6466738 :         spin_unlock(&ip->i_flags_lock);
    1234             : 
    1235     6465248 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1236     6475478 :         spin_lock(&pag->pag_ici_lock);
    1237             : 
    1238     6475161 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1239             :                         XFS_ICI_BLOCKGC_TAG);
    1240             : 
    1241     6463299 :         spin_unlock(&pag->pag_ici_lock);
    1242     6461622 :         xfs_perag_put(pag);
    1243             : }
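             : 
             : /*
             :  * Sketch of the primitive underneath xfs_perag_set_inode_tag() (generic
             :  * radix tree API, hypothetical caller): tagging a slot under the tree
             :  * lock lets later walks visit only the slots of interest, and
             :  * radix_tree_tagged() answers "is anything tagged?" without a walk.
             :  */
             : #include <linux/radix-tree.h>
             : #include <linux/spinlock.h>
             : 
             : static void
             : example_tag_slot(
             :         struct radix_tree_root  *root,
             :         spinlock_t              *lock,
             :         unsigned long           index,
             :         unsigned int            tag)
             : {
             :         spin_lock(lock);
             :         radix_tree_tag_set(root, index, tag);
             :         spin_unlock(lock);
             : }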
    1244             : 
    1245             : void
    1246     6415246 : xfs_inode_set_eofblocks_tag(
    1247             :         xfs_inode_t     *ip)
    1248             : {
    1249     6415246 :         trace_xfs_inode_set_eofblocks_tag(ip);
    1250     6414973 :         return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
    1251             : }
    1252             : 
    1253             : static void
    1254    32750332 : xfs_blockgc_clear_iflag(
    1255             :         struct xfs_inode        *ip,
    1256             :         unsigned long           iflag)
    1257             : {
    1258    32750332 :         struct xfs_mount        *mp = ip->i_mount;
    1259    32750332 :         struct xfs_perag        *pag;
    1260    32750332 :         bool                    clear_tag;
    1261             : 
    1262    32750332 :         ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
    1263             : 
    1264    32750332 :         spin_lock(&ip->i_flags_lock);
    1265    32832685 :         ip->i_flags &= ~iflag;
    1266    32832685 :         clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
    1267    32832685 :         spin_unlock(&ip->i_flags_lock);
    1268             : 
    1269    32823605 :         if (!clear_tag)
    1270             :                 return;
    1271             : 
    1272    25224724 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1273    25243794 :         spin_lock(&pag->pag_ici_lock);
    1274             : 
    1275    25244528 :         xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1276             :                         XFS_ICI_BLOCKGC_TAG);
    1277             : 
    1278    25224399 :         spin_unlock(&pag->pag_ici_lock);
    1279    25253547 :         xfs_perag_put(pag);
    1280             : }
    1281             : 
    1282             : void
    1283    22135972 : xfs_inode_clear_eofblocks_tag(
    1284             :         xfs_inode_t     *ip)
    1285             : {
    1286    22135972 :         trace_xfs_inode_clear_eofblocks_tag(ip);
    1287    22078193 :         return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
    1288             : }
    1289             : 
    1290             : /*
    1291             :  * Set ourselves up to free CoW blocks from this file.  If it's already clean
    1292             :  * then we can bail out quickly, but otherwise we must back off if the file
    1293             :  * is undergoing some kind of write.
    1294             :  */
    1295             : static bool
    1296    29500476 : xfs_prep_free_cowblocks(
    1297             :         struct xfs_inode        *ip)
    1298             : {
    1299             :         /*
    1300             :          * Just clear the tag if we have an empty cow fork or none at all. It's
    1301             :          * possible the inode was fully unshared since it was originally tagged.
    1302             :          */
    1303    59000952 :         if (!xfs_inode_has_cow_data(ip)) {
    1304      282923 :                 trace_xfs_inode_free_cowblocks_invalid(ip);
    1305      282493 :                 xfs_inode_clear_cowblocks_tag(ip);
    1306      282493 :                 return false;
    1307             :         }
    1308             : 
    1309             :         /*
     1310             :          * If the mapping is dirty or under writeback, we cannot touch the
    1311             :          * CoW fork.  Leave it alone if we're in the midst of a directio.
    1312             :          */
    1313    32379226 :         if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
    1314     6322948 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
    1315     5646893 :             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
    1316             :             atomic_read(&VFS_I(ip)->i_dio_count))
    1317    26735202 :                 return false;
    1318             : 
    1319             :         return true;
    1320             : }
    1321             : 
    1322             : /*
    1323             :  * Automatic CoW Reservation Freeing
    1324             :  *
    1325             :  * These functions automatically garbage collect leftover CoW reservations
    1326             :  * that were made on behalf of a cowextsize hint when we start to run out
    1327             :  * of quota or when the reservations sit around for too long.  If the file
    1328             :  * has dirty pages or is undergoing writeback, its CoW reservations will
    1329             :  * be retained.
    1330             :  *
    1331             :  * The actual garbage collection piggybacks off the same code that runs
    1332             :  * the speculative EOF preallocation garbage collector.
    1333             :  */
    1334             : STATIC int
    1335    34845248 : xfs_inode_free_cowblocks(
    1336             :         struct xfs_inode        *ip,
    1337             :         struct xfs_icwalk       *icw,
    1338             :         unsigned int            *lockflags)
    1339             : {
    1340    34845248 :         bool                    wait;
    1341    34845248 :         int                     ret = 0;
    1342             : 
    1343    34845248 :         wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
    1344             : 
    1345    69950333 :         if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
    1346             :                 return 0;
    1347             : 
    1348    28643059 :         if (!xfs_prep_free_cowblocks(ip))
    1349             :                 return 0;
    1350             : 
    1351     1556144 :         if (!xfs_icwalk_match(ip, icw))
    1352             :                 return 0;
    1353             : 
    1354             :         /*
    1355             :          * If the caller is waiting, return -EAGAIN to keep the background
    1356             :          * scanner moving and revisit the inode in a subsequent pass.
    1357             :          */
    1358     3058908 :         if (!(*lockflags & XFS_IOLOCK_EXCL) &&
    1359     1501612 :             !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
    1360      630801 :                 if (wait)
    1361             :                         return -EAGAIN;
    1362      111793 :                 return 0;
    1363             :         }
    1364      926495 :         *lockflags |= XFS_IOLOCK_EXCL;
    1365             : 
    1366      926495 :         if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
    1367          39 :                 if (wait)
    1368             :                         return -EAGAIN;
    1369          37 :                 return 0;
    1370             :         }
    1371      926648 :         *lockflags |= XFS_MMAPLOCK_EXCL;
    1372             : 
    1373             :         /*
    1374             :          * Check again, nobody else should be able to dirty blocks or change
    1375             :          * the reflink iflag now that we have the first two locks held.
    1376             :          */
    1377      926648 :         if (xfs_prep_free_cowblocks(ip))
    1378      926589 :                 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
    1379             :         return ret;
    1380             : }
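             : 
             : /*
             :  * The function above follows a common shape: cheap unlocked check,
             :  * nonblocking lock attempts, then a recheck before acting.  A generic
             :  * sketch with hypothetical types and helpers:
             :  */
             : #include <linux/mutex.h>
             : 
             : struct example_obj {
             :         struct mutex            lock;
             : };
             : 
             : bool example_has_garbage(struct example_obj *o);        /* hypothetical */
             : int example_free_garbage(struct example_obj *o);        /* hypothetical */
             : 
             : static int
             : example_try_gc(
             :         struct example_obj      *o,
             :         bool                    wait)
             : {
             :         int                     error = 0;
             : 
             :         if (!example_has_garbage(o))    /* unlocked fast path */
             :                 return 0;
             :         if (!mutex_trylock(&o->lock))   /* never block a background scan */
             :                 return wait ? -EAGAIN : 0;
             :         if (example_has_garbage(o))     /* state may have changed meanwhile */
             :                 error = example_free_garbage(o);
             :         mutex_unlock(&o->lock);
             :         return error;
             : }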
    1381             : 
    1382             : void
    1383     7523392 : xfs_inode_set_cowblocks_tag(
    1384             :         xfs_inode_t     *ip)
    1385             : {
    1386     7523392 :         trace_xfs_inode_set_cowblocks_tag(ip);
    1387     7518101 :         return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
    1388             : }
    1389             : 
    1390             : void
    1391    10654684 : xfs_inode_clear_cowblocks_tag(
    1392             :         xfs_inode_t     *ip)
    1393             : {
    1394    10654684 :         trace_xfs_inode_clear_cowblocks_tag(ip);
    1395    10652670 :         return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
    1396             : }
    1397             : 
    1398             : /* Disable post-EOF and CoW block auto-reclamation. */
    1399             : void
    1400      136724 : xfs_blockgc_stop(
    1401             :         struct xfs_mount        *mp)
    1402             : {
    1403      136724 :         struct xfs_perag        *pag;
    1404      136724 :         xfs_agnumber_t          agno;
    1405             : 
    1406      136724 :         if (!xfs_clear_blockgc_enabled(mp))
    1407          71 :                 return;
    1408             : 
    1409      977439 :         for_each_perag(mp, agno, pag)
    1410      840786 :                 cancel_delayed_work_sync(&pag->pag_blockgc_work);
    1411      136653 :         trace_xfs_blockgc_stop(mp, __return_address);
    1412             : }
    1413             : 
    1414             : /* Enable post-EOF and CoW block auto-reclamation. */
    1415             : void
    1416      136842 : xfs_blockgc_start(
    1417             :         struct xfs_mount        *mp)
    1418             : {
    1419      136842 :         struct xfs_perag        *pag;
    1420      136842 :         xfs_agnumber_t          agno;
    1421             : 
    1422      136842 :         if (xfs_set_blockgc_enabled(mp))
    1423             :                 return;
    1424             : 
    1425      136820 :         trace_xfs_blockgc_start(mp, __return_address);
    1426      351662 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1427      214842 :                 xfs_blockgc_queue(pag);
    1428             : }
    1429             : 
    1430             : /* Don't try to run block gc on an inode that's in any of these states. */
    1431             : #define XFS_BLOCKGC_NOGRAB_IFLAGS       (XFS_INEW | \
    1432             :                                          XFS_NEED_INACTIVE | \
    1433             :                                          XFS_INACTIVATING | \
    1434             :                                          XFS_IRECLAIMABLE | \
    1435             :                                          XFS_IRECLAIM)
    1436             : /*
    1437             :  * Decide if the given @ip is eligible for garbage collection of speculative
    1438             :  * preallocations, and grab it if so.  Returns true if it's ready to go or
    1439             :  * false if we should just ignore it.
    1440             :  */
    1441             : static bool
    1442    36461558 : xfs_blockgc_igrab(
    1443             :         struct xfs_inode        *ip)
    1444             : {
    1445    36461558 :         struct inode            *inode = VFS_I(ip);
    1446             : 
    1447    36461558 :         ASSERT(rcu_read_lock_held());
    1448             : 
    1449             :         /* Check for stale RCU freed inode */
    1450    36461558 :         spin_lock(&ip->i_flags_lock);
    1451    36733512 :         if (!ip->i_ino)
    1452           0 :                 goto out_unlock_noent;
    1453             : 
    1454    36733512 :         if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
    1455      520830 :                 goto out_unlock_noent;
    1456    36212682 :         spin_unlock(&ip->i_flags_lock);
    1457             : 
    1458             :         /* nothing to sync during shutdown */
    1459    72232802 :         if (xfs_is_shutdown(ip->i_mount))
    1460             :                 return false;
    1461             : 
     1462             :         /* If we can't grab the inode, it must be on its way to reclaim. */
    1463    36099090 :         if (!igrab(inode))
    1464        3972 :                 return false;
    1465             : 
    1466             :         /* inode is valid */
    1467             :         return true;
    1468             : 
    1469      520830 : out_unlock_noent:
    1470      520830 :         spin_unlock(&ip->i_flags_lock);
    1471      520830 :         return false;
    1472             : }
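             : 
             : /*
             :  * Sketch of why igrab() makes the lockless lookup above safe (generic
             :  * VFS usage, hypothetical caller): the tree walk runs under
             :  * rcu_read_lock(), so a slot may point at an inode already being torn
             :  * down.  igrab() refuses to reference an I_FREEING/I_WILL_FREE inode,
             :  * and a successful grab keeps the inode alive past rcu_read_unlock().
             :  */
             : #include <linux/fs.h>
             : #include <linux/rcupdate.h>
             : 
             : static struct inode *
             : example_grab_from_rcu_walk(
             :         struct inode            *inode)
             : {
             :         struct inode            *ret = NULL;
             : 
             :         rcu_read_lock();
             :         /* ... the lookup that produced @inode would happen here ... */
             :         if (igrab(inode))
             :                 ret = inode;    /* reference held; valid after unlock */
             :         rcu_read_unlock();
             :         return ret;             /* caller eventually does iput() */
             : }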
    1473             : 
    1474             : /* Scan one incore inode for block preallocations that we can remove. */
    1475             : static int
    1476    35914745 : xfs_blockgc_scan_inode(
    1477             :         struct xfs_inode        *ip,
    1478             :         struct xfs_icwalk       *icw)
    1479             : {
    1480    35914745 :         unsigned int            lockflags = 0;
    1481    35914745 :         int                     error;
    1482             : 
    1483    35914745 :         error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
    1484    36138999 :         if (error)
    1485     1212531 :                 goto unlock;
    1486             : 
    1487    34926468 :         error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
    1488    36111592 : unlock:
    1489    36111592 :         if (lockflags)
    1490     2317196 :                 xfs_iunlock(ip, lockflags);
    1491    36110941 :         xfs_irele(ip);
    1492    36135193 :         return error;
    1493             : }
    1494             : 
    1495             : /* Background worker that trims preallocated space. */
    1496             : void
    1497     4961915 : xfs_blockgc_worker(
    1498             :         struct work_struct      *work)
    1499             : {
    1500     4961915 :         struct xfs_perag        *pag = container_of(to_delayed_work(work),
    1501             :                                         struct xfs_perag, pag_blockgc_work);
    1502     4961915 :         struct xfs_mount        *mp = pag->pag_mount;
    1503     4961915 :         int                     error;
    1504             : 
    1505     4961915 :         trace_xfs_blockgc_worker(mp, __return_address);
    1506             : 
    1507     4961910 :         error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
    1508     4960802 :         if (error)
    1509           0 :                 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
    1510             :                                 pag->pag_agno, error);
    1511     4960802 :         xfs_blockgc_queue(pag);
    1512     4960008 : }
    1513             : 
    1514             : /*
    1515             :  * Try to free space in the filesystem by purging inactive inodes, eofblocks
    1516             :  * and cowblocks.
    1517             :  */
    1518             : int
    1519     1687319 : xfs_blockgc_free_space(
    1520             :         struct xfs_mount        *mp,
    1521             :         struct xfs_icwalk       *icw)
    1522             : {
    1523     1687319 :         int                     error;
    1524             : 
    1525     1687319 :         trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
    1526             : 
    1527     1686803 :         error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
    1528     1686168 :         if (error)
    1529             :                 return error;
    1530             : 
    1531     1686138 :         return xfs_inodegc_flush(mp);
    1532             : }
    1533             : 
    1534             : /*
    1535             :  * Reclaim all the free space that we can by scheduling the background blockgc
    1536             :  * and inodegc workers immediately and waiting for them all to clear.
    1537             :  */
    1538             : int
    1539    12654503 : xfs_blockgc_flush_all(
    1540             :         struct xfs_mount        *mp)
    1541             : {
    1542    12654503 :         struct xfs_perag        *pag;
    1543    12654503 :         xfs_agnumber_t          agno;
    1544             : 
    1545    12654503 :         trace_xfs_blockgc_flush_all(mp, __return_address);
    1546             : 
    1547             :         /*
     1548             :          * For each blockgc worker, pull its queue time up to now; an idle
     1549             :          * worker is queued to run immediately.  Then flush whatever's
    1550             :          * left.
    1551             :          */
    1552    17062176 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1553     4413977 :                 mod_delayed_work(pag->pag_mount->m_blockgc_wq,
    1554             :                                 &pag->pag_blockgc_work, 0);
    1555             : 
    1556    17028634 :         for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
    1557     4375215 :                 flush_delayed_work(&pag->pag_blockgc_work);
    1558             : 
    1559    12656644 :         return xfs_inodegc_flush(mp);
    1560             : }
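             : 
             : /*
             :  * The expedite-then-flush pair above reduces to two workqueue calls.  A
             :  * minimal sketch with hypothetical names: mod_delayed_work() queues idle
             :  * work to run after the given delay (0 = now) or pulls a pending timer
             :  * forward, and flush_delayed_work() waits for the outstanding run.
             :  */
             : #include <linux/workqueue.h>
             : 
             : static void
             : example_expedite_and_wait(
             :         struct workqueue_struct *wq,
             :         struct delayed_work     *dwork)
             : {
             :         mod_delayed_work(wq, dwork, 0); /* run as soon as possible */
             :         flush_delayed_work(dwork);      /* block until it has finished */
             : }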
    1561             : 
    1562             : /*
    1563             :  * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
    1564             :  * quota caused an allocation failure, so we make a best effort by including
    1565             :  * each quota under low free space conditions (less than 1% free space) in the
    1566             :  * scan.
    1567             :  *
    1568             :  * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
    1569             :  * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
    1570             :  * MMAPLOCK.
    1571             :  */
    1572             : int
    1573       36002 : xfs_blockgc_free_dquots(
    1574             :         struct xfs_mount        *mp,
    1575             :         struct xfs_dquot        *udqp,
    1576             :         struct xfs_dquot        *gdqp,
    1577             :         struct xfs_dquot        *pdqp,
    1578             :         unsigned int            iwalk_flags)
    1579             : {
    1580       36002 :         struct xfs_icwalk       icw = {0};
    1581       36002 :         bool                    do_work = false;
    1582             : 
    1583       36002 :         if (!udqp && !gdqp && !pdqp)
    1584             :                 return 0;
    1585             : 
    1586             :         /*
    1587             :          * Run a scan to free blocks using the union filter to cover all
    1588             :          * applicable quotas in a single scan.
    1589             :          */
    1590       36002 :         icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
    1591             : 
    1592       36002 :         if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
    1593       21263 :                 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
    1594       21263 :                 icw.icw_flags |= XFS_ICWALK_FLAG_UID;
    1595       21263 :                 do_work = true;
    1596             :         }
    1597             : 
     1598       36002 :         if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
    1599       34589 :                 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
    1600       34589 :                 icw.icw_flags |= XFS_ICWALK_FLAG_GID;
    1601       34589 :                 do_work = true;
    1602             :         }
    1603             : 
    1604       36002 :         if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
    1605       30638 :                 icw.icw_prid = pdqp->q_id;
    1606       30638 :                 icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
    1607       30638 :                 do_work = true;
    1608             :         }
    1609             : 
    1610       36002 :         if (!do_work)
    1611             :                 return 0;
    1612             : 
    1613       35173 :         return xfs_blockgc_free_space(mp, &icw);
    1614             : }
    1615             : 
    1616             : /* Run cow/eofblocks scans on the quotas attached to the inode. */
    1617             : int
    1618       16887 : xfs_blockgc_free_quota(
    1619             :         struct xfs_inode        *ip,
    1620             :         unsigned int            iwalk_flags)
    1621             : {
    1622       16887 :         return xfs_blockgc_free_dquots(ip->i_mount,
    1623             :                         xfs_inode_dquot(ip, XFS_DQTYPE_USER),
    1624             :                         xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
    1625             :                         xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
    1626             : }
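             : 
             : /*
             :  * Typical caller shape (hedged sketch; example_do_allocation() is
             :  * hypothetical): these scans are used as a last-ditch response to
             :  * ENOSPC/EDQUOT before failing an operation, in the spirit of the retry
             :  * loops in the transaction allocation paths.
             :  */
             : int example_do_allocation(struct xfs_inode *ip);        /* hypothetical */
             : 
             : static int
             : example_op_with_gc_retry(
             :         struct xfs_inode        *ip)
             : {
             :         bool                    retried = false;
             :         int                     error;
             : 
             : retry:
             :         error = example_do_allocation(ip);
             :         if ((error == -ENOSPC || error == -EDQUOT) && !retried) {
             :                 /* reclaim speculative preallocations charged to our quotas */
             :                 xfs_blockgc_free_quota(ip, 0);
             :                 retried = true;
             :                 goto retry;
             :         }
             :         return error;
             : }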
    1627             : 
    1628             : /* XFS Inode Cache Walking Code */
    1629             : 
    1630             : /*
    1631             :  * The inode lookup is done in batches to keep the amount of lock traffic and
     1632             :  * radix tree lookups to a minimum. The batch size is a trade-off between
    1633             :  * lookup reduction and stack usage. This is in the reclaim path, so we can't
    1634             :  * be too greedy.
    1635             :  */
    1636             : #define XFS_LOOKUP_BATCH        32
    1637             : 
    1638             : 
    1639             : /*
    1640             :  * Decide if we want to grab this inode in anticipation of doing work towards
    1641             :  * the goal.
    1642             :  */
    1643             : static inline bool
    1644   510818591 : xfs_icwalk_igrab(
    1645             :         enum xfs_icwalk_goal    goal,
    1646             :         struct xfs_inode        *ip,
    1647             :         struct xfs_icwalk       *icw)
    1648             : {
    1649   510818591 :         switch (goal) {
    1650    36489670 :         case XFS_ICWALK_BLOCKGC:
    1651    36489670 :                 return xfs_blockgc_igrab(ip);
    1652   474328921 :         case XFS_ICWALK_RECLAIM:
    1653   474328921 :                 return xfs_reclaim_igrab(ip, icw);
    1654             :         default:
    1655             :                 return false;
    1656             :         }
    1657             : }
    1658             : 
    1659             : /*
    1660             :  * Process an inode.  Each processing function must handle any state changes
    1661             :  * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
    1662             :  */
    1663             : static inline int
    1664   510290671 : xfs_icwalk_process_inode(
    1665             :         enum xfs_icwalk_goal    goal,
    1666             :         struct xfs_inode        *ip,
    1667             :         struct xfs_perag        *pag,
    1668             :         struct xfs_icwalk       *icw)
    1669             : {
    1670   510290671 :         int                     error = 0;
    1671             : 
    1672   510290671 :         switch (goal) {
    1673    35963855 :         case XFS_ICWALK_BLOCKGC:
    1674    35963855 :                 error = xfs_blockgc_scan_inode(ip, icw);
    1675    35963855 :                 break;
    1676   474326816 :         case XFS_ICWALK_RECLAIM:
    1677   474326816 :                 xfs_reclaim_inode(ip, pag);
    1678   474326816 :                 break;
    1679             :         }
    1680   510416254 :         return error;
    1681             : }
    1682             : 
    1683             : /*
    1684             :  * For a given per-AG structure @pag and a goal, grab qualifying inodes and
    1685             :  * process them in some manner.
    1686             :  */
    1687             : static int
    1688     6327553 : xfs_icwalk_ag(
    1689             :         struct xfs_perag        *pag,
    1690             :         enum xfs_icwalk_goal    goal,
    1691             :         struct xfs_icwalk       *icw)
    1692             : {
    1693     6327553 :         struct xfs_mount        *mp = pag->pag_mount;
    1694     6327553 :         uint32_t                first_index;
    1695     6327553 :         int                     last_error = 0;
    1696     7489800 :         int                     skipped;
    1697     7489800 :         bool                    done;
    1698     7489800 :         int                     nr_found;
    1699             : 
    1700     7489800 : restart:
    1701     7489800 :         done = false;
    1702     7489800 :         skipped = 0;
    1703     7489800 :         if (goal == XFS_ICWALK_RECLAIM)
    1704      680479 :                 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
    1705             :         else
    1706             :                 first_index = 0;
    1707             :         nr_found = 0;
    1708    29818088 :         do {
    1709    29818088 :                 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
    1710    29818088 :                 int             error = 0;
    1711    29818088 :                 int             i;
    1712             : 
    1713    29818088 :                 rcu_read_lock();
    1714             : 
    1715    29799120 :                 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
    1716             :                                 (void **) batch, first_index,
    1717             :                                 XFS_LOOKUP_BATCH, goal);
    1718    29789713 :                 if (!nr_found) {
    1719     7355134 :                         done = true;
    1720     7355134 :                         rcu_read_unlock();
    1721     7477586 :                         break;
    1722             :                 }
    1723             : 
    1724             :                 /*
     1725             :                  * Grab the inodes before we drop the lock. If we found
     1726             :                  * nothing, nr_found == 0 and the loop will be skipped.
    1727             :                  */
    1728   533349024 :                 for (i = 0; i < nr_found; i++) {
    1729   510905967 :                         struct xfs_inode *ip = batch[i];
    1730             : 
    1731   510863314 :                         if (done || !xfs_icwalk_igrab(goal, ip, icw))
    1732      520262 :                                 batch[i] = NULL;
    1733             : 
    1734             :                         /*
    1735             :                          * Update the index for the next lookup. Catch
    1736             :                          * overflows into the next AG range which can occur if
    1737             :                          * we have inodes in the last block of the AG and we
    1738             :                          * are currently pointing to the last inode.
    1739             :                          *
    1740             :                          * Because we may see inodes that are from the wrong AG
    1741             :                          * due to RCU freeing and reallocation, only update the
     1742             :                          * index if it lies in this AG. It was a race that led
    1743             :                          * us to see this inode, so another lookup from the
    1744             :                          * same index will not find it again.
    1745             :                          */
    1746   510914445 :                         if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
    1747         114 :                                 continue;
    1748   510914331 :                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
    1749   510914331 :                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
    1750           3 :                                 done = true;
    1751             :                 }
    1752             : 
    1753             :                 /* unlock now we've grabbed the inodes. */
    1754    22443057 :                 rcu_read_unlock();
    1755             : 
    1756   555833986 :                 for (i = 0; i < nr_found; i++) {
    1757   510962316 :                         if (!batch[i])
    1758      530822 :                                 continue;
    1759   510370300 :                         error = xfs_icwalk_process_inode(goal, batch[i], pag,
    1760             :                                         icw);
    1761   510418450 :                         if (error == -EAGAIN) {
    1762     1718080 :                                 skipped++;
    1763     1718080 :                                 continue;
    1764             :                         }
    1765   508700370 :                         if (error && last_error != -EFSCORRUPTED)
    1766           0 :                                 last_error = error;
    1767             :                 }
    1768             : 
    1769             :                 /* bail out if the filesystem is corrupted.  */
    1770    22428613 :                 if (error == -EFSCORRUPTED)
    1771             :                         break;
    1772             : 
    1773    22428613 :                 cond_resched();
    1774             : 
    1775    22436472 :                 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
    1776      373125 :                         icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
    1777      373125 :                         if (icw->icw_scan_limit <= 0)
    1778             :                                 break;
    1779             :                 }
    1780    22328291 :         } while (nr_found && !done);
    1781             : 
    1782     7477589 :         if (goal == XFS_ICWALK_RECLAIM) {
    1783      680479 :                 if (done)
    1784      572298 :                         first_index = 0;
    1785      680479 :                 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
    1786             :         }
    1787             : 
    1788     7477589 :         if (skipped) {
    1789     1152024 :                 delay(1);
    1790     1162247 :                 goto restart;
    1791             :         }
    1792     6325565 :         return last_error;
    1793             : }
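             : 
             : /*
             :  * Skeleton of the batched, tag-driven walk above, reduced to the generic
             :  * radix tree API (item type and helpers are hypothetical).  A batch is
             :  * collected and stabilized under RCU; processing happens outside it so
             :  * the walk never holds the RCU read lock across blocking work.
             :  */
             : #include <linux/radix-tree.h>
             : #include <linux/rcupdate.h>
             : 
             : #define EXAMPLE_BATCH   32
             : 
             : void example_stabilize(void *item);             /* hypothetical, cf. igrab */
             : unsigned long example_index(void *item);        /* hypothetical */
             : void example_process(void *item);               /* hypothetical */
             : 
             : static void
             : example_walk_tagged(
             :         struct radix_tree_root  *root,
             :         unsigned int            tag)
             : {
             :         unsigned long           first_index = 0;
             :         void                    *batch[EXAMPLE_BATCH];
             :         unsigned int            nr_found, i;
             : 
             :         do {
             :                 rcu_read_lock();
             :                 nr_found = radix_tree_gang_lookup_tag(root, batch,
             :                                 first_index, EXAMPLE_BATCH, tag);
             :                 if (!nr_found) {
             :                         rcu_read_unlock();
             :                         break;
             :                 }
             :                 for (i = 0; i < nr_found; i++) {
             :                         example_stabilize(batch[i]);
             :                         first_index = example_index(batch[i]) + 1;
             :                 }
             :                 rcu_read_unlock();
             : 
             :                 for (i = 0; i < nr_found; i++)
             :                         example_process(batch[i]);
             :         } while (nr_found);
             : }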
    1794             : 
    1795             : /* Walk all incore inodes to achieve a given goal. */
    1796             : static int
    1797     2141338 : xfs_icwalk(
    1798             :         struct xfs_mount        *mp,
    1799             :         enum xfs_icwalk_goal    goal,
    1800             :         struct xfs_icwalk       *icw)
    1801             : {
    1802     2141338 :         struct xfs_perag        *pag;
    1803     2141338 :         int                     error = 0;
    1804     2141338 :         int                     last_error = 0;
    1805     2141338 :         xfs_agnumber_t          agno;
    1806             : 
    1807     3506793 :         for_each_perag_tag(mp, agno, pag, goal) {
    1808     1366249 :                 error = xfs_icwalk_ag(pag, goal, icw);
    1809     1365455 :                 if (error) {
    1810           0 :                         last_error = error;
    1811           0 :                         if (error == -EFSCORRUPTED) {
    1812           0 :                                 xfs_perag_rele(pag);
    1813           0 :                                 break;
    1814             :                         }
    1815             :                 }
    1816             :         }
    1817     2141158 :         return last_error;
    1818             :         BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
    1819             : }
    1820             : 
    1821             : #ifdef DEBUG
    1822             : static void
    1823           0 : xfs_check_delalloc(
    1824             :         struct xfs_inode        *ip,
    1825             :         int                     whichfork)
    1826             : {
    1827           0 :         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, whichfork);
    1828           0 :         struct xfs_bmbt_irec    got;
    1829           0 :         struct xfs_iext_cursor  icur;
    1830             : 
    1831           0 :         if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
    1832           0 :                 return;
    1833           0 :         do {
    1834           0 :                 if (isnullstartblock(got.br_startblock)) {
    1835           0 :                         xfs_warn(ip->i_mount,
    1836             :         "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
    1837             :                                 ip->i_ino,
    1838             :                                 whichfork == XFS_DATA_FORK ? "data" : "cow",
    1839             :                                 got.br_startoff, got.br_blockcount);
    1840             :                 }
    1841           0 :         } while (xfs_iext_next_extent(ifp, &icur, &got));
    1842             : }
    1843             : #else
    1844             : #define xfs_check_delalloc(ip, whichfork)       do { } while (0)
    1845             : #endif
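             : 
             : /*
             :  * The #else stub above is the standard debug-hook idiom: call sites stay
             :  * unconditional, non-DEBUG builds compile the call away, and the
             :  * "do { } while (0)" keeps the macro statement-shaped so it is safe in
             :  * an unbraced if/else.  Generic sketch:
             :  */
             : #ifdef DEBUG
             : void example_debug_check(void *p);
             : #else
             : # define example_debug_check(p) do { } while (0)
             : #endif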
    1846             : 
    1847             : /* Schedule the inode for reclaim. */
    1848             : static void
    1849   879573715 : xfs_inodegc_set_reclaimable(
    1850             :         struct xfs_inode        *ip)
    1851             : {
    1852   879573715 :         struct xfs_mount        *mp = ip->i_mount;
    1853   879573715 :         struct xfs_perag        *pag;
    1854             : 
    1855  1759147430 :         if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
    1856           0 :                 xfs_check_delalloc(ip, XFS_DATA_FORK);
    1857           0 :                 xfs_check_delalloc(ip, XFS_COW_FORK);
    1858           0 :                 ASSERT(0);
    1859             :         }
    1860             : 
    1861   879573715 :         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
    1862   879712170 :         spin_lock(&pag->pag_ici_lock);
    1863   879755263 :         spin_lock(&ip->i_flags_lock);
    1864             : 
    1865  1759529114 :         if (!xfs_is_shutdown(pag->pag_mount)) {
    1866             :                 /* had better not be on any unlinked list! */
    1867   556108633 :                 ASSERT(!xfs_inode_on_unlinked_list(ip));
    1868   556108633 :                 if (xfs_inode_on_unlinked_list(ip))
    1869           1 :                         xfs_emerg(pag->pag_mount, "IUNLINK mark reclaim ino 0x%llx nlink %u mode 0o%o prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, VFS_I(ip)->i_mode, ip->i_prev_unlinked, ip->i_next_unlinked);
    1870             :         }
    1871             : 
    1872   879764557 :         trace_xfs_inode_set_reclaimable(ip);
    1873   879579790 :         ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
    1874   879579790 :         ip->i_flags |= XFS_IRECLAIMABLE;
    1875   879579790 :         xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
    1876             :                         XFS_ICI_RECLAIM_TAG);
    1877             : 
    1878   879605713 :         spin_unlock(&ip->i_flags_lock);
    1879   879716519 :         spin_unlock(&pag->pag_ici_lock);
    1880   879714549 :         xfs_perag_put(pag);
    1881   879704888 : }
    1882             : 
    1883             : /*
    1884             :  * Free all speculative preallocations and possibly even the inode itself.
    1885             :  * This is the last chance to make changes to an otherwise unreferenced file
    1886             :  * before incore reclamation happens.
    1887             :  */
    1888             : static int
    1889    75824412 : xfs_inodegc_inactivate(
    1890             :         struct xfs_inode        *ip)
    1891             : {
    1892    75824412 :         int                     error;
    1893             : 
    1894    75824412 :         trace_xfs_inode_inactivating(ip);
    1895    75696781 :         error = xfs_inactive(ip);
    1896    75975146 :         xfs_inodegc_set_reclaimable(ip);
    1897    75927580 :         return error;
    1898             : 
     1899             : }
    1901             : void
    1902    11963352 : xfs_inodegc_worker(
    1903             :         struct work_struct      *work)
    1904             : {
    1905    11963352 :         struct xfs_inodegc      *gc = container_of(to_delayed_work(work),
    1906             :                                                 struct xfs_inodegc, work);
    1907    11963352 :         struct llist_node       *node = llist_del_all(&gc->list);
    1908    11998999 :         struct xfs_inode        *ip, *n;
    1909    11998999 :         unsigned int            nofs_flag;
    1910             : 
    1911    11998999 :         ASSERT(gc->cpu == smp_processor_id());
    1912             : 
    1913    11993306 :         WRITE_ONCE(gc->items, 0);
    1914             : 
    1915    11993306 :         if (!node)
    1916             :                 return;
    1917             : 
    1918             :         /*
    1919             :          * We can allocate memory here while doing writeback on behalf of
     1920             :          * memory reclaim.  To avoid memory allocation deadlocks, set the
    1921             :          * task-wide nofs context for the following operations.
    1922             :          */
    1923    11989132 :         nofs_flag = memalloc_nofs_save();
    1924             : 
    1925    11989132 :         ip = llist_entry(node, struct xfs_inode, i_gclist);
    1926    11989132 :         trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
    1927             : 
    1928    11979014 :         WRITE_ONCE(gc->shrinker_hits, 0);
    1929    87887070 :         llist_for_each_entry_safe(ip, n, node, i_gclist) {
    1930    75870078 :                 int     error;
    1931             : 
    1932    75870078 :                 xfs_iflags_set(ip, XFS_INACTIVATING);
    1933    75817003 :                 error = xfs_inodegc_inactivate(ip);
    1934    75908056 :                 if (error && !gc->error)
    1935        2271 :                         gc->error = error;
    1936             :         }
    1937             : 
    1938    12016992 :         memalloc_nofs_restore(nofs_flag);
    1939             : }
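             : 
             : /*
             :  * The worker above drains a lock-free llist.  A generic sketch of both
             :  * ends of that pattern (item type and example_process_item() are
             :  * hypothetical): producers push with llist_add(); the consumer
             :  * atomically takes the whole list and walks it with the _safe iterator
             :  * because items may be freed mid-walk.  Note llist is LIFO: entries
             :  * come back newest-first.
             :  */
             : #include <linux/llist.h>
             : 
             : struct example_item {
             :         struct llist_node       gclist;
             :         /* ... payload ... */
             : };
             : 
             : void example_process_item(struct example_item *it);     /* may free @it */
             : 
             : static void
             : example_push(
             :         struct llist_head       *list,
             :         struct example_item     *it)
             : {
             :         llist_add(&it->gclist, list);   /* lock-free, safe from any context */
             : }
             : 
             : static void
             : example_drain(
             :         struct llist_head       *list)
             : {
             :         struct llist_node       *node = llist_del_all(list);
             :         struct example_item     *it, *n;
             : 
             :         llist_for_each_entry_safe(it, n, node, gclist)
             :                 example_process_item(it);
             : }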
    1940             : 
    1941             : /*
    1942             :  * Expedite all pending inodegc work to run immediately. This does not wait for
    1943             :  * completion of the work.
    1944             :  */
    1945             : void
    1946    46682112 : xfs_inodegc_push(
    1947             :         struct xfs_mount        *mp)
    1948             : {
    1949    93364224 :         if (!xfs_is_inodegc_enabled(mp))
    1950             :                 return;
    1951    46612121 :         trace_xfs_inodegc_push(mp, __return_address);
    1952    46582088 :         xfs_inodegc_queue_all(mp);
    1953             : }
    1954             : 
    1955             : /*
    1956             :  * Force all currently queued inode inactivation work to run immediately and
    1957             :  * wait for the work to finish.
    1958             :  */
    1959             : int
    1960    16946721 : xfs_inodegc_flush(
    1961             :         struct xfs_mount        *mp)
    1962             : {
    1963    16946721 :         xfs_inodegc_push(mp);
    1964    16955376 :         trace_xfs_inodegc_flush(mp, __return_address);
    1965    16950526 :         return xfs_inodegc_wait_all(mp);
    1966             : }
    1967             : 
    1968             : /*
    1969             :  * Flush all the pending work and then disable the inode inactivation background
    1970             :  * workers and wait for them to stop.  Caller must hold sb->s_umount to
    1971             :  * coordinate changes in the inodegc_enabled state.
    1972             :  */
    1973             : void
    1974      136902 : xfs_inodegc_stop(
    1975             :         struct xfs_mount        *mp)
    1976             : {
    1977      136902 :         bool                    rerun;
    1978             : 
    1979      136902 :         if (!xfs_clear_inodegc_enabled(mp))
    1980             :                 return;
    1981             : 
    1982             :         /*
    1983             :          * Drain all pending inodegc work, including inodes that could be
    1984             :          * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
    1985             :          * threads that sample the inodegc state just prior to us clearing it.
    1986             :          * The inodegc flag state prevents new threads from queuing more
    1987             :          * inodes, so we queue pending work items and flush the workqueue until
    1988             :          * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
    1989             :          * here because it does not allow other unserialized mechanisms to
    1990             :          * reschedule inodegc work while this draining is in progress.
    1991             :          */
    1992      136831 :         xfs_inodegc_queue_all(mp);
    1993      136831 :         do {
    1994      136831 :                 flush_workqueue(mp->m_inodegc_wq);
    1995      136831 :                 rerun = xfs_inodegc_queue_all(mp);
    1996      136831 :         } while (rerun);
    1997             : 
    1998      136831 :         trace_xfs_inodegc_stop(mp, __return_address);
    1999             : }
    2000             : 
    2001             : /*
    2002             :  * Enable the inode inactivation background workers and schedule deferred inode
    2003             :  * inactivation work if there is any.  Caller must hold sb->s_umount to
    2004             :  * coordinate changes in the inodegc_enabled state.
    2005             :  */
    2006             : void
    2007      136842 : xfs_inodegc_start(
    2008             :         struct xfs_mount        *mp)
    2009             : {
    2010      136842 :         if (xfs_set_inodegc_enabled(mp))
    2011             :                 return;
    2012             : 
    2013      136820 :         trace_xfs_inodegc_start(mp, __return_address);
    2014      136820 :         xfs_inodegc_queue_all(mp);
    2015             : }
    2016             : 
    2017             : #ifdef CONFIG_XFS_RT
    2018             : static inline bool
    2019    59856979 : xfs_inodegc_want_queue_rt_file(
    2020             :         struct xfs_inode        *ip)
    2021             : {
    2022    59856979 :         struct xfs_mount        *mp = ip->i_mount;
    2023             : 
    2024    59856979 :         if (!XFS_IS_REALTIME_INODE(ip))
    2025             :                 return false;
    2026             : 
    2027     8040559 :         if (__percpu_counter_compare(&mp->m_frextents,
    2028     8040630 :                                 mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
    2029             :                                 XFS_FDBLOCKS_BATCH) < 0)
    2030      196295 :                 return true;
    2031             : 
    2032             :         return false;
    2033             : }
    2034             : #else
    2035             : # define xfs_inodegc_want_queue_rt_file(ip)     (false)
    2036             : #endif /* CONFIG_XFS_RT */
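             : 
             : /*
             :  * Sketch of the counter primitive used above (generic API, hypothetical
             :  * caller): percpu counters batch per-CPU deltas for scalability, so
             :  * exact reads are expensive.  __percpu_counter_compare() tries a cheap
             :  * approximate comparison first and only sums all CPUs when the result
             :  * falls within the error bound implied by @batch.
             :  */
             : #include <linux/percpu_counter.h>
             : 
             : static bool
             : example_below_threshold(
             :         struct percpu_counter   *free_count,
             :         s64                     threshold,
             :         s32                     batch)
             : {
             :         return __percpu_counter_compare(free_count, threshold, batch) < 0;
             : }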
    2037             : 
    2038             : /*
    2039             :  * Schedule the inactivation worker when:
    2040             :  *
    2041             :  *  - We've accumulated more than one inode cluster buffer's worth of inodes.
    2042             :  *  - There is less than 5% free space left.
    2043             :  *  - Any of the quotas for this inode are near an enforcement limit.
    2044             :  */
    2045             : static inline bool
    2046    75956492 : xfs_inodegc_want_queue_work(
    2047             :         struct xfs_inode        *ip,
    2048             :         unsigned int            items)
    2049             : {
    2050    75956492 :         struct xfs_mount        *mp = ip->i_mount;
    2051             : 
    2052    75956492 :         if (items > mp->m_ino_geo.inodes_per_cluster)
    2053             :                 return true;
    2054             : 
    2055    60407718 :         if (__percpu_counter_compare(&mp->m_fdblocks,
    2056    60414857 :                                 mp->m_low_space[XFS_LOWSP_5_PCNT],
    2057             :                                 XFS_FDBLOCKS_BATCH) < 0)
    2058             :                 return true;
    2059             : 
    2060    59859943 :         if (xfs_inodegc_want_queue_rt_file(ip))
    2061             :                 return true;
    2062             : 
    2063    59656671 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
    2064             :                 return true;
    2065             : 
    2066    59632973 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
    2067             :                 return true;
    2068             : 
    2069    59650832 :         if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
    2070         213 :                 return true;
    2071             : 
    2072             :         return false;
    2073             : }
    2074             : 
    2075             : /*
    2076             :  * Upper bound on the number of inodes in each per-cpu queue that can be
    2077             :  * pending inactivation at any given time, to avoid monopolizing the workqueue.
    2078             :  */
    2079             : #define XFS_INODEGC_MAX_BACKLOG         (4 * XFS_INODES_PER_CHUNK)
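To put a number on that: XFS_INODES_PER_CHUNK is NBBY * sizeof(xfs_inofree_t) = 8 * 8 = 64, so the cap works out to 4 * 64 = 256 inodes sitting in a per-cpu queue before xfs_inodegc_want_flush_work() below starts throttling the frontend.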
    2080             : 
    2081             : /*
    2082             :  * Make the frontend wait for inactivations when:
    2083             :  *
    2084             :  *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
    2085             :  *  - The queue depth exceeds the maximum allowable percpu backlog.
    2086             :  *
    2087             :  * Note: If the current thread is running a transaction, we don't ever want to
    2088             :  * wait for other transactions because that could introduce a deadlock.
    2089             :  */
    2090             : static inline bool
    2091             : xfs_inodegc_want_flush_work(
    2092             :         struct xfs_inode        *ip,
    2093             :         unsigned int            items,
    2094             :         unsigned int            shrinker_hits)
    2095             : {
    2096    75911120 :         if (current->journal_info)
    2097             :                 return false;
    2098             : 
    2099    75833624 :         if (shrinker_hits > 0)
    2100             :                 return true;
    2101             : 
    2102    75833622 :         if (items > XFS_INODEGC_MAX_BACKLOG)
    2103             :                 return true;
    2104             : 
    2105             :         return false;
    2106             : }
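The current->journal_info test works because XFS follows the common kernel convention of stashing the active transaction pointer there for the life of the transaction. A simplified sketch of that convention (paraphrased from the xfs_trans.h helpers, not verbatim):

        /* set when a transaction context is entered */
        static inline void trans_context_set(struct xfs_trans *tp)
        {
                current->journal_info = tp;
        }

        /* cleared when the transaction commits or cancels */
        static inline void trans_context_clear(struct xfs_trans *tp)
        {
                current->journal_info = NULL;
        }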
    2107             : 
    2108             : /*
    2109             :  * Queue a background inactivation worker if there are inodes that need to be
    2110             :  * inactivated and higher level xfs code hasn't disabled the background
    2111             :  * workers.
    2112             :  */
    2113             : static void
    2114    75928439 : xfs_inodegc_queue(
    2115             :         struct xfs_inode        *ip)
    2116             : {
    2117    75928439 :         struct xfs_mount        *mp = ip->i_mount;
    2118    75928439 :         struct xfs_inodegc      *gc;
    2119    75928439 :         int                     items;
    2120    75928439 :         unsigned int            shrinker_hits;
    2121    75928439 :         unsigned long           queue_delay = 1;
    2122             : 
    2123    75928439 :         trace_xfs_inode_set_need_inactive(ip);
    2124    75902193 :         spin_lock(&ip->i_flags_lock);
    2125    75975304 :         ip->i_flags |= XFS_NEED_INACTIVE;
    2126    75975304 :         spin_unlock(&ip->i_flags_lock);
    2127             : 
    2128    75986529 :         gc = get_cpu_ptr(mp->m_inodegc);
    2129    75967415 :         llist_add(&ip->i_gclist, &gc->list);
    2130    75969675 :         items = READ_ONCE(gc->items);
    2131    75969675 :         WRITE_ONCE(gc->items, items + 1);
    2132    75969675 :         shrinker_hits = READ_ONCE(gc->shrinker_hits);
    2133             : 
    2134             :         /*
    2135             :          * We queue the work while holding the current CPU so that the work
    2136             :          * is scheduled to run on this CPU.
    2137             :          */
    2138   151939350 :         if (!xfs_is_inodegc_enabled(mp)) {
    2139           0 :                 put_cpu_ptr(gc);
    2140           0 :                 return;
    2141             :         }
    2142             : 
    2143    75969675 :         if (xfs_inodegc_want_queue_work(ip, items))
    2144    16291802 :                 queue_delay = 0;
    2145             : 
    2146    75936138 :         trace_xfs_inodegc_queue(mp, __return_address);
    2147    75924090 :         mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2148             :                         queue_delay);
    2149    75872587 :         put_cpu_ptr(gc);
    2150             : 
    2151    75911120 :         if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
    2152     5749308 :                 trace_xfs_inodegc_throttle(mp, __return_address);
    2153     5744478 :                 flush_delayed_work(&gc->work);
    2154             :         }
    2155             : }
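Stripped of the XFS specifics, the queueing pattern above is: pin this CPU's queue with get_cpu_ptr() (which disables preemption), push the inode onto a lock-free llist, then (re)arm the per-cpu delayed worker. A self-contained sketch with hypothetical names (needs <linux/llist.h>, <linux/workqueue.h>, <linux/percpu.h>):

        struct gc_queue {
                struct llist_head       list;
                struct delayed_work     work;
                unsigned int            items;
        };

        static void queue_for_gc(struct gc_queue __percpu *queues,
                        struct workqueue_struct *wq, struct llist_node *node,
                        unsigned long delay)
        {
                /* pins us to this CPU's queue by disabling preemption */
                struct gc_queue *q = get_cpu_ptr(queues);

                llist_add(node, &q->list);      /* lock-free push */
                WRITE_ONCE(q->items, READ_ONCE(q->items) + 1);

                /* safe: preemption is off, so the CPU cannot change */
                mod_delayed_work_on(smp_processor_id(), wq, &q->work, delay);
                put_cpu_ptr(queues);            /* re-enables preemption */
        }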
    2156             : 
    2157             : /*
    2158             :  * Fold the dead CPU's inodegc queue into the current CPU's queue.
    2159             :  */
    2160             : void
    2161         158 : xfs_inodegc_cpu_dead(
    2162             :         struct xfs_mount        *mp,
    2163             :         unsigned int            dead_cpu)
    2164             : {
    2165         158 :         struct xfs_inodegc      *dead_gc, *gc;
    2166         158 :         struct llist_node       *first, *last;
    2167         158 :         unsigned int            count = 0;
    2168             : 
    2169         158 :         dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
    2170         158 :         cancel_delayed_work_sync(&dead_gc->work);
    2171             : 
    2172         158 :         if (llist_empty(&dead_gc->list))
    2173             :                 return;
    2174             : 
    2175           0 :         first = dead_gc->list.first;
    2176           0 :         last = first;
    2177           0 :         while (last->next) {
    2178           0 :                 last = last->next;
    2179           0 :                 count++;
    2180             :         }
    2181           0 :         dead_gc->list.first = NULL;
    2182           0 :         dead_gc->items = 0;
    2183             : 
    2184             :         /* Add pending work to current CPU */
    2185           0 :         gc = get_cpu_ptr(mp->m_inodegc);
    2186           0 :         llist_add_batch(first, last, &gc->list);
    2187           0 :         count += READ_ONCE(gc->items);
    2188           0 :         WRITE_ONCE(gc->items, count);
    2189             : 
    2190           0 :         if (xfs_is_inodegc_enabled(mp)) {
    2191           0 :                 trace_xfs_inodegc_queue(mp, __return_address);
    2192           0 :                 mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
    2193             :                                 0);
    2194             :         }
    2195           0 :         put_cpu_ptr(gc);
    2196             : }
    2197             : 
    2198             : /*
    2199             :  * We set the inode flag atomically with the radix tree tag.  Once we get tag
    2200             :  * lookups on the radix tree, this inode flag can go away.
    2201             :  *
    2202             :  * We always use background reclaim here because even if the inode is clean, it
    2203             :  * still may be under IO and hence we have to wait for IO completion to occur
    2204             :  * before we can reclaim the inode. The background reclaim path handles this
    2205             :  * more efficiently than we can here, so simply let background reclaim tear down
    2206             :  * all inodes.
    2207             :  */
    2208             : void
    2209   879675307 : xfs_inode_mark_reclaimable(
    2210             :         struct xfs_inode        *ip)
    2211             : {
    2212   879675307 :         struct xfs_mount        *mp = ip->i_mount;
    2213   879675307 :         bool                    need_inactive;
    2214             : 
    2215   879675307 :         XFS_STATS_INC(mp, vn_reclaim);
    2216             : 
    2217             :         /*
    2218             :          * We should never get here with any of the reclaim flags already set.
    2219             :          */
    2220  1759370332 :         ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
    2221             : 
    2222   879744819 :         need_inactive = xfs_inode_needs_inactive(ip);
    2223   879498492 :         if (need_inactive) {
    2224    75899130 :                 xfs_inodegc_queue(ip);
    2225    75899130 :                 return;
    2226             :         }
    2227             : 
    2228             :         /* Going straight to reclaim, so drop the dquots. */
    2229   803599362 :         xfs_qm_dqdetach(ip);
    2230   803603829 :         xfs_inodegc_set_reclaimable(ip);
    2231             : }
    2232             : 
    2233             : /*
    2234             :  * Register a phony shrinker so that we can run background inodegc sooner when
    2235             :  * there's memory pressure.  Inactivation does not itself free any memory but
    2236             :  * it does make inodes reclaimable, which eventually frees memory.
    2237             :  *
    2238             :  * The count function, seek value, and batch value are crafted to trigger the
    2239             :  * scan function during the second round of scanning.  Hopefully this means
    2240             :  * that we reclaimed enough memory that initiating metadata transactions won't
    2241             :  * make things worse.
    2242             :  */
    2243             : #define XFS_INODEGC_SHRINKER_COUNT      (1UL << DEF_PRIORITY)
    2244             : #define XFS_INODEGC_SHRINKER_BATCH      ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
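Worked numbers, assuming DEF_PRIORITY is 12 (its long-standing value in include/linux/mmzone.h): COUNT = 1UL << 12 = 4096 and BATCH = 4096 / 2 + 1 = 2049. With ->seeks set to zero, the shrinker core proposes freeable / 2 = 2048 objects per pass (treat that divisor as an assumption about mm/vmscan.c behavior in this era), one short of the batch, so the first pass only defers work; on the second pass the accumulated total (2048 deferred + 2048 new = 4096 >= 2049) finally invokes ->scan_objects, matching the "second round of scanning" described above.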
    2245             : 
    2246             : static unsigned long
    2247        7597 : xfs_inodegc_shrinker_count(
    2248             :         struct shrinker         *shrink,
    2249             :         struct shrink_control   *sc)
    2250             : {
    2251        7597 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2252             :                                                    m_inodegc_shrinker);
    2253        7597 :         struct xfs_inodegc      *gc;
    2254        7597 :         int                     cpu;
    2255             : 
    2256       15194 :         if (!xfs_is_inodegc_enabled(mp))
    2257             :                 return 0;
    2258             : 
    2259       37824 :         for_each_online_cpu(cpu) {
    2260       30288 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2261       30288 :                 if (!llist_empty(&gc->list))
    2262             :                         return XFS_INODEGC_SHRINKER_COUNT;
    2263             :         }
    2264             : 
    2265             :         return 0;
    2266             : }
    2267             : 
    2268             : static unsigned long
    2269          32 : xfs_inodegc_shrinker_scan(
    2270             :         struct shrinker         *shrink,
    2271             :         struct shrink_control   *sc)
    2272             : {
    2273          32 :         struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
    2274             :                                                    m_inodegc_shrinker);
    2275          32 :         struct xfs_inodegc      *gc;
    2276          32 :         int                     cpu;
    2277          32 :         bool                    no_items = true;
    2278             : 
    2279          64 :         if (!xfs_is_inodegc_enabled(mp))
    2280             :                 return SHRINK_STOP;
    2281             : 
    2282          32 :         trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
    2283             : 
    2284         192 :         for_each_online_cpu(cpu) {
    2285         128 :                 gc = per_cpu_ptr(mp->m_inodegc, cpu);
    2286         128 :                 if (!llist_empty(&gc->list)) {
    2287          32 :                         unsigned int    h = READ_ONCE(gc->shrinker_hits);
    2288             : 
    2289          32 :                         WRITE_ONCE(gc->shrinker_hits, h + 1);
    2290          32 :                         mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
    2291          32 :                         no_items = false;
    2292             :                 }
    2293             :         }
    2294             : 
    2295             :         /*
    2296             :          * If there are no inodes to inactivate, we don't want the shrinker
    2297             :          * to think there's deferred work to call us back about.
    2298             :          */
    2299          32 :         if (no_items)
    2300           0 :                 return LONG_MAX;
    2301             : 
    2302             :         return SHRINK_STOP;
    2303             : }
    2304             : 
    2305             : /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
    2306             : int
    2307       66856 : xfs_inodegc_register_shrinker(
    2308             :         struct xfs_mount        *mp)
    2309             : {
    2310       66856 :         struct shrinker         *shrink = &mp->m_inodegc_shrinker;
    2311             : 
    2312       66856 :         shrink->count_objects = xfs_inodegc_shrinker_count;
    2313       66856 :         shrink->scan_objects = xfs_inodegc_shrinker_scan;
    2314       66856 :         shrink->seeks = 0;
    2315       66856 :         shrink->flags = SHRINKER_NONSLAB;
    2316       66856 :         shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
    2317             : 
    2318       66856 :         return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
    2319             : }

Generated by: LCOV version 1.14