LCOV - code coverage report
Current view: top level - fs/xfs - xfs_buf.c (source / functions)
Test:         fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023
Date:         2023-07-31 20:08:07
                            Hit    Total    Coverage
Lines:                      907     1013      89.5 %
Functions:                   79       81      97.5 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include <linux/backing-dev.h>
       8             : #include <linux/dax.h>
       9             : 
      10             : #include "xfs_shared.h"
      11             : #include "xfs_format.h"
      12             : #include "xfs_log_format.h"
      13             : #include "xfs_trans_resv.h"
      14             : #include "xfs_mount.h"
      15             : #include "xfs_trace.h"
      16             : #include "xfs_log.h"
      17             : #include "xfs_log_recover.h"
      18             : #include "xfs_log_priv.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_buf_item.h"
      21             : #include "xfs_errortag.h"
      22             : #include "xfs_error.h"
      23             : #include "xfs_ag.h"
      24             : #include "xfs_buf_xfile.h"
      25             : 
      26             : struct kmem_cache *xfs_buf_cache;
      27             : 
      28             : /*
      29             :  * Locking orders
      30             :  *
      31             :  * xfs_buf_ioacct_inc:
      32             :  * xfs_buf_ioacct_dec:
      33             :  *      b_sema (caller holds)
      34             :  *        b_lock
      35             :  *
      36             :  * xfs_buf_stale:
      37             :  *      b_sema (caller holds)
      38             :  *        b_lock
      39             :  *          lru_lock
      40             :  *
      41             :  * xfs_buf_rele:
      42             :  *      b_lock
      43             :  *        pag_buf_lock
      44             :  *          lru_lock
      45             :  *
      46             :  * xfs_buftarg_drain_rele
      47             :  *      lru_lock
      48             :  *        b_lock (trylock due to inversion)
      49             :  *
      50             :  * xfs_buftarg_isolate
      51             :  *      lru_lock
      52             :  *        b_lock (trylock due to inversion)
      53             :  */
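
A hedged sketch of the innermost nesting in the table above (illustrative only,
not part of this file; the helper name is made up): a caller that already holds
b_sema via xfs_buf_lock() may then take b_lock for short per-buffer state
updates, matching the xfs_buf_ioacct_inc/dec rows.

        static void example_update_bstate(struct xfs_buf *bp)
        {
                ASSERT(xfs_buf_islocked(bp));   /* b_sema held by the caller */
                spin_lock(&bp->b_lock);         /* b_lock nests inside b_sema */
                bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
                spin_unlock(&bp->b_lock);
        }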
      54             : 
      55             : static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
      56             : 
      57             : static inline int
      58    34512925 : xfs_buf_submit(
      59             :         struct xfs_buf          *bp)
      60             : {
      61    34512925 :         return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
      62             : }
      63             : 
      64             : static inline int
      65             : xfs_buf_is_vmapped(
      66             :         struct xfs_buf  *bp)
      67             : {
      68             :         /*
      69             :          * Return true if the buffer is vmapped.
      70             :          *
       71             :          * b_addr is NULL if the buffer is not mapped, but the code is clever
       72             :          * enough to avoid mapping single-page buffers, so the check has to
       73             :          * test both b_addr and bp->b_page_count > 1.
      74             :          */
      75    69469177 :         return bp->b_addr && bp->b_page_count > 1;
      76             : }
      77             : 
      78             : static inline int
      79             : xfs_buf_vmap_len(
      80             :         struct xfs_buf  *bp)
      81             : {
      82             :         return (bp->b_page_count * PAGE_SIZE);
      83             : }
      84             : 
      85             : /*
      86             :  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
      87             :  * this buffer. The count is incremented once per buffer (per hold cycle)
      88             :  * because the corresponding decrement is deferred to buffer release. Buffers
      89             :  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
       90             :  * tracking adds unnecessary overhead. This is used for synchronization purposes
      91             :  * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
      92             :  * in-flight buffers.
      93             :  *
      94             :  * Buffers that are never released (e.g., superblock, iclog buffers) must set
      95             :  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
      96             :  * never reaches zero and unmount hangs indefinitely.
      97             :  */
      98             : static inline void
      99    99627157 : xfs_buf_ioacct_inc(
     100             :         struct xfs_buf  *bp)
     101             : {
     102    99627157 :         if (bp->b_flags & XBF_NO_IOACCT)
     103             :                 return;
     104             : 
     105    99471839 :         ASSERT(bp->b_flags & XBF_ASYNC);
     106    99471839 :         spin_lock(&bp->b_lock);
     107    99472045 :         if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
     108    94954653 :                 bp->b_state |= XFS_BSTATE_IN_FLIGHT;
     109    94954653 :                 percpu_counter_inc(&bp->b_target->bt_io_count);
     110             :         }
     111    99472032 :         spin_unlock(&bp->b_lock);
     112             : }
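
A hedged sketch of the XBF_NO_IOACCT rule described above: a long-lived buffer
that is never released must opt out of I/O accounting before its first
submission, otherwise bt_io_count never drains and unmount waits forever in
xfs_buftarg_drain(). The surrounding setup is hypothetical.

        /* hypothetical buffer that stays held for the life of the mount */
        bp->b_flags |= XBF_NO_IOACCT;           /* opt out before first I/O */
        bp->b_flags |= XBF_ASYNC | XBF_READ;
        xfs_buf_submit(bp);                     /* xfs_buf_ioacct_inc() returns early */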
     113             : 
     114             : /*
     115             :  * Clear the in-flight state on a buffer about to be released to the LRU or
     116             :  * freed and unaccount from the buftarg.
     117             :  */
     118             : static inline void
     119  6958747056 : __xfs_buf_ioacct_dec(
     120             :         struct xfs_buf  *bp)
     121             : {
     122  6958747056 :         lockdep_assert_held(&bp->b_lock);
     123             : 
     124  6958747056 :         if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
     125    94954499 :                 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
     126    94954499 :                 percpu_counter_dec(&bp->b_target->bt_io_count);
     127             :         }
     128  6958746885 : }
     129             : 
     130             : static inline void
     131      142765 : xfs_buf_ioacct_dec(
     132             :         struct xfs_buf  *bp)
     133             : {
     134      142765 :         spin_lock(&bp->b_lock);
     135      142765 :         __xfs_buf_ioacct_dec(bp);
     136      142765 :         spin_unlock(&bp->b_lock);
     137      142765 : }
     138             : 
     139             : /*
     140             :  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
     141             :  * b_lru_ref count so that the buffer is freed immediately when the buffer
     142             :  * reference count falls to zero. If the buffer is already on the LRU, we need
     143             :  * to remove the reference that LRU holds on the buffer.
     144             :  *
     145             :  * This prevents build-up of stale buffers on the LRU.
     146             :  */
     147             : void
     148    34688820 : xfs_buf_stale(
     149             :         struct xfs_buf  *bp)
     150             : {
     151    34688820 :         ASSERT(xfs_buf_islocked(bp));
     152             : 
     153    34688820 :         bp->b_flags |= XBF_STALE;
     154             : 
     155             :         /*
     156             :          * Clear the delwri status so that a delwri queue walker will not
     157             :          * flush this buffer to disk now that it is stale. The delwri queue has
     158             :          * a reference to the buffer, so this is safe to do.
     159             :          */
     160    34688820 :         bp->b_flags &= ~_XBF_DELWRI_Q;
     161             : 
     162             :         /*
     163             :          * Once the buffer is marked stale and unlocked, a subsequent lookup
     164             :          * could reset b_flags. There is no guarantee that the buffer is
     165             :          * unaccounted (released to LRU) before that occurs. Drop in-flight
     166             :          * status now to preserve accounting consistency.
     167             :          */
     168    34688820 :         spin_lock(&bp->b_lock);
     169    34689149 :         __xfs_buf_ioacct_dec(bp);
     170             : 
     171    34689144 :         atomic_set(&bp->b_lru_ref, 0);
     172    69380731 :         if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
     173    34688758 :             (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
     174     5429803 :                 atomic_dec(&bp->b_hold);
     175             : 
     176    34691973 :         ASSERT(atomic_read(&bp->b_hold) >= 1);
     177    34691973 :         spin_unlock(&bp->b_lock);
     178    34691400 : }
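
A hedged sketch of the usual caller pattern around xfs_buf_stale(), mirroring
the read-error path later in this file: the buffer is locked, its contents are
invalidated, and the hold is then dropped.

        xfs_buf_lock(bp);               /* xfs_buf_stale() asserts the lock */
        bp->b_flags &= ~XBF_DONE;       /* contents are no longer valid */
        xfs_buf_stale(bp);              /* zero b_lru_ref, drop the LRU ref */
        xfs_buf_relse(bp);              /* unlock and release our hold */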
     179             : 
     180             : static int
     181    69459104 : xfs_buf_get_maps(
     182             :         struct xfs_buf          *bp,
     183             :         int                     map_count)
     184             : {
     185    69459104 :         ASSERT(bp->b_maps == NULL);
     186    69459104 :         bp->b_map_count = map_count;
     187             : 
     188    69459104 :         if (map_count == 1) {
     189    69459064 :                 bp->b_maps = &bp->__b_map;
     190    69459064 :                 return 0;
     191             :         }
     192             : 
     193          40 :         bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
     194             :                                 KM_NOFS);
     195          40 :         if (!bp->b_maps)
     196           0 :                 return -ENOMEM;
     197             :         return 0;
     198             : }
     199             : 
     200             : /*
      201             :  *      Frees b_maps if it was allocated separately from the buffer.
     202             :  */
     203             : static void
     204             : xfs_buf_free_maps(
     205             :         struct xfs_buf  *bp)
     206             : {
     207    69264671 :         if (bp->b_maps != &bp->__b_map) {
     208          40 :                 kmem_free(bp->b_maps);
     209          40 :                 bp->b_maps = NULL;
     210             :         }
     211             : }
     212             : 
     213             : static int
     214    69463601 : _xfs_buf_alloc(
     215             :         struct xfs_buftarg      *target,
     216             :         struct xfs_buf_map      *map,
     217             :         int                     nmaps,
     218             :         xfs_buf_flags_t         flags,
     219             :         struct xfs_buf          **bpp)
     220             : {
     221    69463601 :         struct xfs_buf          *bp;
     222    69463601 :         int                     error;
     223    69463601 :         int                     i;
     224             : 
     225    69463601 :         *bpp = NULL;
     226    69463601 :         bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
     227             : 
     228             :         /*
     229             :          * We don't want certain flags to appear in b_flags unless they are
     230             :          * specifically set by later operations on the buffer.
     231             :          */
     232    69455750 :         flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
     233             : 
     234    69455750 :         atomic_set(&bp->b_hold, 1);
     235    69455750 :         atomic_set(&bp->b_lru_ref, 1);
     236    69455750 :         init_completion(&bp->b_iowait);
     237    69453890 :         INIT_LIST_HEAD(&bp->b_lru);
     238    69453890 :         INIT_LIST_HEAD(&bp->b_list);
     239    69453890 :         INIT_LIST_HEAD(&bp->b_li_list);
     240    69453890 :         sema_init(&bp->b_sema, 0); /* held, no waiters */
     241    69453890 :         spin_lock_init(&bp->b_lock);
     242    69459551 :         bp->b_target = target;
     243    69459551 :         bp->b_mount = target->bt_mount;
     244    69459551 :         bp->b_flags = flags;
     245             : 
     246             :         /*
     247             :          * Set length and io_length to the same value initially.
     248             :          * I/O routines should use io_length, which will be the same in
     249             :          * most cases but may be reset (e.g. XFS recovery).
     250             :          */
     251    69459551 :         error = xfs_buf_get_maps(bp, nmaps);
     252    69458587 :         if (error)  {
     253           0 :                 kmem_cache_free(xfs_buf_cache, bp);
     254           0 :                 return error;
     255             :         }
     256             : 
     257    69458587 :         bp->b_rhash_key = map[0].bm_bn;
     258    69458587 :         bp->b_length = 0;
     259   138923302 :         for (i = 0; i < nmaps; i++) {
     260    69464715 :                 bp->b_maps[i].bm_bn = map[i].bm_bn;
     261    69464715 :                 bp->b_maps[i].bm_len = map[i].bm_len;
     262    69464715 :                 bp->b_length += map[i].bm_len;
     263             :         }
     264             : 
     265    69458587 :         atomic_set(&bp->b_pin_count, 0);
     266    69458587 :         init_waitqueue_head(&bp->b_waiters);
     267             : 
     268    69460596 :         XFS_STATS_INC(bp->b_mount, xb_create);
     269    69460596 :         trace_xfs_buf_init(bp, _RET_IP_);
     270             : 
     271    69459757 :         *bpp = bp;
     272    69459757 :         return 0;
     273             : }
     274             : 
     275             : static void
     276      143482 : xfs_buf_free_pages(
     277             :         struct xfs_buf  *bp)
     278             : {
     279      143482 :         uint            i;
     280             : 
     281      143482 :         ASSERT(bp->b_flags & _XBF_PAGES);
     282             : 
     283      287101 :         for (i = 0; i < bp->b_page_count; i++) {
     284      143619 :                 if (bp->b_pages[i])
     285      143619 :                         __free_page(bp->b_pages[i]);
     286             :         }
     287      143482 :         mm_account_reclaimed_pages(bp->b_page_count);
     288             : 
     289      143482 :         xfs_buf_free_page_array(bp);
     290      143482 : }
     291             : 
     292             : void
     293      190601 : xfs_buf_free_page_array(
     294             :         struct xfs_buf  *bp)
     295             : {
     296      190601 :         ASSERT(bp->b_flags & _XBF_PAGES);
     297             : 
     298      190601 :         if (bp->b_pages != bp->b_page_array)
     299           0 :                 kmem_free(bp->b_pages);
     300      190601 :         bp->b_pages = NULL;
     301      190601 :         bp->b_flags &= ~_XBF_PAGES;
     302      190601 :         bp->b_page_count = 0;
     303      190601 : }
     304             : 
     305             : static void
     306    69264671 : xfs_buf_free_callback(
     307             :         struct callback_head    *cb)
     308             : {
     309    69264671 :         struct xfs_buf          *bp = container_of(cb, struct xfs_buf, b_rcu);
     310             : 
     311    69264671 :         xfs_buf_free_maps(bp);
     312    69264671 :         kmem_cache_free(xfs_buf_cache, bp);
     313    69294640 : }
     314             : 
     315             : static void
     316    69469052 : xfs_buf_free(
     317             :         struct xfs_buf          *bp)
     318             : {
     319    69469052 :         trace_xfs_buf_free(bp, _RET_IP_);
     320             : 
     321    69469177 :         ASSERT(list_empty(&bp->b_lru));
     322             : 
     323    69469177 :         if (xfs_buf_is_vmapped(bp))
     324         137 :                 vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
     325             : 
     326    69469191 :         if (bp->b_flags & _XBF_DIRECT_MAP)
     327       47118 :                 xfile_buf_unmap_pages(bp);
     328    69422073 :         else if (bp->b_flags & _XBF_PAGES)
     329      143482 :                 xfs_buf_free_pages(bp);
     330    69278591 :         else if (bp->b_flags & _XBF_KMEM)
     331    69278577 :                 kmem_free(bp->b_addr);
     332             : 
     333    69469188 :         call_rcu(&bp->b_rcu, xfs_buf_free_callback);
     334    69469146 : }
     335             : 
     336             : static int
     337    69273036 : xfs_buf_alloc_kmem(
     338             :         struct xfs_buf  *bp,
     339             :         xfs_buf_flags_t flags)
     340             : {
     341    69273036 :         xfs_km_flags_t  kmflag_mask = KM_NOFS;
     342    69273036 :         size_t          size = BBTOB(bp->b_length);
     343             : 
     344             :         /* Assure zeroed buffer for non-read cases. */
     345    69273036 :         if (!(flags & XBF_READ))
     346    34957157 :                 kmflag_mask |= KM_ZERO;
     347             : 
     348    69273036 :         bp->b_addr = kmem_alloc(size, kmflag_mask);
     349    69269972 :         if (!bp->b_addr)
     350             :                 return -ENOMEM;
     351             : 
     352    69269972 :         if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
     353             :             ((unsigned long)bp->b_addr & PAGE_MASK)) {
     354             :                 /* b_addr spans two pages - use alloc_page instead */
     355           0 :                 kmem_free(bp->b_addr);
     356           0 :                 bp->b_addr = NULL;
     357           0 :                 return -ENOMEM;
     358             :         }
     359    69269972 :         bp->b_offset = offset_in_page(bp->b_addr);
     360    69269972 :         bp->b_pages = bp->b_page_array;
     361    69269972 :         bp->b_pages[0] = kmem_to_page(bp->b_addr);
     362    69270040 :         bp->b_page_count = 1;
     363    69270040 :         bp->b_flags |= _XBF_KMEM;
     364    69270040 :         return 0;
     365             : }
     366             : 
     367             : /* Make sure that we have a page list */
     368             : int
     369      190588 : xfs_buf_alloc_page_array(
     370             :         struct xfs_buf  *bp,
     371             :         gfp_t           gfp_mask)
     372             : {
     373      190588 :         ASSERT(!(bp->b_flags & _XBF_PAGES));
     374             : 
     375      190588 :         bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
     376      190588 :         if (bp->b_page_count <= XB_PAGES) {
     377      190588 :                 bp->b_pages = bp->b_page_array;
     378             :         } else {
     379           0 :                 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
     380             :                                         gfp_mask);
     381           0 :                 if (!bp->b_pages)
     382             :                         return -ENOMEM;
     383             :         }
     384             : 
     385      190588 :         bp->b_flags |= _XBF_PAGES;
     386      190588 :         return 0;
     387             : }
     388             : 
     389             : static int
     390      143475 : xfs_buf_alloc_pages(
     391             :         struct xfs_buf  *bp,
     392             :         xfs_buf_flags_t flags)
     393             : {
     394      143475 :         gfp_t           gfp_mask = __GFP_NOWARN;
     395      143475 :         long            filled = 0;
     396      143475 :         int             error;
     397             : 
     398      143475 :         if (flags & XBF_READ_AHEAD)
     399             :                 gfp_mask |= __GFP_NORETRY;
     400             :         else
     401      143466 :                 gfp_mask |= GFP_NOFS;
     402             : 
     403      143475 :         error = xfs_buf_alloc_page_array(bp, gfp_mask);
     404      143474 :         if (error)
     405             :                 return error;
     406             : 
     407             :         /* Assure zeroed buffer for non-read cases. */
     408      143474 :         if (!(flags & XBF_READ))
     409      143091 :                 gfp_mask |= __GFP_ZERO;
     410             : 
     411             :         /*
     412             :          * Bulk filling of pages can take multiple calls. Not filling the entire
     413             :          * array is not an allocation failure, so don't back off if we get at
     414             :          * least one extra page.
     415             :          */
     416      143474 :         for (;;) {
     417      143474 :                 long    last = filled;
     418             : 
     419      143474 :                 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
     420             :                                                 bp->b_pages);
     421      143475 :                 if (filled == bp->b_page_count) {
     422      143475 :                         XFS_STATS_INC(bp->b_mount, xb_page_found);
     423      143475 :                         break;
     424             :                 }
     425             : 
     426           0 :                 if (filled != last)
     427           0 :                         continue;
     428             : 
     429           0 :                 if (flags & XBF_READ_AHEAD) {
     430           0 :                         xfs_buf_free_pages(bp);
     431           0 :                         return -ENOMEM;
     432             :                 }
     433             : 
     434           0 :                 XFS_STATS_INC(bp->b_mount, xb_page_retries);
     435           0 :                 memalloc_retry_wait(gfp_mask);
     436             :         }
     437             : 
     438      143475 :         bp->b_offset = 0;
     439      143475 :         return 0;
     440             : }
     441             : 
     442             : /*
     443             :  *      Map buffer into kernel address-space if necessary.
     444             :  */
     445             : STATIC int
     446      190591 : _xfs_buf_map_pages(
     447             :         struct xfs_buf          *bp,
     448             :         xfs_buf_flags_t         flags)
     449             : {
     450      190591 :         ASSERT(bp->b_flags & (_XBF_PAGES | _XBF_DIRECT_MAP));
     451             : 
     452      190591 :         if (bp->b_page_count == 1) {
     453             :                 /* A single page buffer is always mappable */
     454      190454 :                 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
     455         137 :         } else if (flags & XBF_UNMAPPED) {
     456           0 :                 bp->b_addr = NULL;
     457             :         } else {
     458         137 :                 int retried = 0;
     459         137 :                 unsigned nofs_flag;
     460             : 
     461             :                 /*
     462             :                  * vm_map_ram() will allocate auxiliary structures (e.g.
     463             :                  * pagetables) with GFP_KERNEL, yet we are likely to be under
     464             :                  * GFP_NOFS context here. Hence we need to tell memory reclaim
     465             :                  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
     466             :                  * memory reclaim re-entering the filesystem here and
     467             :                  * potentially deadlocking.
     468             :                  */
     469         137 :                 nofs_flag = memalloc_nofs_save();
     470         137 :                 do {
     471         137 :                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
     472             :                                                 -1);
     473         137 :                         if (bp->b_addr)
     474             :                                 break;
     475           0 :                         vm_unmap_aliases();
     476           0 :                 } while (retried++ <= 1);
     477         137 :                 memalloc_nofs_restore(nofs_flag);
     478             : 
     479         137 :                 if (!bp->b_addr)
     480             :                         return -ENOMEM;
     481             : 
     482         137 :                 bp->b_addr += bp->b_offset;
     483             :         }
     484             : 
     485             :         return 0;
     486             : }
     487             : 
     488             : /*
     489             :  *      Finding and Reading Buffers
     490             :  */
     491             : static int
     492 23555079665 : _xfs_buf_obj_cmp(
     493             :         struct rhashtable_compare_arg   *arg,
     494             :         const void                      *obj)
     495             : {
     496 23555079665 :         const struct xfs_buf_map        *map = arg->key;
     497 23555079665 :         const struct xfs_buf            *bp = obj;
     498             : 
     499             :         /*
     500             :          * The key hashing in the lookup path depends on the key being the
     501             :          * first element of the compare_arg, make sure to assert this.
     502             :          */
     503 23555079665 :         BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
     504             : 
     505 23555079665 :         if (bp->b_rhash_key != map->bm_bn)
     506             :                 return 1;
     507             : 
     508 18317476921 :         if (unlikely(bp->b_length != map->bm_len)) {
     509             :                 /*
     510             :                  * found a block number match. If the range doesn't
     511             :                  * match, the only way this is allowed is if the buffer
     512             :                  * in the cache is stale and the transaction that made
     513             :                  * it stale has not yet committed. i.e. we are
     514             :                  * reallocating a busy extent. Skip this buffer and
     515             :                  * continue searching for an exact match.
     516             :                  *
     517             :                  * Note: If we're scanning for incore buffers to stale, don't
     518             :                  * complain if we find non-stale buffers.
     519             :                  */
     520     3421193 :                 if (!(map->bm_flags & XBM_LIVESCAN))
     521           4 :                         ASSERT(bp->b_flags & XBF_STALE);
     522     3421193 :                 return 1;
     523             :         }
     524             :         return 0;
     525             : }
     526             : 
     527             : static const struct rhashtable_params xfs_buf_hash_params = {
     528             :         .min_size               = 32,   /* empty AGs have minimal footprint */
     529             :         .nelem_hint             = 16,
     530             :         .key_len                = sizeof(xfs_daddr_t),
     531             :         .key_offset             = offsetof(struct xfs_buf, b_rhash_key),
     532             :         .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
     533             :         .automatic_shrinking    = true,
     534             :         .obj_cmpfn              = _xfs_buf_obj_cmp,
     535             : };
     536             : 
     537             : int
     538      154732 : xfs_buf_cache_init(
     539             :         struct xfs_buf_cache    *bch)
     540             : {
     541      154732 :         spin_lock_init(&bch->bc_lock);
     542      154716 :         return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
     543             : }
     544             : 
     545             : void
     546      154772 : xfs_buf_cache_destroy(
     547             :         struct xfs_buf_cache    *bch)
     548             : {
     549      154772 :         rhashtable_destroy(&bch->bc_hash);
     550      154767 : }
     551             : 
     552             : static int
     553 18382920349 : xfs_buf_map_verify(
     554             :         struct xfs_buftarg      *btp,
     555             :         struct xfs_buf_map      *map)
     556             : {
     557 18382920349 :         xfs_daddr_t             eofs;
     558             : 
     559             :         /* Check for IOs smaller than the sector size / not sector aligned */
     560 18382920349 :         ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
     561 18382920349 :         ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
     562             : 
     563             :         /*
     564             :          * Corrupted block numbers can get through to here, unfortunately, so we
     565             :          * have to check that the buffer falls within the filesystem bounds.
     566             :          */
     567 18382920349 :         eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
     568 18382920349 :         if (map->bm_bn < 0 || map->bm_bn >= eofs) {
     569           0 :                 xfs_alert(btp->bt_mount,
     570             :                           "%s: daddr 0x%llx out of range, EOFS 0x%llx",
     571             :                           __func__, map->bm_bn, eofs);
     572           0 :                 WARN_ON(1);
     573           0 :                 return -EFSCORRUPTED;
     574             :         }
     575             :         return 0;
     576             : }
     577             : 
     578             : static int
     579 18332787149 : xfs_buf_find_lock(
     580             :         struct xfs_buf          *bp,
     581             :         xfs_buf_flags_t         flags)
     582             : {
     583 18332787149 :         if (flags & XBF_TRYLOCK) {
     584  4466259014 :                 if (!xfs_buf_trylock(bp)) {
     585    84242694 :                         XFS_STATS_INC(bp->b_mount, xb_busy_locked);
     586    84242694 :                         return -EAGAIN;
     587             :                 }
     588             :         } else {
     589 13866528135 :                 xfs_buf_lock(bp);
     590 13865941213 :                 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
     591             :         }
     592             : 
     593             :         /*
     594             :          * if the buffer is stale, clear all the external state associated with
     595             :          * it. We need to keep flags such as how we allocated the buffer memory
     596             :          * intact here.
     597             :          */
     598 18248125468 :         if (bp->b_flags & XBF_STALE) {
     599       40621 :                 if (flags & XBF_LIVESCAN) {
     600           0 :                         xfs_buf_unlock(bp);
     601           0 :                         return -ENOENT;
     602             :                 }
     603       40621 :                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
     604       40621 :                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES | _XBF_DIRECT_MAP;
     605       40621 :                 bp->b_ops = NULL;
     606             :         }
     607             :         return 0;
     608             : }
     609             : 
     610             : static inline int
     611 18391888009 : xfs_buf_lookup(
     612             :         struct xfs_buf_cache    *bch,
     613             :         struct xfs_buf_map      *map,
     614             :         xfs_buf_flags_t         flags,
     615             :         struct xfs_buf          **bpp)
     616             : {
     617 18391888009 :         struct xfs_buf          *bp;
     618 18391888009 :         int                     error;
     619             : 
     620 18391888009 :         rcu_read_lock();
     621 18390117456 :         bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
     622 36713176997 :         if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
     623    72832886 :                 rcu_read_unlock();
     624    72832886 :                 return -ENOENT;
     625             :         }
     626 18328434750 :         rcu_read_unlock();
     627             : 
     628 18327124466 :         error = xfs_buf_find_lock(bp, flags);
     629 18328614248 :         if (error) {
     630    84242655 :                 xfs_buf_rele(bp);
     631    84242655 :                 return error;
     632             :         }
     633             : 
     634 18244371593 :         trace_xfs_buf_find(bp, flags, _RET_IP_);
     635 18245316794 :         *bpp = bp;
     636 18245316794 :         return 0;
     637             : }
     638             : 
     639             : /*
     640             :  * Insert the new_bp into the hash table. This consumes the perag reference
     641             :  * taken for the lookup regardless of the result of the insert.
     642             :  */
     643             : static int
     644    69322947 : xfs_buf_find_insert(
     645             :         struct xfs_buftarg      *btp,
     646             :         struct xfs_buf_cache    *bch,
     647             :         struct xfs_perag        *pag,
     648             :         struct xfs_buf_map      *cmap,
     649             :         struct xfs_buf_map      *map,
     650             :         int                     nmaps,
     651             :         xfs_buf_flags_t         flags,
     652             :         struct xfs_buf          **bpp)
     653             : {
     654    69322947 :         struct xfs_buf          *new_bp;
     655    69322947 :         struct xfs_buf          *bp;
     656    69322947 :         int                     error;
     657             : 
     658    69322947 :         error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
     659    69319320 :         if (error)
     660           0 :                 goto out_drop_pag;
     661             : 
     662             :         /*
     663             :          * If the caller is ok with direct maps to xfile pages, try that.
     664             :          * ENOTBLK is the magic code to fall back to allocating memory.
     665             :          */
     666    69319320 :         if (xfile_buftarg_can_direct_map(btp)) {
     667       47113 :                 error = xfile_buf_map_pages(new_bp, flags);
     668       47113 :                 if (error && error != -ENOTBLK)
     669           0 :                         goto out_free_buf;
     670       47113 :                 if (!error)
     671       47113 :                         goto insert;
     672             :         }
     673             : 
     674             :         /*
     675             :          * For buffers that fit entirely within a single page, first attempt to
     676             :          * allocate the memory from the heap to minimise memory usage.
     677             :          */
     678    69272207 :         if (BBTOB(new_bp->b_length) < PAGE_SIZE) {
     679    69271490 :                 error = xfs_buf_alloc_kmem(new_bp, flags);
     680    69269746 :                 if (!error)
     681    69269901 :                         goto insert;
     682             :         }
     683             : 
     684             :         /*
     685             :          * For larger buffers or if we can't get heap memory for these small
     686             :          * buffers, fall back to using the page allocator.
     687             :          */
     688         785 :         error = xfs_buf_alloc_pages(new_bp, flags);
     689         717 :         if (error)
     690           0 :                 goto out_free_buf;
     691             : 
     692         717 : insert:
     693    69317731 :         spin_lock(&bch->bc_lock);
     694    69322726 :         bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
     695    69322726 :                         &new_bp->b_rhash_head, xfs_buf_hash_params);
     696    69319095 :         if (IS_ERR(bp)) {
     697           0 :                 error = PTR_ERR(bp);
     698           0 :                 spin_unlock(&bch->bc_lock);
     699           0 :                 goto out_free_buf;
     700             :         }
     701    69319095 :         if (bp) {
     702             :                 /* found an existing buffer */
     703        1044 :                 atomic_inc(&bp->b_hold);
     704        1044 :                 spin_unlock(&bch->bc_lock);
     705        1044 :                 error = xfs_buf_find_lock(bp, flags);
     706        1044 :                 if (error)
     707           0 :                         xfs_buf_rele(bp);
     708             :                 else
     709        1044 :                         *bpp = bp;
     710        1044 :                 goto out_free_buf;
     711             :         }
     712             : 
     713             :         /* The new buffer keeps the perag reference until it is freed. */
     714    69318051 :         new_bp->b_pag = pag;
     715    69318051 :         new_bp->b_cache = bch;
     716    69318051 :         spin_unlock(&bch->bc_lock);
     717    69319679 :         *bpp = new_bp;
     718    69319679 :         return 0;
     719             : 
     720        1044 : out_free_buf:
     721        1044 :         xfs_buf_free(new_bp);
     722        1044 : out_drop_pag:
     723        1044 :         if (pag)
     724        1044 :                 xfs_perag_put(pag);
     725             :         return error;
     726             : }
     727             : 
     728             : /* Find the buffer cache for a particular buftarg and map. */
     729             : static inline struct xfs_buf_cache *
     730 18383298053 : xfs_buftarg_get_cache(
     731             :         struct xfs_buftarg              *btp,
     732             :         const struct xfs_buf_map        *map,
     733             :         struct xfs_perag                **pagp)
     734             : {
     735 18383298053 :         struct xfs_mount                *mp = btp->bt_mount;
     736             : 
     737 18383298053 :         if (btp->bt_cache) {
     738  1030971234 :                 *pagp = NULL;
     739  1030971234 :                 return btp->bt_cache;
     740             :         }
     741             : 
     742 17352326819 :         *pagp = xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
     743 17362556898 :         ASSERT(*pagp != NULL);
     744 17362556898 :         return &(*pagp)->pag_bcache;
     745             : }
     746             : 
     747             : /*
     748             :  * Assembles a buffer covering the specified range. The code is optimised for
     749             :  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
     750             :  * more hits than misses.
     751             :  */
     752             : int
     753 18379576836 : xfs_buf_get_map(
     754             :         struct xfs_buftarg      *btp,
     755             :         struct xfs_buf_map      *map,
     756             :         int                     nmaps,
     757             :         xfs_buf_flags_t         flags,
     758             :         struct xfs_buf          **bpp)
     759             : {
     760 18379576836 :         struct xfs_buf_cache    *bch;
     761 18379576836 :         struct xfs_perag        *pag;
     762 18379576836 :         struct xfs_buf          *bp = NULL;
     763 18379576836 :         struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
     764 18379576836 :         int                     error;
     765 18379576836 :         int                     i;
     766             : 
     767 18379576836 :         if (flags & XBF_LIVESCAN)
     768     4034235 :                 cmap.bm_flags |= XBM_LIVESCAN;
     769 36761613483 :         for (i = 0; i < nmaps; i++)
     770 18382036647 :                 cmap.bm_len += map[i].bm_len;
     771             : 
     772 18379576836 :         error = xfs_buf_map_verify(btp, &cmap);
     773 18384370296 :         if (error)
     774             :                 return error;
     775             : 
     776 18384370296 :         bch = xfs_buftarg_get_cache(btp, &cmap, &pag);
     777             : 
     778 18389382474 :         error = xfs_buf_lookup(bch, &cmap, flags, &bp);
     779 18401910052 :         if (error && error != -ENOENT)
     780    84243260 :                 goto out_put_perag;
     781             : 
     782             :         /* cache hits always outnumber misses by at least 10:1 */
     783 18317666792 :         if (unlikely(!bp)) {
     784    72832672 :                 XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
     785             : 
     786    72832672 :                 if (flags & XBF_INCORE)
     787     3509619 :                         goto out_put_perag;
     788             : 
     789             :                 /* xfs_buf_find_insert() consumes the perag reference. */
     790    69323053 :                 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
     791             :                                 flags, &bp);
     792    69321256 :                 if (error)
     793             :                         return error;
     794             :         } else {
     795 18244834120 :                 XFS_STATS_INC(btp->bt_mount, xb_get_locked);
     796 18244834120 :                 if (pag)
     797 17213909244 :                         xfs_perag_put(pag);
     798             :         }
     799             : 
     800             :         /* We do not hold a perag reference anymore. */
     801 18316500792 :         if (!bp->b_addr) {
     802       47832 :                 error = _xfs_buf_map_pages(bp, flags);
     803       47834 :                 if (unlikely(error)) {
     804           0 :                         xfs_warn_ratelimited(btp->bt_mount,
     805             :                                 "%s: failed to map %u pages", __func__,
     806             :                                 bp->b_page_count);
     807           0 :                         xfs_buf_relse(bp);
     808           0 :                         return error;
     809             :                 }
     810             :         }
     811             : 
     812             :         /*
     813             :          * Clear b_error if this is a lookup from a caller that doesn't expect
     814             :          * valid data to be found in the buffer.
     815             :          */
     816 18316500794 :         if (!(flags & XBF_READ))
     817    60888943 :                 xfs_buf_ioerror(bp, 0);
     818             : 
     819 18316499238 :         XFS_STATS_INC(btp->bt_mount, xb_get);
     820 18316499238 :         trace_xfs_buf_get(bp, flags, _RET_IP_);
     821 18316735645 :         *bpp = bp;
     822 18316735645 :         return 0;
     823             : 
     824    87752879 : out_put_perag:
     825    87752879 :         if (pag)
     826    87752874 :                 xfs_perag_put(pag);
     827             :         return error;
     828             : }
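
A hedged usage sketch for the lookup path above, assuming a caller that wants a
locked, cached buffer over a single contiguous range; btp, daddr and numblks
are placeholders supplied by that caller.

        DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);
        struct xfs_buf          *bp;
        int                     error;

        error = xfs_buf_get_map(btp, &map, 1, 0, &bp);
        if (error)
                return error;
        /* ... fill, log or write the buffer ... */
        xfs_buf_relse(bp);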
     829             : 
     830             : int
     831    34344114 : _xfs_buf_read(
     832             :         struct xfs_buf          *bp,
     833             :         xfs_buf_flags_t         flags)
     834             : {
     835    34344114 :         ASSERT(!(flags & XBF_WRITE));
     836    34344114 :         ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
     837             : 
     838    34344114 :         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
     839    34344114 :         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
     840             : 
     841    34344114 :         return xfs_buf_submit(bp);
     842             : }
     843             : 
     844             : /*
     845             :  * Reverify a buffer found in cache without an attached ->b_ops.
     846             :  *
     847             :  * If the caller passed an ops structure and the buffer doesn't have ops
     848             :  * assigned, set the ops and use it to verify the contents. If verification
     849             :  * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
     850             :  * already in XBF_DONE state on entry.
     851             :  *
     852             :  * Under normal operations, every in-core buffer is verified on read I/O
     853             :  * completion. There are two scenarios that can lead to in-core buffers without
     854             :  * an assigned ->b_ops. The first is during log recovery of buffers on a V4
     855             :  * filesystem, though these buffers are purged at the end of recovery. The
     856             :  * other is online repair, which intentionally reads with a NULL buffer ops to
     857             :  * run several verifiers across an in-core buffer in order to establish buffer
     858             :  * type.  If repair can't establish that, the buffer will be left in memory
     859             :  * with NULL buffer ops.
     860             :  */
     861             : int
     862 18843054056 : xfs_buf_reverify(
     863             :         struct xfs_buf          *bp,
     864             :         const struct xfs_buf_ops *ops)
     865             : {
     866 18843054056 :         ASSERT(bp->b_flags & XBF_DONE);
     867 18843054056 :         ASSERT(bp->b_error == 0);
     868             : 
     869 18843054056 :         if (!ops || bp->b_ops)
     870             :                 return 0;
     871             : 
     872        1749 :         bp->b_ops = ops;
     873        1749 :         bp->b_ops->verify_read(bp);
     874        1749 :         if (bp->b_error)
     875        1747 :                 bp->b_flags &= ~XBF_DONE;
     876             :         return bp->b_error;
     877             : }
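
A hedged example of the reverify path above, assuming a buffer that was read
with NULL ops and a caller that now knows the expected type; xfs_sb_buf_ops is
used purely for illustration.

        error = xfs_buf_reverify(bp, &xfs_sb_buf_ops);
        if (error) {
                /* verifier failed: XBF_DONE has been cleared */
                xfs_buf_relse(bp);
                return error;
        }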
     878             : 
     879             : int
     880 18316242146 : xfs_buf_read_map(
     881             :         struct xfs_buftarg      *target,
     882             :         struct xfs_buf_map      *map,
     883             :         int                     nmaps,
     884             :         xfs_buf_flags_t         flags,
     885             :         struct xfs_buf          **bpp,
     886             :         const struct xfs_buf_ops *ops,
     887             :         xfs_failaddr_t          fa)
     888             : {
     889 18316242146 :         struct xfs_buf          *bp;
     890 18316242146 :         int                     error;
     891             : 
     892 18316242146 :         flags |= XBF_READ;
     893 18316242146 :         *bpp = NULL;
     894             : 
     895 18316242146 :         error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
     896 18338293310 :         if (error)
     897             :                 return error;
     898             : 
     899 18254412771 :         trace_xfs_buf_read(bp, flags, _RET_IP_);
     900             : 
     901 18253835281 :         if (!(bp->b_flags & XBF_DONE)) {
     902             :                 /* Initiate the buffer read and wait. */
     903    34332971 :                 XFS_STATS_INC(target->bt_mount, xb_get_read);
     904    34332971 :                 bp->b_ops = ops;
     905    34332971 :                 error = _xfs_buf_read(bp, flags);
     906             : 
     907             :                 /* Readahead iodone already dropped the buffer, so exit. */
     908    33283262 :                 if (flags & XBF_ASYNC)
     909             :                         return 0;
     910             :         } else {
     911             :                 /* Buffer already read; all we need to do is check it. */
     912 18219502310 :                 error = xfs_buf_reverify(bp, ops);
     913             : 
     914             :                 /* Readahead already finished; drop the buffer and exit. */
     915 18225474667 :                 if (flags & XBF_ASYNC) {
     916  4316360991 :                         xfs_buf_relse(bp);
     917  4316108798 :                         return 0;
     918             :                 }
     919             : 
     920             :                 /* We do not want read in the flags */
     921 13909113676 :                 bp->b_flags &= ~XBF_READ;
     922 13909113676 :                 ASSERT(bp->b_ops != NULL || ops == NULL);
     923             :         }
     924             : 
     925             :         /*
     926             :          * If we've had a read error, then the contents of the buffer are
     927             :          * invalid and should not be used. To ensure that a followup read tries
     928             :          * to pull the buffer from disk again, we clear the XBF_DONE flag and
     929             :          * mark the buffer stale. This ensures that anyone who has a current
      930             :  * reference to the buffer will interpret its contents correctly and
     931             :          * future cache lookups will also treat it as an empty, uninitialised
     932             :          * buffer.
     933             :          */
     934 13922120360 :         if (error) {
     935             :                 /*
     936             :                  * Check against log shutdown for error reporting because
     937             :                  * metadata writeback may require a read first and we need to
     938             :                  * report errors in metadata writeback until the log is shut
     939             :                  * down. High level transaction read functions already check
     940             :                  * against mount shutdown, anyway, so we only need to be
     941             :                  * concerned about low level IO interactions here.
     942             :                  */
     943       90926 :                 if (!xlog_is_shutdown(target->bt_mount->m_log))
     944       33097 :                         xfs_buf_ioerror_alert(bp, fa);
     945             : 
     946       45497 :                 bp->b_flags &= ~XBF_DONE;
     947       45497 :                 xfs_buf_stale(bp);
     948       45500 :                 xfs_buf_relse(bp);
     949             : 
     950             :                 /* bad CRC means corrupted metadata */
     951       45500 :                 if (error == -EFSBADCRC)
     952         663 :                         error = -EFSCORRUPTED;
     953       45500 :                 return error;
     954             :         }
     955             : 
     956 13922074897 :         *bpp = bp;
     957 13922074897 :         return 0;
     958             : }
     959             : 
     960             : /*
     961             :  *      If we are not low on memory then do the readahead in a deadlock
     962             :  *      safe manner.
     963             :  */
     964             : void
     965  4416880000 : xfs_buf_readahead_map(
     966             :         struct xfs_buftarg      *target,
     967             :         struct xfs_buf_map      *map,
     968             :         int                     nmaps,
     969             :         const struct xfs_buf_ops *ops)
     970             : {
     971  4416880000 :         struct xfs_buf          *bp;
     972             : 
     973  8834018242 :         xfs_buf_read_map(target, map, nmaps,
     974             :                      XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
     975  4416880000 :                      __this_address);
     976  4418517337 : }
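
A hedged readahead sketch against the function above, with placeholder
daddr/numblks/ops; because readahead is XBF_ASYNC, I/O completion drops the
buffer and the caller releases nothing here.

        DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);

        xfs_buf_readahead_map(target, &map, 1, ops);
        /* no xfs_buf_relse(): the async completion drops the hold */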
     977             : 
     978             : /*
     979             :  * Read an uncached buffer from disk. Allocates and returns a locked
     980             :  * buffer containing the disk contents or nothing. Uncached buffers always have
     981             :  * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
     982             :  * is cached or uncached during fault diagnosis.
     983             :  */
     984             : int
     985       72704 : xfs_buf_read_uncached(
     986             :         struct xfs_buftarg      *target,
     987             :         xfs_daddr_t             daddr,
     988             :         size_t                  numblks,
     989             :         xfs_buf_flags_t         flags,
     990             :         struct xfs_buf          **bpp,
     991             :         const struct xfs_buf_ops *ops)
     992             : {
     993       72704 :         struct xfs_buf          *bp;
     994       72704 :         int                     error;
     995             : 
     996       72704 :         *bpp = NULL;
     997             : 
     998       72704 :         error = xfs_buf_get_uncached(target, numblks, flags, &bp);
     999       72704 :         if (error)
    1000             :                 return error;
    1001             : 
    1002             :         /* set up the buffer for a read IO */
    1003       72704 :         ASSERT(bp->b_map_count == 1);
    1004       72704 :         bp->b_rhash_key = XFS_BUF_DADDR_NULL;
    1005       72704 :         bp->b_maps[0].bm_bn = daddr;
    1006       72704 :         bp->b_flags |= XBF_READ;
    1007       72704 :         bp->b_ops = ops;
    1008             : 
    1009       72704 :         xfs_buf_submit(bp);
    1010       72704 :         if (bp->b_error) {
    1011          16 :                 error = bp->b_error;
    1012          16 :                 xfs_buf_relse(bp);
    1013          16 :                 return error;
    1014             :         }
    1015             : 
    1016       72688 :         *bpp = bp;
    1017       72688 :         return 0;
    1018             : }
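
A hedged example of the uncached read API above; mp->m_ddev_targp is the data
device buftarg, XFS_FSB_TO_BB() converts one filesystem block to basic blocks,
and daddr is a placeholder.

        struct xfs_buf          *bp;
        int                     error;

        error = xfs_buf_read_uncached(mp->m_ddev_targp, daddr,
                        XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
        if (error)
                return error;
        /* ... inspect bp->b_addr ... */
        xfs_buf_relse(bp);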
    1019             : 
    1020             : int
    1021      142758 : xfs_buf_get_uncached(
    1022             :         struct xfs_buftarg      *target,
    1023             :         size_t                  numblks,
    1024             :         xfs_buf_flags_t         flags,
    1025             :         struct xfs_buf          **bpp)
    1026             : {
    1027      142758 :         int                     error;
    1028      142758 :         struct xfs_buf          *bp;
    1029      142758 :         DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
    1030             : 
    1031      142758 :         *bpp = NULL;
    1032             : 
    1033             :         /* flags might contain irrelevant bits, pass only what we care about */
    1034      142758 :         error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
    1035      142758 :         if (error)
    1036             :                 return error;
    1037             : 
    1038      142758 :         error = xfs_buf_alloc_pages(bp, flags);
    1039      142758 :         if (error)
    1040           0 :                 goto fail_free_buf;
    1041             : 
    1042      142758 :         error = _xfs_buf_map_pages(bp, 0);
    1043      142758 :         if (unlikely(error)) {
    1044           0 :                 xfs_warn(target->bt_mount,
    1045             :                         "%s: failed to map pages", __func__);
    1046           0 :                 goto fail_free_buf;
    1047             :         }
    1048             : 
    1049      142758 :         trace_xfs_buf_get_uncached(bp, _RET_IP_);
    1050      142758 :         *bpp = bp;
    1051      142758 :         return 0;
    1052             : 
    1053           0 : fail_free_buf:
    1054           0 :         xfs_buf_free(bp);
    1055           0 :         return error;
    1056             : }
    1057             : 
    1058             : /*
     1059             :  *      Increment the reference count on a buffer, to hold it concurrently
     1060             :  *      with another thread which may release (free) the buffer asynchronously.
     1061             :  *      The caller must already hold a reference to the buffer when calling this.
    1062             :  */
    1063             : void
    1064  5539959262 : xfs_buf_hold(
    1065             :         struct xfs_buf          *bp)
    1066             : {
    1067  5539959262 :         trace_xfs_buf_hold(bp, _RET_IP_);
    1068  5540199907 :         atomic_inc(&bp->b_hold);
    1069  5540229028 : }
    1070             : 
    1071             : /*
    1072             :  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
     1073             :  * placed on the LRU or freed (depending on b_lru_ref).
    1074             :  */
    1075             : void
    1076 24040241699 : xfs_buf_rele(
    1077             :         struct xfs_buf          *bp)
    1078             : {
    1079 24040241699 :         struct xfs_perag        *pag = bp->b_pag;
    1080 24040241699 :         struct xfs_buf_cache    *bch = bp->b_cache;
    1081 24040241699 :         bool                    release;
    1082 24040241699 :         bool                    freebuf = false;
    1083             : 
    1084 24040241699 :         trace_xfs_buf_rele(bp, _RET_IP_);
    1085             : 
    1086 24052309746 :         if (!bch) {
    1087    11940596 :                 ASSERT(list_empty(&bp->b_lru));
    1088    23881213 :                 if (atomic_dec_and_test(&bp->b_hold)) {
    1089      142765 :                         xfs_buf_ioacct_dec(bp);
    1090      142765 :                         xfs_buf_free(bp);
    1091             :                 }
    1092    11940617 :                 return;
    1093             :         }
    1094             : 
    1095 24040369150 :         ASSERT(atomic_read(&bp->b_hold) > 0);
    1096             : 
    1097             :         /*
    1098             :          * We grab the b_lock here first to serialise racing xfs_buf_rele()
     1099             :          * calls. The cache's bc_lock is only taken on the final release, and it
     1100             :          * only serialises against racing lookups in xfs_buf_find(). In other
     1101             :          * words, the second-to-last reference we drop here is not serialised
     1102             :          * against the last reference until we take bp->b_lock. Hence if we don't
     1103             :          * grab b_lock first, the last "release" reference can win the race to the
     1104             :          * lock and free the buffer before the second-to-last reference is
     1105             :          * processed, leading to a use-after-free scenario.
    1106             :          */
    1107 24040369150 :         spin_lock(&bp->b_lock);
    1108 24058591022 :         release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
    1109 24077806396 :         if (!release) {
    1110             :                 /*
    1111             :                  * Drop the in-flight state if the buffer is already on the LRU
    1112             :                  * and it holds the only reference. This is racy because we
     1113             :                  * haven't acquired the cache lock, but the use of _XBF_IN_FLIGHT
     1114             :                  * ensures the decrement occurs only once per buffer.
    1115             :                  */
    1116 23965427891 :                 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
    1117  6811629317 :                         __xfs_buf_ioacct_dec(bp);
    1118 23965481109 :                 goto out_unlock;
    1119             :         }
    1120             : 
    1121             :         /* the last reference has been dropped ... */
    1122   112378505 :         __xfs_buf_ioacct_dec(bp);
    1123   112378578 :         if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
    1124             :                 /*
     1125             :                  * If the buffer is added to the LRU, take a new reference to the
     1126             :                  * buffer for the LRU and clear the (now stale) dispose list
     1127             :                  * state flag.
    1128             :                  */
    1129    43053192 :                 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
    1130    43053590 :                         bp->b_state &= ~XFS_BSTATE_DISPOSE;
    1131    43053590 :                         atomic_inc(&bp->b_hold);
    1132             :                 }
    1133    43053602 :                 spin_unlock(&bch->bc_lock);
    1134             :         } else {
    1135             :                 /*
     1136             :                  * Most of the time buffers will already have been removed from
     1137             :                  * the LRU, so optimise that case by checking for the
     1138             :                  * XFS_BSTATE_DISPOSE flag, which indicates that the last list
     1139             :                  * the buffer was on was the disposal list.
    1140             :                  */
    1141    69325386 :                 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
    1142    32174025 :                         list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
    1143             :                 } else {
    1144    37151361 :                         ASSERT(list_empty(&bp->b_lru));
    1145             :                 }
    1146             : 
    1147    69325389 :                 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1148    69325389 :                 rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
    1149             :                                 xfs_buf_hash_params);
    1150    69325322 :                 spin_unlock(&bch->bc_lock);
    1151    69325382 :                 if (pag)
    1152    69278263 :                         xfs_perag_put(pag);
    1153    69325374 :                 bp->b_cache = NULL;
    1154    69325374 :                 bp->b_pag = NULL;
    1155    69325374 :                 freebuf = true;
    1156             :         }
    1157             : 
    1158 24077860085 : out_unlock:
    1159 24077860085 :         spin_unlock(&bp->b_lock);
    1160             : 
    1161 24074192886 :         if (freebuf)
    1162    69325299 :                 xfs_buf_free(bp);
    1163             : }
    1164             : 
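A hedged sketch of the hold/release pairing described above follows; it is illustrative only and assumes xfs_buf_set_ref() (which sets b_lru_ref) is available to the caller.

        /*
         * Illustrative only: pin a buffer across a window in which another
         * thread may drop its own reference, then release it.  Clearing
         * b_lru_ref first makes the final xfs_buf_rele() free the buffer
         * rather than parking it on the LRU.
         */
        xfs_buf_hold(bp);               /* extra hold for the async window */
        /* ... hand bp off, do other work ... */
        xfs_buf_set_ref(bp, 0);         /* don't keep it cached on the LRU */
        xfs_buf_rele(bp);               /* may free bp if this was the last hold */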
    1165             : 
    1166             : /*
    1167             :  *      Lock a buffer object, if it is not already locked.
    1168             :  *
    1169             :  *      If we come across a stale, pinned, locked buffer, we know that we are
    1170             :  *      being asked to lock a buffer that has been reallocated. Because it is
    1171             :  *      pinned, we know that the log has not been pushed to disk and hence it
    1172             :  *      will still be locked.  Rather than continuing to have trylock attempts
    1173             :  *      fail until someone else pushes the log, push it ourselves before
    1174             :  *      returning.  This means that the xfsaild will not get stuck trying
    1175             :  *      to push on stale inode buffers.
    1176             :  */
    1177             : int
    1178  4624991111 : xfs_buf_trylock(
    1179             :         struct xfs_buf          *bp)
    1180             : {
    1181  4624991111 :         int                     locked;
    1182             : 
    1183  4624991111 :         locked = down_trylock(&bp->b_sema) == 0;
    1184  4625024374 :         if (locked)
    1185  4540113931 :                 trace_xfs_buf_trylock(bp, _RET_IP_);
    1186             :         else
    1187    84910443 :                 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
    1188  4625109922 :         return locked;
    1189             : }
    1190             : 
    1191             : /*
    1192             :  *      Lock a buffer object.
    1193             :  *
    1194             :  *      If we come across a stale, pinned, locked buffer, we know that we
    1195             :  *      are being asked to lock a buffer that has been reallocated. Because
    1196             :  *      it is pinned, we know that the log has not been pushed to disk and
    1197             :  *      hence it will still be locked. Rather than sleeping until someone
    1198             :  *      else pushes the log, push it ourselves before trying to get the lock.
    1199             :  */
    1200             : void
    1201 13898746217 : xfs_buf_lock(
    1202             :         struct xfs_buf          *bp)
    1203             : {
    1204 13898746217 :         trace_xfs_buf_lock(bp, _RET_IP_);
    1205             : 
    1206 13897639167 :         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
    1207       27972 :                 xfs_log_force(bp->b_mount, 0);
    1208 13897639167 :         down(&bp->b_sema);
    1209             : 
    1210 13897778229 :         trace_xfs_buf_lock_done(bp, _RET_IP_);
    1211 13899895709 : }
    1212             : 
    1213             : void
    1214 18488355095 : xfs_buf_unlock(
    1215             :         struct xfs_buf          *bp)
    1216             : {
    1217 18488355095 :         ASSERT(xfs_buf_islocked(bp));
    1218             : 
    1219 18488355095 :         up(&bp->b_sema);
    1220 18492919246 :         trace_xfs_buf_unlock(bp, _RET_IP_);
    1221 18486213384 : }
    1222             : 
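A hedged sketch of how the trylock/lock pair above is typically combined by callers that cannot always sleep; "can_sleep" is a hypothetical flag standing in for the caller's context.

        /*
         * Illustrative only: try for the buffer lock first, and fall back to
         * the blocking xfs_buf_lock() (which may push the log for pinned,
         * stale buffers) only when sleeping is allowed.
         */
        if (!xfs_buf_trylock(bp)) {
                if (!can_sleep)
                        return -EAGAIN; /* hypothetical: caller retries later */
                xfs_buf_lock(bp);
        }
        /* ... buffer is locked: modify, submit, or inspect it ... */
        xfs_buf_unlock(bp);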
    1223             : STATIC void
    1224   100813491 : xfs_buf_wait_unpin(
    1225             :         struct xfs_buf          *bp)
    1226             : {
    1227   100813491 :         DECLARE_WAITQUEUE       (wait, current);
    1228             : 
    1229   100813491 :         if (atomic_read(&bp->b_pin_count) == 0)
    1230   100807141 :                 return;
    1231             : 
    1232        6350 :         add_wait_queue(&bp->b_waiters, &wait);
    1233       19048 :         for (;;) {
    1234       12699 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1235       12699 :                 if (atomic_read(&bp->b_pin_count) == 0)
    1236             :                         break;
    1237        6349 :                 io_schedule();
    1238             :         }
    1239        6350 :         remove_wait_queue(&bp->b_waiters, &wait);
    1240        6350 :         set_current_state(TASK_RUNNING);
    1241             : }
    1242             : 
    1243             : static void
    1244        1898 : xfs_buf_ioerror_alert_ratelimited(
    1245             :         struct xfs_buf          *bp)
    1246             : {
    1247        1898 :         static unsigned long    lasttime;
    1248        1898 :         static struct xfs_buftarg *lasttarg;
    1249             : 
    1250        1898 :         if (bp->b_target != lasttarg ||
    1251        1698 :             time_after(jiffies, (lasttime + 5*HZ))) {
    1252         202 :                 lasttime = jiffies;
    1253         202 :                 xfs_buf_ioerror_alert(bp, __this_address);
    1254             :         }
    1255        1898 :         lasttarg = bp->b_target;
    1256        1898 : }
    1257             : 
    1258             : /*
    1259             :  * Account for this latest trip around the retry handler, and decide if
    1260             :  * we've failed enough times to constitute a permanent failure.
    1261             :  */
    1262             : static bool
    1263        1485 : xfs_buf_ioerror_permanent(
    1264             :         struct xfs_buf          *bp,
    1265             :         struct xfs_error_cfg    *cfg)
    1266             : {
    1267        1485 :         struct xfs_mount        *mp = bp->b_mount;
    1268             : 
    1269        1485 :         if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
    1270          17 :             ++bp->b_retries > cfg->max_retries)
    1271             :                 return true;
    1272        1468 :         if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1273           0 :             time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
    1274             :                 return true;
    1275             : 
    1276             :         /* At unmount we may treat errors differently */
    1277        2936 :         if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
    1278           4 :                 return true;
    1279             : 
    1280             :         return false;
    1281             : }
    1282             : 
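As a worked example of the checks above (hypothetical configuration): with cfg->max_retries = 3 and no retry_timeout, the first three trips through this function pre-increment b_retries to 1, 2 and 3 and return false; the fourth trip increments it to 4, which exceeds max_retries, so the failure is treated as permanent. Likewise, once retry_timeout is configured and jiffies passes b_first_retry_time + retry_timeout, the next trip returns true regardless of the retry count.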
    1283             : /*
    1284             :  * On a sync write or shutdown we just want to stale the buffer and let the
    1285             :  * caller handle the error in bp->b_error appropriately.
    1286             :  *
    1287             :  * If the write was asynchronous then no one will be looking for the error.  If
    1288             :  * this is the first failure of this type, clear the error state and write the
    1289             :  * buffer out again. This means we always retry an async write failure at least
    1290             :  * once, but we also need to set the buffer up to behave correctly now for
    1291             :  * repeated failures.
    1292             :  *
    1293             :  * If we get repeated async write failures, then we take action according to the
    1294             :  * error configuration we have been set up to use.
    1295             :  *
    1296             :  * Returns true if this function took care of error handling and the caller must
     1297             :  * not touch the buffer again.  Returns false if the caller should proceed with
    1298             :  * normal I/O completion handling.
    1299             :  */
    1300             : static bool
    1301     2626053 : xfs_buf_ioend_handle_error(
    1302             :         struct xfs_buf          *bp)
    1303             : {
    1304     2626053 :         struct xfs_mount        *mp = bp->b_mount;
    1305     2626053 :         struct xfs_error_cfg    *cfg;
    1306             : 
    1307             :         /*
    1308             :          * If we've already shutdown the journal because of I/O errors, there's
    1309             :          * no point in giving this a retry.
    1310             :          */
    1311     5252106 :         if (xlog_is_shutdown(mp->m_log))
    1312     2624155 :                 goto out_stale;
    1313             : 
    1314        1898 :         xfs_buf_ioerror_alert_ratelimited(bp);
    1315             : 
    1316             :         /*
    1317             :          * We're not going to bother about retrying this during recovery.
    1318             :          * One strike!
    1319             :          */
    1320        1898 :         if (bp->b_flags & _XBF_LOGRECOVERY) {
    1321           0 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1322           0 :                 return false;
    1323             :         }
    1324             : 
    1325             :         /*
    1326             :          * Synchronous writes will have callers process the error.
    1327             :          */
    1328        1898 :         if (!(bp->b_flags & XBF_ASYNC))
    1329         177 :                 goto out_stale;
    1330             : 
    1331        1721 :         trace_xfs_buf_iodone_async(bp, _RET_IP_);
    1332             : 
    1333        1721 :         cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
    1334        1721 :         if (bp->b_last_error != bp->b_error ||
    1335        1485 :             !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
    1336         236 :                 bp->b_last_error = bp->b_error;
    1337         236 :                 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1338         178 :                     !bp->b_first_retry_time)
    1339         178 :                         bp->b_first_retry_time = jiffies;
    1340         236 :                 goto resubmit;
    1341             :         }
    1342             : 
    1343             :         /*
    1344             :          * Permanent error - we need to trigger a shutdown if we haven't already
    1345             :          * to indicate that inconsistency will result from this action.
    1346             :          */
    1347        1485 :         if (xfs_buf_ioerror_permanent(bp, cfg)) {
    1348          21 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1349          21 :                 goto out_stale;
    1350             :         }
    1351             : 
    1352             :         /* Still considered a transient error. Caller will schedule retries. */
    1353        1464 :         if (bp->b_flags & _XBF_INODES)
    1354           0 :                 xfs_buf_inode_io_fail(bp);
    1355        1464 :         else if (bp->b_flags & _XBF_DQUOTS)
    1356          40 :                 xfs_buf_dquot_io_fail(bp);
    1357             :         else
    1358        1424 :                 ASSERT(list_empty(&bp->b_li_list));
    1359        1464 :         xfs_buf_ioerror(bp, 0);
    1360        1464 :         xfs_buf_relse(bp);
    1361        1464 :         return true;
    1362             : 
    1363             : resubmit:
    1364         236 :         xfs_buf_ioerror(bp, 0);
    1365         236 :         bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
    1366         236 :         xfs_buf_submit(bp);
    1367         236 :         return true;
    1368     2624353 : out_stale:
    1369     2624353 :         xfs_buf_stale(bp);
    1370     2624353 :         bp->b_flags |= XBF_DONE;
    1371     2624353 :         bp->b_flags &= ~XBF_WRITE;
    1372     2624353 :         trace_xfs_buf_error_relse(bp, _RET_IP_);
    1373     2624353 :         return false;
    1374             : }
    1375             : 
    1376             : static void
    1377   137854553 : xfs_buf_ioend(
    1378             :         struct xfs_buf  *bp)
    1379             : {
    1380   137854553 :         trace_xfs_buf_iodone(bp, _RET_IP_);
    1381             : 
    1382             :         /*
    1383             :          * Pull in IO completion errors now. We are guaranteed to be running
    1384             :          * single threaded, so we don't need the lock to read b_io_error.
    1385             :          */
    1386   137854552 :         if (!bp->b_error && bp->b_io_error)
    1387       58740 :                 xfs_buf_ioerror(bp, bp->b_io_error);
    1388             : 
    1389   137854551 :         if (bp->b_flags & XBF_READ) {
    1390    34417189 :                 if (!bp->b_error && bp->b_ops)
    1391    31974562 :                         bp->b_ops->verify_read(bp);
    1392    34417194 :                 if (!bp->b_error)
    1393    34346031 :                         bp->b_flags |= XBF_DONE;
    1394             :         } else {
    1395   103437362 :                 if (!bp->b_error) {
    1396   100811308 :                         bp->b_flags &= ~XBF_WRITE_FAIL;
    1397   100811308 :                         bp->b_flags |= XBF_DONE;
    1398             :                 }
    1399             : 
    1400   103437362 :                 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
    1401             :                         return;
    1402             : 
    1403             :                 /* clear the retry state */
    1404   103435662 :                 bp->b_last_error = 0;
    1405   103435662 :                 bp->b_retries = 0;
    1406   103435662 :                 bp->b_first_retry_time = 0;
    1407             : 
    1408             :                 /*
    1409             :                  * Note that for things like remote attribute buffers, there may
    1410             :                  * not be a buffer log item here, so processing the buffer log
    1411             :                  * item must remain optional.
    1412             :                  */
    1413   103435662 :                 if (bp->b_log_item)
    1414    57430618 :                         xfs_buf_item_done(bp);
    1415             : 
    1416   103435663 :                 if (bp->b_flags & _XBF_INODES)
    1417    31251132 :                         xfs_buf_inode_iodone(bp);
    1418    72184531 :                 else if (bp->b_flags & _XBF_DQUOTS)
    1419     9413354 :                         xfs_buf_dquot_iodone(bp);
    1420             : 
    1421             :         }
    1422             : 
    1423   137852857 :         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
    1424             :                          _XBF_LOGRECOVERY);
    1425             : 
    1426   137852857 :         if (bp->b_flags & XBF_ASYNC)
    1427   102249661 :                 xfs_buf_relse(bp);
    1428             :         else
    1429    35603196 :                 complete(&bp->b_iowait);
    1430             : }
    1431             : 
    1432             : static void
    1433   135106325 : xfs_buf_ioend_work(
    1434             :         struct work_struct      *work)
    1435             : {
    1436   135106325 :         struct xfs_buf          *bp =
    1437   135106325 :                 container_of(work, struct xfs_buf, b_ioend_work);
    1438             : 
    1439   135106325 :         xfs_buf_ioend(bp);
    1440   135106325 : }
    1441             : 
    1442             : static void
    1443   135106325 : xfs_buf_ioend_async(
    1444             :         struct xfs_buf  *bp)
    1445             : {
    1446   135106325 :         INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
    1447   135106325 :         queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
    1448   135106325 : }
    1449             : 
    1450             : void
    1451    63589576 : __xfs_buf_ioerror(
    1452             :         struct xfs_buf          *bp,
    1453             :         int                     error,
    1454             :         xfs_failaddr_t          failaddr)
    1455             : {
    1456    63589576 :         ASSERT(error <= 0 && error >= -1000);
    1457    63589576 :         bp->b_error = error;
    1458    63589576 :         trace_xfs_buf_ioerror(bp, error, failaddr);
    1459    63590338 : }
    1460             : 
    1461             : void
    1462       33306 : xfs_buf_ioerror_alert(
    1463             :         struct xfs_buf          *bp,
    1464             :         xfs_failaddr_t          func)
    1465             : {
    1466       33306 :         xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
    1467             :                 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
    1468             :                                   func, (uint64_t)xfs_buf_daddr(bp),
    1469       33306 :                                   bp->b_length, -bp->b_error);
    1470       33333 : }
    1471             : 
    1472             : /*
    1473             :  * To simulate an I/O failure, the buffer must be locked and held with at least
    1474             :  * three references. The LRU reference is dropped by the stale call. The buf
    1475             :  * item reference is dropped via ioend processing. The third reference is owned
    1476             :  * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
    1477             :  */
    1478             : void
    1479     2635110 : xfs_buf_ioend_fail(
    1480             :         struct xfs_buf  *bp)
    1481             : {
    1482     2635110 :         bp->b_flags &= ~XBF_DONE;
    1483     2635110 :         xfs_buf_stale(bp);
    1484     2635113 :         xfs_buf_ioerror(bp, -EIO);
    1485     2635113 :         xfs_buf_ioend(bp);
    1486     2635114 : }
    1487             : 
    1488             : int
    1489       95876 : xfs_bwrite(
    1490             :         struct xfs_buf          *bp)
    1491             : {
    1492       95876 :         int                     error;
    1493             : 
    1494       95876 :         ASSERT(xfs_buf_islocked(bp));
    1495             : 
    1496       95876 :         bp->b_flags |= XBF_WRITE;
    1497       95876 :         bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
    1498             :                          XBF_DONE);
    1499             : 
    1500       95876 :         error = xfs_buf_submit(bp);
    1501       95876 :         if (error)
    1502         177 :                 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
    1503       95876 :         return error;
    1504             : }
    1505             : 
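A hedged caller sketch for xfs_bwrite(): the buffer must be locked on entry and is still locked and held on return, so the caller releases it afterwards. The read-modify-write shape below is illustrative and assumes the usual xfs_buf_read() signature.

        /* Illustrative only: synchronously rewrite one cached metadata buffer. */
        error = xfs_buf_read(btp, daddr, numblks, 0, &bp, ops);
        if (error)
                return error;

        /* ... modify the contents via bp->b_addr ... */

        error = xfs_bwrite(bp);         /* waits for the I/O, shuts down on error */
        xfs_buf_relse(bp);              /* unlock and drop our reference */
        return error;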
    1506             : static void
    1507   135177148 : xfs_buf_bio_end_io(
    1508             :         struct bio              *bio)
    1509             : {
    1510   135177148 :         struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
    1511             : 
    1512   135177148 :         if (!bio->bi_status &&
    1513   214467328 :             (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
    1514    79348918 :             XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
    1515           0 :                 bio->bi_status = BLK_STS_IOERR;
    1516             : 
    1517             :         /*
    1518             :          * don't overwrite existing errors - otherwise we can lose errors on
    1519             :          * buffers that require multiple bios to complete.
    1520             :          */
    1521   135177148 :         if (bio->bi_status) {
    1522       58740 :                 int error = blk_status_to_errno(bio->bi_status);
    1523             : 
    1524       58741 :                 cmpxchg(&bp->b_io_error, 0, error);
    1525             :         }
    1526             : 
    1527   135177150 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1528             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1529             : 
    1530   270354301 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1531   135077259 :                 xfs_buf_ioend_async(bp);
    1532   135177151 :         bio_put(bio);
    1533   135177151 : }
    1534             : 
    1535             : static void
    1536   135176447 : xfs_buf_ioapply_map(
    1537             :         struct xfs_buf  *bp,
    1538             :         int             map,
    1539             :         int             *buf_offset,
    1540             :         int             *count,
    1541             :         blk_opf_t       op)
    1542             : {
    1543   135176447 :         int             page_index;
    1544   135176447 :         unsigned int    total_nr_pages = bp->b_page_count;
    1545   135176447 :         int             nr_pages;
    1546   135176447 :         struct bio      *bio;
    1547   135176447 :         sector_t        sector =  bp->b_maps[map].bm_bn;
    1548   135176447 :         int             size;
    1549   135176447 :         int             offset;
    1550             : 
    1551             :         /* skip the pages in the buffer before the start offset */
    1552   135176447 :         page_index = 0;
    1553   135176447 :         offset = *buf_offset;
    1554   135176447 :         while (offset >= PAGE_SIZE) {
    1555           0 :                 page_index++;
    1556           0 :                 offset -= PAGE_SIZE;
    1557             :         }
    1558             : 
    1559             :         /*
    1560             :          * Limit the IO size to the length of the current vector, and update the
    1561             :          * remaining IO count for the next time around.
    1562             :          */
    1563   135176447 :         size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
    1564   135176447 :         *count -= size;
    1565   135176447 :         *buf_offset += size;
    1566             : 
    1567   135176447 : next_chunk:
    1568   135176447 :         atomic_inc(&bp->b_io_remaining);
    1569   135176632 :         nr_pages = bio_max_segs(total_nr_pages);
    1570             : 
    1571   135176632 :         bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
    1572   135176236 :         bio->bi_iter.bi_sector = sector;
    1573   135176236 :         bio->bi_end_io = xfs_buf_bio_end_io;
    1574   135176236 :         bio->bi_private = bp;
    1575             : 
    1576   270352035 :         for (; size && nr_pages; nr_pages--, page_index++) {
    1577   135176212 :                 int     rbytes, nbytes = PAGE_SIZE - offset;
    1578             : 
    1579   135176212 :                 if (nbytes > size)
    1580             :                         nbytes = size;
    1581             : 
    1582   135176212 :                 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
    1583             :                                       offset);
    1584   135175799 :                 if (rbytes < nbytes)
    1585             :                         break;
    1586             : 
    1587   135175799 :                 offset = 0;
    1588   135175799 :                 sector += BTOBB(nbytes);
    1589   135175799 :                 size -= nbytes;
    1590   135175799 :                 total_nr_pages--;
    1591             :         }
    1592             : 
    1593   135175823 :         if (likely(bio->bi_iter.bi_size)) {
    1594   135175823 :                 if (xfs_buf_is_vmapped(bp)) {
    1595             :                         flush_kernel_vmap_range(bp->b_addr,
    1596             :                                                 xfs_buf_vmap_len(bp));
    1597             :                 }
    1598   135175823 :                 submit_bio(bio);
    1599   135174706 :                 if (size)
    1600           0 :                         goto next_chunk;
    1601             :         } else {
    1602             :                 /*
    1603             :                  * This is guaranteed not to be the last io reference count
    1604             :                  * because the caller (xfs_buf_submit) holds a count itself.
    1605             :                  */
    1606           0 :                 atomic_dec(&bp->b_io_remaining);
    1607           0 :                 xfs_buf_ioerror(bp, -EIO);
    1608           0 :                 bio_put(bio);
    1609             :         }
    1610             : 
    1611   135174706 : }
    1612             : 
    1613             : /* Start a synchronous process-context buffer IO. */
    1614             : static inline void
    1615       42359 : xfs_buf_start_sync_io(
    1616             :         struct xfs_buf  *bp)
    1617             : {
    1618       42359 :         atomic_inc(&bp->b_io_remaining);
    1619       42355 : }
    1620             : 
     1621             : /* Finish a synchronous process-context buffer IO. */
    1622             : static void
    1623       42354 : xfs_buf_end_sync_io(
    1624             :         struct xfs_buf  *bp,
    1625             :         int             error)
    1626             : {
    1627       42354 :         if (error)
    1628           0 :                 cmpxchg(&bp->b_io_error, 0, error);
    1629             : 
    1630       42354 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1631             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1632             : 
    1633       84713 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1634           0 :                 xfs_buf_ioend(bp);
    1635       42359 : }
    1636             : 
    1637             : bool
    1638   350854060 : xfs_buf_check_poisoned(
    1639             :         struct xfs_buf          *bp)
    1640             : {
    1641   350854060 :         unsigned int            i;
    1642             : 
    1643   701708121 :         for (i = 0; i < bp->b_page_count; i++) {
    1644   350854061 :                 if (PageHWPoison(bp->b_pages[i]))
    1645             :                         return true;
    1646             :         }
    1647             : 
    1648   350854060 :         return false;
    1649             : }
    1650             : 
    1651             : STATIC void
    1652   135218934 : _xfs_buf_ioapply(
    1653             :         struct xfs_buf  *bp)
    1654             : {
    1655   135218934 :         struct blk_plug plug;
    1656   135218934 :         blk_opf_t       op;
    1657   135218934 :         int             offset;
    1658   135218934 :         int             size;
    1659   135218934 :         int             i;
    1660             : 
    1661             :         /*
    1662             :          * Make sure we capture only current IO errors rather than stale errors
    1663             :          * left over from previous use of the buffer (e.g. failed readahead).
    1664             :          */
    1665   135218934 :         bp->b_error = 0;
    1666             : 
    1667   135218934 :         if (bp->b_flags & XBF_WRITE) {
    1668   100813400 :                 op = REQ_OP_WRITE;
    1669             : 
    1670             :                 /*
    1671             :                  * Run the write verifier callback function if it exists. If
    1672             :                  * this function fails it will mark the buffer with an error and
    1673             :                  * the IO should not be dispatched.
    1674             :                  */
    1675   100813400 :                 if (bp->b_ops) {
    1676   100813400 :                         bp->b_ops->verify_write(bp);
    1677   100813444 :                         if (bp->b_error) {
    1678           4 :                                 xfs_force_shutdown(bp->b_mount,
    1679             :                                                    SHUTDOWN_CORRUPT_INCORE);
    1680       42362 :                                 return;
    1681             :                         }
    1682           0 :                 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
    1683           0 :                         struct xfs_mount *mp = bp->b_mount;
    1684             : 
    1685             :                         /*
    1686             :                          * non-crc filesystems don't attach verifiers during
    1687             :                          * log recovery, so don't warn for such filesystems.
    1688             :                          */
    1689           0 :                         if (xfs_has_crc(mp)) {
    1690           0 :                                 xfs_warn(mp,
    1691             :                                         "%s: no buf ops on daddr 0x%llx len %d",
    1692             :                                         __func__, xfs_buf_daddr(bp),
    1693             :                                         bp->b_length);
    1694           0 :                                 xfs_hex_dump(bp->b_addr,
    1695             :                                                 XFS_CORRUPTION_DUMP_LEN);
    1696           0 :                                 dump_stack();
    1697             :                         }
    1698             :                 }
    1699             :         } else {
    1700    34405534 :                 op = REQ_OP_READ;
    1701    34405534 :                 if (bp->b_flags & XBF_READ_AHEAD)
    1702    20276425 :                         op |= REQ_RAHEAD;
    1703             :         }
    1704             : 
    1705             :         /* we only use the buffer cache for meta-data */
    1706   135218974 :         op |= REQ_META;
    1707             : 
    1708   135218974 :         if (bp->b_target->bt_flags & XFS_BUFTARG_XFILE) {
    1709       42355 :                 int     error;
    1710             : 
    1711       42355 :                 xfs_buf_start_sync_io(bp);
    1712       42350 :                 error = xfile_buf_ioapply(bp);
    1713       42348 :                 xfs_buf_end_sync_io(bp, error);
    1714       42348 :                 return;
    1715             :         }
    1716             : 
    1717             :         /*
    1718             :          * Walk all the vectors issuing IO on them. Set up the initial offset
    1719             :          * into the buffer and the desired IO size before we start -
     1720             :  * xfs_buf_ioapply_map() will modify them appropriately for each
    1721             :          * subsequent call.
    1722             :          */
    1723   135176619 :         offset = bp->b_offset;
    1724   135176619 :         size = BBTOB(bp->b_length);
    1725   135176619 :         blk_start_plug(&plug);
    1726   270353116 :         for (i = 0; i < bp->b_map_count; i++) {
    1727   135176497 :                 xfs_buf_ioapply_map(bp, i, &offset, &size, op);
    1728   135175084 :                 if (bp->b_error)
    1729             :                         break;
    1730   135175084 :                 if (size <= 0)
    1731             :                         break;  /* all done */
    1732             :         }
    1733   135175008 :         blk_finish_plug(&plug);
    1734             : }
    1735             : 
    1736             : /*
    1737             :  * Wait for I/O completion of a sync buffer and return the I/O error code.
    1738             :  */
    1739             : static int
    1740    35591841 : xfs_buf_iowait(
    1741             :         struct xfs_buf  *bp)
    1742             : {
    1743    35591841 :         ASSERT(!(bp->b_flags & XBF_ASYNC));
    1744             : 
    1745    35591841 :         trace_xfs_buf_iowait(bp, _RET_IP_);
    1746    35591926 :         wait_for_completion(&bp->b_iowait);
    1747    35591508 :         trace_xfs_buf_iowait_done(bp, _RET_IP_);
    1748             : 
    1749    35591018 :         return bp->b_error;
    1750             : }
    1751             : 
    1752             : /*
    1753             :  * Buffer I/O submission path, read or write. Asynchronous submission transfers
    1754             :  * the buffer lock ownership and the current reference to the IO. It is not
    1755             :  * safe to reference the buffer after a call to this function unless the caller
    1756             :  * holds an additional reference itself.
    1757             :  */
    1758             : static int
    1759   136907435 : __xfs_buf_submit(
    1760             :         struct xfs_buf  *bp,
    1761             :         bool            wait)
    1762             : {
    1763   136907435 :         int             error = 0;
    1764             : 
    1765   136907435 :         trace_xfs_buf_submit(bp, _RET_IP_);
    1766             : 
    1767   136907388 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1768             : 
    1769             :         /*
    1770             :          * On log shutdown we stale and complete the buffer immediately. We can
    1771             :          * be called to read the superblock before the log has been set up, so
    1772             :          * be careful checking the log state.
    1773             :          *
    1774             :          * Checking the mount shutdown state here can result in the log tail
    1775             :          * moving inappropriately on disk as the log may not yet be shut down.
    1776             :          * i.e. failing this buffer on mount shutdown can remove it from the AIL
    1777             :          * and move the tail of the log forwards without having written this
    1778             :          * buffer to disk. This corrupts the log tail state in memory, and
    1779             :          * because the log may not be shut down yet, it can then be propagated
     1780             :          * to disk before the log is shut down. Hence we check log shutdown
    1781             :          * state here rather than mount state to avoid corrupting the log tail
    1782             :          * on shutdown.
    1783             :          */
    1784   273742166 :         if (bp->b_mount->m_log &&
    1785             :             xlog_is_shutdown(bp->b_mount->m_log)) {
    1786     1688975 :                 xfs_buf_ioend_fail(bp);
    1787     1688975 :                 return -EIO;
    1788             :         }
    1789             : 
    1790             :         /*
    1791             :          * Grab a reference so the buffer does not go away underneath us. For
     1792             :          * async buffers, I/O completion drops the caller's reference, which
    1793             :          * could occur before submission returns.
    1794             :          */
    1795   135218413 :         xfs_buf_hold(bp);
    1796             : 
    1797   135218589 :         if (bp->b_flags & XBF_WRITE)
    1798   100813461 :                 xfs_buf_wait_unpin(bp);
    1799             : 
    1800             :         /* clear the internal error state to avoid spurious errors */
    1801   135218583 :         bp->b_io_error = 0;
    1802             : 
    1803             :         /*
     1804             :          * Set the count to 1 initially so that an I/O completion callout
     1805             :          * that happens before we have started all the I/O cannot call
     1806             :          * xfs_buf_ioend too early.
    1807             :          */
    1808   135218583 :         atomic_set(&bp->b_io_remaining, 1);
    1809   135218583 :         if (bp->b_flags & XBF_ASYNC)
    1810    99627299 :                 xfs_buf_ioacct_inc(bp);
    1811   135218589 :         _xfs_buf_ioapply(bp);
    1812             : 
    1813             :         /*
    1814             :          * If _xfs_buf_ioapply failed, we can get back here with only the IO
    1815             :          * reference we took above. If we drop it to zero, run completion so
    1816             :          * that we don't return to the caller with completion still pending.
    1817             :          */
    1818   270437630 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
    1819      142186 :                 if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
    1820      113120 :                         xfs_buf_ioend(bp);
    1821             :                 else
    1822       29066 :                         xfs_buf_ioend_async(bp);
    1823             :         }
    1824             : 
    1825   135218905 :         if (wait)
    1826    14225066 :                 error = xfs_buf_iowait(bp);
    1827             : 
    1828             :         /*
    1829             :          * Release the hold that keeps the buffer referenced for the entire
    1830             :          * I/O. Note that if the buffer is async, it is not safe to reference
     1831             :          * it after this release.
    1832             :          */
    1833   135218197 :         xfs_buf_rele(bp);
    1834   135218197 :         return error;
    1835             : }
    1836             : 
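A hedged illustration of how the "wait" argument is chosen by the xfs_buf_submit() wrapper used throughout this file: synchronous callers leave XBF_ASYNC clear and block in xfs_buf_iowait(), while asynchronous callers set XBF_ASYNC and must not touch the buffer once it has been submitted.

        /* Illustrative only: the two submission modes seen in this file. */
        bp->b_flags &= ~XBF_ASYNC;      /* sync: xfs_buf_submit() waits for completion */
        error = xfs_buf_submit(bp);     /* equivalent to __xfs_buf_submit(bp, true) */

        bp->b_flags |= XBF_ASYNC;       /* async: the completion path releases the buffer */
        xfs_buf_submit(bp);             /* equivalent to __xfs_buf_submit(bp, false) */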
    1837             : void *
    1838 16751407731 : xfs_buf_offset(
    1839             :         struct xfs_buf          *bp,
    1840             :         size_t                  offset)
    1841             : {
    1842 16751407731 :         struct page             *page;
    1843             : 
    1844 16751407731 :         if (bp->b_addr)
    1845 16751407731 :                 return bp->b_addr + offset;
    1846             : 
    1847           0 :         page = bp->b_pages[offset >> PAGE_SHIFT];
    1848           0 :         return page_address(page) + (offset & (PAGE_SIZE-1));
    1849             : }
    1850             : 
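A hedged usage sketch: callers typically use xfs_buf_offset() to locate a structure inside a possibly multi-page buffer. The inode-buffer shape below is illustrative, and "boffset" stands in for a byte offset computed by the caller.

        /* Illustrative only: map a byte offset within the buffer to a pointer. */
        struct xfs_dinode       *dip = xfs_buf_offset(bp, boffset);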
    1851             : void
    1852      989514 : xfs_buf_zero(
    1853             :         struct xfs_buf          *bp,
    1854             :         size_t                  boff,
    1855             :         size_t                  bsize)
    1856             : {
    1857      989514 :         size_t                  bend;
    1858             : 
    1859      989514 :         bend = boff + bsize;
    1860     1979012 :         while (boff < bend) {
    1861      989511 :                 struct page     *page;
    1862      989511 :                 int             page_index, page_offset, csize;
    1863             : 
    1864      989511 :                 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
    1865      989511 :                 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
    1866      989511 :                 page = bp->b_pages[page_index];
    1867      989511 :                 csize = min_t(size_t, PAGE_SIZE - page_offset,
    1868             :                                       BBTOB(bp->b_length) - boff);
    1869             : 
    1870      989511 :                 ASSERT((csize + page_offset) <= PAGE_SIZE);
    1871             : 
    1872      989498 :                 memset(page_address(page) + page_offset, 0, csize);
    1873             : 
    1874      989498 :                 boff += csize;
    1875             :         }
    1876      989501 : }
    1877             : 
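A hedged usage sketch for xfs_buf_zero(): clear everything in the buffer beyond some valid length; "validlen" is a hypothetical byte count supplied by the caller.

        /* Illustrative only: zero the tail of the buffer past validlen bytes. */
        xfs_buf_zero(bp, validlen, BBTOB(bp->b_length) - validlen);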
    1878             : /*
    1879             :  * Log a message about and stale a buffer that a caller has decided is corrupt.
    1880             :  *
    1881             :  * This function should be called for the kinds of metadata corruption that
     1882             :  * cannot be detected by a verifier, such as incorrect inter-block relationship
    1883             :  * data.  Do /not/ call this function from a verifier function.
    1884             :  *
    1885             :  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
    1886             :  * be marked stale, but b_error will not be set.  The caller is responsible for
    1887             :  * releasing the buffer or fixing it.
    1888             :  */
    1889             : void
    1890           0 : __xfs_buf_mark_corrupt(
    1891             :         struct xfs_buf          *bp,
    1892             :         xfs_failaddr_t          fa)
    1893             : {
    1894           0 :         ASSERT(bp->b_flags & XBF_DONE);
    1895             : 
    1896           0 :         xfs_buf_corruption_error(bp, fa);
    1897           0 :         xfs_buf_stale(bp);
    1898           0 : }
    1899             : 
    1900             : /*
    1901             :  *      Handling of buffer targets (buftargs).
    1902             :  */
    1903             : 
    1904             : /*
    1905             :  * Wait for any bufs with callbacks that have been submitted but have not yet
    1906             :  * returned. These buffers will have an elevated hold count, so wait on those
    1907             :  * while freeing all the buffers only held by the LRU.
    1908             :  */
    1909             : static enum lru_status
    1910    23130695 : xfs_buftarg_drain_rele(
    1911             :         struct list_head        *item,
    1912             :         struct list_lru_one     *lru,
    1913             :         spinlock_t              *lru_lock,
    1914             :         void                    *arg)
    1915             : 
    1916             : {
    1917    23130695 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1918    23130695 :         struct list_head        *dispose = arg;
    1919             : 
    1920    23130695 :         if (atomic_read(&bp->b_hold) > 1) {
    1921             :                 /* need to wait, so skip it this pass */
    1922           0 :                 trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
    1923           0 :                 return LRU_SKIP;
    1924             :         }
    1925    23130695 :         if (!spin_trylock(&bp->b_lock))
    1926             :                 return LRU_SKIP;
    1927             : 
    1928             :         /*
    1929             :          * clear the LRU reference count so the buffer doesn't get
    1930             :          * ignored in xfs_buf_rele().
    1931             :          */
    1932    23130695 :         atomic_set(&bp->b_lru_ref, 0);
    1933    23130695 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1934    23130695 :         list_lru_isolate_move(lru, item, dispose);
    1935    23130695 :         spin_unlock(&bp->b_lock);
    1936    23130695 :         return LRU_REMOVED;
    1937             : }
    1938             : 
    1939             : /*
    1940             :  * Wait for outstanding I/O on the buftarg to complete.
    1941             :  */
    1942             : void
    1943      132509 : xfs_buftarg_wait(
    1944             :         struct xfs_buftarg      *btp)
    1945             : {
    1946             :         /*
    1947             :          * First wait on the buftarg I/O count for all in-flight buffers to be
    1948             :          * released. This is critical as new buffers do not make the LRU until
    1949             :          * they are released.
    1950             :          *
    1951             :          * Next, flush the buffer workqueue to ensure all completion processing
    1952             :          * has finished. Just waiting on buffer locks is not sufficient for
    1953             :          * async IO as the reference count held over IO is not released until
    1954             :          * after the buffer lock is dropped. Hence we need to ensure here that
    1955             :          * all reference counts have been dropped before we start walking the
    1956             :          * LRU list.
    1957             :          */
    1958      134273 :         while (percpu_counter_sum(&btp->bt_io_count))
    1959        1764 :                 delay(100);
    1960      132507 :         flush_workqueue(btp->bt_mount->m_buf_workqueue);
    1961      132510 : }
    1962             : 
    1963             : void
    1964       69401 : xfs_buftarg_drain(
    1965             :         struct xfs_buftarg      *btp)
    1966             : {
    1967       69401 :         LIST_HEAD(dispose);
    1968       69401 :         int                     loop = 0;
    1969       69401 :         bool                    write_fail = false;
    1970             : 
    1971       69401 :         xfs_buftarg_wait(btp);
    1972             : 
    1973             :         /* loop until there is nothing left on the lru list. */
    1974      136455 :         while (list_lru_count(&btp->bt_lru)) {
    1975       67054 :                 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
    1976             :                               &dispose, LONG_MAX);
    1977             : 
    1978    23197747 :                 while (!list_empty(&dispose)) {
    1979    23130693 :                         struct xfs_buf *bp;
    1980    23130693 :                         bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1981    23130693 :                         list_del_init(&bp->b_lru);
    1982    23130692 :                         if (bp->b_flags & XBF_WRITE_FAIL) {
    1983           0 :                                 write_fail = true;
    1984           0 :                                 xfs_buf_alert_ratelimited(bp,
    1985             :                                         "XFS: Corruption Alert",
    1986             : "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
    1987             :                                         (long long)xfs_buf_daddr(bp));
    1988             :                         }
    1989    23130692 :                         xfs_buf_rele(bp);
    1990             :                 }
    1991       67054 :                 if (loop++ != 0)
    1992           0 :                         delay(100);
    1993             :         }
    1994             : 
    1995             :         /*
    1996             :          * If one or more failed buffers were freed, that means dirty metadata
    1997             :          * was thrown away. This should only ever happen after I/O completion
     1998             :          * handling has elevated I/O error(s) to permanent failures and shut
    1999             :          * down the journal.
    2000             :          */
    2001       69405 :         if (write_fail) {
    2002           0 :                 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
    2003           0 :                 xfs_alert(btp->bt_mount,
    2004             :               "Please run xfs_repair to determine the extent of the problem.");
    2005             :         }
    2006       69405 : }
    2007             : 
    2008             : static enum lru_status
    2009    46477824 : xfs_buftarg_isolate(
    2010             :         struct list_head        *item,
    2011             :         struct list_lru_one     *lru,
    2012             :         spinlock_t              *lru_lock,
    2013             :         void                    *arg)
    2014             : {
    2015    46477824 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    2016    46477824 :         struct list_head        *dispose = arg;
    2017             : 
    2018             :         /*
    2019             :          * we are inverting the lru lock/bp->b_lock here, so use a trylock.
    2020             :          * If we fail to get the lock, just skip it.
    2021             :          */
    2022    46477824 :         if (!spin_trylock(&bp->b_lock))
    2023             :                 return LRU_SKIP;
    2024             :         /*
    2025             :          * Decrement the b_lru_ref count unless the value is already
    2026             :          * zero. If the value is already zero, we need to reclaim the
    2027             :          * buffer, otherwise it gets another trip through the LRU.
    2028             :          */
    2029    92955436 :         if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
    2030    31984498 :                 spin_unlock(&bp->b_lock);
    2031    31984498 :                 return LRU_ROTATE;
    2032             :         }
    2033             : 
    2034    14493220 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    2035    14493220 :         list_lru_isolate_move(lru, item, dispose);
    2036    14493220 :         spin_unlock(&bp->b_lock);
    2037    14493220 :         return LRU_REMOVED;
    2038             : }
    2039             : 
    2040             : static unsigned long
    2041      368638 : xfs_buftarg_shrink_scan(
    2042             :         struct shrinker         *shrink,
    2043             :         struct shrink_control   *sc)
    2044             : {
    2045      368638 :         struct xfs_buftarg      *btp = container_of(shrink,
    2046             :                                         struct xfs_buftarg, bt_shrinker);
    2047      368638 :         LIST_HEAD(dispose);
    2048      368638 :         unsigned long           freed;
    2049             : 
    2050      368638 :         freed = list_lru_shrink_walk(&btp->bt_lru, sc,
    2051             :                                      xfs_buftarg_isolate, &dispose);
    2052             : 
    2053    14861858 :         while (!list_empty(&dispose)) {
    2054    14493220 :                 struct xfs_buf *bp;
    2055    14493220 :                 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    2056    14493220 :                 list_del_init(&bp->b_lru);
    2057    14493220 :                 xfs_buf_rele(bp);
    2058             :         }
    2059             : 
    2060      368638 :         return freed;
    2061             : }
    2062             : 
    2063             : static unsigned long
    2064        7313 : xfs_buftarg_shrink_count(
    2065             :         struct shrinker         *shrink,
    2066             :         struct shrink_control   *sc)
    2067             : {
    2068        7313 :         struct xfs_buftarg      *btp = container_of(shrink,
    2069             :                                         struct xfs_buftarg, bt_shrinker);
    2070        7313 :         return list_lru_shrink_count(&btp->bt_lru, sc);
    2071             : }
    2072             : 
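For reference, the shrinker callbacks above are wired up when the buftarg is allocated. The registration sketch below is representative only; the exact call site and error label in this tree may differ.

        /* Sketch of how the callbacks above are typically registered. */
        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
        btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
        if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
                        btp->bt_mount->m_super->s_id))
                goto error;             /* hypothetical error label */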
    2073             : void
    2074       45696 : xfs_free_buftarg(
    2075             :         struct xfs_buftarg      *btp)
    2076             : {
    2077       45696 :         unregister_shrinker(&btp->bt_shrinker);
    2078       45708 :         ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
    2079       45707 :         percpu_counter_destroy(&btp->bt_io_count);
    2080       45708 :         list_lru_destroy(&btp->bt_lru);
    2081             : 
    2082       45708 :         if (!(btp->bt_flags & XFS_BUFTARG_XFILE)) {
    2083       24351 :                 blkdev_issue_flush(btp->bt_bdev);
    2084       24351 :                 invalidate_bdev(btp->bt_bdev);
    2085       24351 :                 fs_put_dax(btp->bt_daxdev, btp->bt_mount);
    2086             :         }
    2087             : 
    2088       45708 :         kvfree(btp);
    2089       45708 : }
    2090             : 
    2091             : int
    2092       48654 : xfs_setsize_buftarg(
    2093             :         xfs_buftarg_t           *btp,
    2094             :         unsigned int            sectorsize)
    2095             : {
    2096             :         /* Set up metadata sector size info */
    2097       48654 :         btp->bt_meta_sectorsize = sectorsize;
    2098       48654 :         btp->bt_meta_sectormask = sectorsize - 1;
    2099             : 
    2100       48654 :         if (set_blocksize(btp->bt_bdev, sectorsize)) {
    2101           0 :                 xfs_warn(btp->bt_mount,
    2102             :                         "Cannot set_blocksize to %u on device %pg",
    2103             :                         sectorsize, btp->bt_bdev);
    2104           0 :                 return -EINVAL;
    2105             :         }
    2106             : 
    2107             :         /* Set up device logical sector size mask */
    2108       48654 :         btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
    2109       48654 :         btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
    2110             : 
    2111       48654 :         return 0;
    2112             : }
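
bt_meta_sectormask and bt_logical_sectormask only work as alignment masks because sector sizes are powers of two: sectorsize - 1 selects exactly the offset bits within one sector. A small standalone illustration of that mask arithmetic (the values are hypothetical):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int sectorsize = 512;              /* power of two by assumption */
        unsigned int sectormask = sectorsize - 1;   /* 0x1ff: offset-within-sector bits */
        unsigned long long offsets[] = { 512, 4096, 4100 };

        for (int i = 0; i < 3; i++) {
                bool aligned = (offsets[i] & sectormask) == 0;
                printf("offset %llu: %s\n", offsets[i],
                       aligned ? "sector aligned" : "misaligned");
        }
        return 0;
}
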
    2113             : 
    2114             : /*
    2115             :  * When allocating the initial buffer target we have not yet
    2116             :  * read in the superblock, so we do not know what sector size
    2117             :  * is in use at this early stage.  Play it safe.
    2118             :  */
    2119             : STATIC int
    2120       24342 : xfs_setsize_buftarg_early(
    2121             :         xfs_buftarg_t           *btp,
    2122             :         struct block_device     *bdev)
    2123             : {
    2124       48684 :         return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
    2125             : }
    2126             : 
    2127             : struct xfs_buftarg *
    2128       45691 : xfs_alloc_buftarg_common(
    2129             :         struct xfs_mount        *mp,
    2130             :         const char              *descr)
    2131             : {
    2132       45691 :         struct xfs_buftarg      *btp;
    2133             : 
    2134       45691 :         btp = kzalloc(sizeof(*btp), GFP_NOFS);
    2135       45665 :         if (!btp)
    2136             :                 return NULL;
    2137             : 
    2138       45665 :         btp->bt_mount = mp;
    2139             : 
    2140             :         /*
    2141             :          * Buffer IO error rate limiting. Limit it to no more than 10 messages
    2142             :          * per 30 seconds so as not to spam the logs on repeated errors.
    2143             :          */
    2144       45665 :         ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
    2145             :                              DEFAULT_RATELIMIT_BURST);
    2146             : 
    2147       45666 :         if (list_lru_init(&btp->bt_lru))
    2148           0 :                 goto error_free;
    2149             : 
    2150       45690 :         if (percpu_counter_init(&btp->bt_io_count, 0, GFP_NOFS))
    2151           0 :                 goto error_lru;
    2152             : 
    2153       45699 :         btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
    2154       45699 :         btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
    2155       45699 :         btp->bt_shrinker.seeks = DEFAULT_SEEKS;
    2156       45699 :         btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
    2157       45699 :         if (register_shrinker(&btp->bt_shrinker, "xfs-%s:%s", descr,
    2158       45699 :                               mp->m_super->s_id))
    2159           0 :                 goto error_pcpu;
    2160             : 
    2161             :         return btp;
    2162             : 
    2163             : error_pcpu:
    2164           0 :         percpu_counter_destroy(&btp->bt_io_count);
    2165           0 : error_lru:
    2166           0 :         list_lru_destroy(&btp->bt_lru);
    2167           0 : error_free:
    2168           0 :         kvfree(btp);
    2169           0 :         return NULL;
    2170             : }
    2171             : 
    2172             : static inline void
    2173   102429037 : xfs_buf_list_del(
    2174             :         struct xfs_buf          *bp)
    2175             : {
    2176   102429037 :         list_del_init(&bp->b_list);
    2177   102428984 :         wake_up_var(&bp->b_list);
    2178   102428976 : }
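
The wake_up_var() here has no local waiter; it pairs with the wait_var_event() in xfs_buf_delwri_queue_here() below, which sleeps until the buffer drops off whatever delwri list currently holds it. The generic idiom, shown with a hypothetical done flag (kernel-style sketch, not code from this file):

/* Waker: update the state first, then wake anyone sleeping on that address. */
WRITE_ONCE(work->done, true);
wake_up_var(&work->done);

/* Waiter: sleeps until the condition evaluates true after a wakeup. */
wait_var_event(&work->done, READ_ONCE(work->done));
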
    2179             : 
    2180             : /* Allocate a buffer cache target for a persistent block device. */
    2181             : struct xfs_buftarg *
    2182       24342 : xfs_alloc_buftarg(
    2183             :         struct xfs_mount        *mp,
    2184             :         struct block_device     *bdev)
    2185             : {
    2186       24342 :         struct xfs_buftarg      *btp;
    2187       24342 :         const struct dax_holder_operations *ops = NULL;
    2188             : 
    2189             : #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
    2190             :         ops = &xfs_dax_holder_operations;
    2191             : #endif
    2192             : 
    2193       24342 :         btp = xfs_alloc_buftarg_common(mp, "buf");
    2194       24342 :         if (!btp)
    2195             :                 return NULL;
    2196             : 
    2197       24342 :         btp->bt_dev =  bdev->bd_dev;
    2198       24342 :         btp->bt_bdev = bdev;
    2199       24342 :         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
    2200             :                                             mp, ops);
    2201             : 
    2202       24342 :         if (xfs_setsize_buftarg_early(btp, bdev))
    2203           0 :                 goto error_free;
    2204             : 
    2205             :         return btp;
    2206             : 
    2207             : error_free:
    2208           0 :         xfs_free_buftarg(btp);
    2209           0 :         return NULL;
    2210             : }
    2211             : 
    2212             : /*
    2213             :  * Cancel a delayed write list.
    2214             :  *
    2215             :  * Remove each buffer from the list, clear the delwri queue flag and drop the
    2216             :  * associated buffer reference.
    2217             :  */
    2218             : void
    2219      154055 : xfs_buf_delwri_cancel(
    2220             :         struct list_head        *list)
    2221             : {
    2222      154055 :         struct xfs_buf          *bp;
    2223             : 
    2224      154052 :         while (!list_empty(list)) {
    2225           0 :                 bp = list_first_entry(list, struct xfs_buf, b_list);
    2226             : 
    2227           0 :                 xfs_buf_lock(bp);
    2228           0 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2229           0 :                 xfs_buf_list_del(bp);
    2230           0 :                 xfs_buf_relse(bp);
    2231             :         }
    2232      154052 : }
    2233             : 
    2234             : /*
    2235             :  * Add a buffer to the delayed write list.
    2236             :  *
    2237             :  * This queues a buffer for writeout if it hasn't already been queued.  Note that
    2238             :  * neither this routine nor the buffer list submission functions perform
    2239             :  * any internal synchronization.  It is expected that the lists are thread-local
    2240             :  * to the callers.
    2241             :  *
    2242             :  * Returns true if we queued up the buffer, or false if it was already
    2243             :  * on the buffer list.
    2244             :  */
    2245             : bool
    2246   111564239 : xfs_buf_delwri_queue(
    2247             :         struct xfs_buf          *bp,
    2248             :         struct list_head        *list)
    2249             : {
    2250   111564239 :         ASSERT(xfs_buf_islocked(bp));
    2251   111564239 :         ASSERT(!(bp->b_flags & XBF_READ));
    2252             : 
    2253             :         /*
    2254             :          * If the buffer is already marked delwri it is already queued up
    2255             :          * by someone else for immediate writeout.  Just ignore it in that
    2256             :          * case.
    2257             :          */
    2258   111564239 :         if (bp->b_flags & _XBF_DELWRI_Q) {
    2259     9135103 :                 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
    2260     9135103 :                 return false;
    2261             :         }
    2262             : 
    2263   102429136 :         trace_xfs_buf_delwri_queue(bp, _RET_IP_);
    2264             : 
    2265             :         /*
    2266             :          * If a buffer gets written out synchronously or marked stale while it
    2267             :          * is on a delwri list we lazily remove it. To do this, the other party
    2268             :          * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
    2269             :          * It remains referenced and on the list.  In a rare corner case it
    2270             :          * might get re-added to a delwri list after the synchronous writeout,
    2271             :          * in which case we just need to re-add the flag here.
    2272             :          */
    2273   102429128 :         bp->b_flags |= _XBF_DELWRI_Q;
    2274   102429128 :         if (list_empty(&bp->b_list)) {
    2275   102428962 :                 atomic_inc(&bp->b_hold);
    2276   102428922 :                 list_add_tail(&bp->b_list, list);
    2277             :         }
    2278             : 
    2279             :         return true;
    2280             : }
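
Taken together with xfs_buf_delwri_submit() further down, the comment above describes the expected calling convention: the list head is private to the caller, each buffer is queued while locked, and the whole list is later handed to one of the submit helpers. A hedged sketch of that pattern using only functions from this file (the buffer lookup and surrounding locking are elided, and the function name is illustrative):

/*
 * Sketch only: write out a caller-private set of buffers via a delwri list.
 * Each bp must already be locked when queued, as xfs_buf_delwri_queue()
 * asserts; the queue takes its own hold, so we can drop ours right away.
 */
static int example_flush_buffers(struct xfs_buf **bps, int nr)
{
        LIST_HEAD(buffer_list);         /* thread-local, so no extra locking */
        int i;

        for (i = 0; i < nr; i++) {
                xfs_buf_delwri_queue(bps[i], &buffer_list);
                xfs_buf_relse(bps[i]);  /* unlock and drop the caller's reference */
        }

        /* Synchronous variant: waits for I/O completion on every buffer. */
        return xfs_buf_delwri_submit(&buffer_list);
}
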
    2281             : 
    2282             : /*
    2283             :  * Queue a buffer to this delwri list as part of a data integrity operation.
    2284             :  * If the buffer is on any other delwri list, we'll wait for that to clear
    2285             :  * so that the caller can submit the buffer for IO and wait for the result.
    2286             :  * Callers must ensure the buffer is not already on the list.
    2287             :  */
    2288             : void
    2289      435520 : xfs_buf_delwri_queue_here(
    2290             :         struct xfs_buf          *bp,
    2291             :         struct list_head        *buffer_list)
    2292             : {
    2293             :         /*
    2294             :          * We need this buffer to end up on the /caller's/ delwri list, not any
    2295             :          * old list.  The buffer can end up on some other list if it is marked
    2296             :          * stale (which clears DELWRI_Q) after the AIL queues it to its own list
    2297             :          * but before the AIL has a chance to submit that list.
    2298             :          */
    2299      435538 :         while (!list_empty(&bp->b_list)) {
    2300           5 :                 xfs_buf_unlock(bp);
    2301          10 :                 wait_var_event(&bp->b_list, list_empty(&bp->b_list));
    2302           5 :                 xfs_buf_lock(bp);
    2303             :         }
    2304             : 
    2305      435533 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    2306             : 
    2307             :         /* This buffer is uptodate; don't let it get reread. */
    2308      435533 :         bp->b_flags |= XBF_DONE;
    2309             : 
    2310      435533 :         xfs_buf_delwri_queue(bp, buffer_list);
    2311      435532 : }
    2312             : 
    2313             : /*
    2314             :  * The compare function is more complex than it needs to be because the
    2315             :  * return value is only 32 bits wide while we are comparing 64-bit block
    2316             :  * numbers, so the difference cannot simply be truncated to an int.
    2317             :  */
    2318             : static int
    2319   896949102 : xfs_buf_cmp(
    2320             :         void                    *priv,
    2321             :         const struct list_head  *a,
    2322             :         const struct list_head  *b)
    2323             : {
    2324   896949102 :         struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
    2325   896949102 :         struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
    2326   896949102 :         xfs_daddr_t             diff;
    2327             : 
    2328   896949102 :         diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
    2329   896949102 :         if (diff < 0)
    2330             :                 return -1;
    2331   445016207 :         if (diff > 0)
    2332   445015692 :                 return 1;
    2333             :         return 0;
    2334             : }
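
The point of the comment above is that the 64-bit block number difference must not be narrowed to the 32-bit return value, because truncation can flip the sign and corrupt the sort order. A tiny userspace demonstration (hypothetical values, typical two's-complement ABI):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t a = INT64_C(1) << 32;   /* "block" 2^32 */
        int64_t b = 1;
        int64_t diff = a - b;           /* 0xffffffff: clearly positive */

        /* Truncating to int yields -1, i.e. the wrong ordering. */
        printf("truncated: %d, correct: %d\n",
               (int)diff, diff > 0 ? 1 : (diff < 0 ? -1 : 0));
        return 0;
}
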
    2335             : 
    2336             : /*
    2337             :  * Submit buffers for write. If wait_list is specified, the buffers are
    2338             :  * submitted using sync I/O and placed on the wait list such that the caller can
    2339             :  * iowait each buffer. Otherwise async I/O is used and the buffers are released
    2340             :  * at I/O completion time. In either case, each buffer remains locked until
    2341             :  * its I/O completes and it is released from the queue.
    2342             :  */
    2343             : static int
    2344     3609582 : xfs_buf_delwri_submit_buffers(
    2345             :         struct list_head        *buffer_list,
    2346             :         struct list_head        *wait_list)
    2347             : {
    2348     3609582 :         struct xfs_buf          *bp, *n;
    2349     3609582 :         int                     pinned = 0;
    2350     3609582 :         struct blk_plug         plug;
    2351             : 
    2352     3609582 :         list_sort(NULL, buffer_list, xfs_buf_cmp);
    2353             : 
    2354     3609544 :         blk_start_plug(&plug);
    2355   107331000 :         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
    2356   103721414 :                 if (!wait_list) {
    2357    82354690 :                         if (!xfs_buf_trylock(bp))
    2358       55323 :                                 continue;
    2359    82299367 :                         if (xfs_buf_ispinned(bp)) {
    2360     1237043 :                                 xfs_buf_unlock(bp);
    2361     1237043 :                                 pinned++;
    2362     1237043 :                                 continue;
    2363             :                         }
    2364             :                 } else {
    2365    21366724 :                         xfs_buf_lock(bp);
    2366             :                 }
    2367             : 
    2368             :                 /*
    2369             :                  * Someone else might have written the buffer synchronously or
    2370             :                  * marked it stale in the meantime.  In that case only the
    2371             :                  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
    2372             :                  * reference and remove it from the list here.
    2373             :                  */
    2374   102428997 :                 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
    2375       33981 :                         xfs_buf_list_del(bp);
    2376       33981 :                         xfs_buf_relse(bp);
    2377       33981 :                         continue;
    2378             :                 }
    2379             : 
    2380   102395016 :                 trace_xfs_buf_delwri_split(bp, _RET_IP_);
    2381             : 
    2382             :                 /*
    2383             :                  * If we have a wait list, each buffer (and associated delwri
    2384             :                  * queue reference) transfers to it and is submitted
    2385             :                  * synchronously. Otherwise, drop the buffer from the delwri
    2386             :                  * queue and submit async.
    2387             :                  */
    2388   102395071 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2389   102395071 :                 bp->b_flags |= XBF_WRITE;
    2390   102395071 :                 if (wait_list) {
    2391    21366728 :                         bp->b_flags &= ~XBF_ASYNC;
    2392    21366728 :                         list_move_tail(&bp->b_list, wait_list);
    2393             :                 } else {
    2394    81028343 :                         bp->b_flags |= XBF_ASYNC;
    2395    81028343 :                         xfs_buf_list_del(bp);
    2396             :                 }
    2397   102395049 :                 __xfs_buf_submit(bp, false);
    2398             :         }
    2399     3609586 :         blk_finish_plug(&plug);
    2400             : 
    2401     3609604 :         return pinned;
    2402             : }
    2403             : 
    2404             : /*
    2405             :  * Write out a buffer list asynchronously.
    2406             :  *
    2407             :  * This will take the @buffer_list, write all non-locked and non-pinned buffers
    2408             :  * out and not wait for I/O completion on any of the buffers.  This interface
    2409             :  * is only safely usable by callers that can track I/O completion by higher
    2410             :  * level means (e.g. AIL pushing), as the @buffer_list is consumed by this
    2411             :  * function.
    2412             :  *
    2413             :  * Note: this function will skip buffers it would block on, and in doing so
    2414             :  * leaves them on @buffer_list so they can be retried on a later pass. As such,
    2415             :  * it is up to the caller to ensure that the buffer list is fully submitted or
    2416             :  * cancelled appropriately when they are finished with the list. Failure to
    2417             :  * cancel or resubmit the list until it is empty will result in leaked buffers
    2418             :  * at unmount time.
    2419             :  */
    2420             : int
    2421     2604317 : xfs_buf_delwri_submit_nowait(
    2422             :         struct list_head        *buffer_list)
    2423             : {
    2424     2604317 :         return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
    2425             : }
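
As the comment above warns, skipped buffers stay on @buffer_list, so a caller must keep resubmitting until the list drains or cancel it explicitly. A hedged sketch of that discipline (the shutdown check and lack of backoff are illustrative; real callers such as AIL pushing interleave this with log forces to unpin buffers):

/* Sketch only: push an async delwri list until it drains or the fs shuts down. */
static void example_push_until_empty(struct xfs_mount *mp,
                                     struct list_head *buffer_list)
{
        while (!list_empty(buffer_list)) {
                if (xfs_is_shutdown(mp)) {
                        /* Don't leak the queued buffer references. */
                        xfs_buf_delwri_cancel(buffer_list);
                        break;
                }
                xfs_buf_delwri_submit_nowait(buffer_list);
        }
}
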
    2426             : 
    2427             : /*
    2428             :  * Write out a buffer list synchronously.
    2429             :  *
    2430             :  * This will take the @buffer_list, write all buffers out and wait for I/O
    2431             :  * completion on all of the buffers. @buffer_list is consumed by the function,
    2432             :  * so callers must have some other way of tracking buffers if they require such
    2433             :  * functionality.
    2434             :  */
    2435             : int
    2436     1005241 : xfs_buf_delwri_submit(
    2437             :         struct list_head        *buffer_list)
    2438             : {
    2439     1005241 :         LIST_HEAD               (wait_list);
    2440     1005241 :         int                     error = 0, error2;
    2441     1005241 :         struct xfs_buf          *bp;
    2442             : 
    2443     1005241 :         xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
    2444             : 
    2445             :         /* Wait for IO to complete. */
    2446    22372014 :         while (!list_empty(&wait_list)) {
    2447    21366771 :                 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
    2448             : 
    2449    21366771 :                 xfs_buf_list_del(bp);
    2450             : 
    2451             :                 /*
    2452             :                  * Wait on the locked buffer, check for errors and unlock and
    2453             :                  * release the delwri queue reference.
    2454             :                  */
    2455    21366765 :                 error2 = xfs_buf_iowait(bp);
    2456    21366772 :                 xfs_buf_relse(bp);
    2457    21366773 :                 if (!error)
    2458    21366773 :                         error = error2;
    2459             :         }
    2460             : 
    2461     1005294 :         return error;
    2462             : }
    2463             : 
    2464             : /*
    2465             :  * Push a single buffer on a delwri queue.
    2466             :  *
    2467             :  * The purpose of this function is to submit a single buffer of a delwri queue
    2468             :  * and return with the buffer still on the original queue. The waiting delwri
    2469             :  * buffer submission infrastructure guarantees transfer of the delwri queue
    2470             :  * buffer reference to a temporary wait list. We reuse this infrastructure to
    2471             :  * transfer the buffer back to the original queue.
    2472             :  *
    2473             :  * Note that the buffer transitions from the queued state to the submitted and
    2474             :  * wait-listed state and back to the queued state during this call. The buffer
    2475             :  * locking and queue management logic between _delwri_pushbuf() and
    2476             :  * _delwri_queue() guarantee that the buffer cannot be queued to another list
    2477             :  * before returning.
    2478             :  */
    2479             : int
    2480           0 : xfs_buf_delwri_pushbuf(
    2481             :         struct xfs_buf          *bp,
    2482             :         struct list_head        *buffer_list)
    2483             : {
    2484           0 :         LIST_HEAD               (submit_list);
    2485           0 :         int                     error;
    2486             : 
    2487           0 :         ASSERT(bp->b_flags & _XBF_DELWRI_Q);
    2488             : 
    2489           0 :         trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
    2490             : 
    2491             :         /*
    2492             :          * Isolate the buffer to a new local list so we can submit it for I/O
    2493             :          * independently from the rest of the original list.
    2494             :          */
    2495           0 :         xfs_buf_lock(bp);
    2496           0 :         list_move(&bp->b_list, &submit_list);
    2497           0 :         xfs_buf_unlock(bp);
    2498             : 
    2499             :         /*
    2500             :          * Delwri submission clears the DELWRI_Q buffer flag and returns with
    2501             :          * the buffer on the wait list with the original reference. Rather than
    2502             :          * bounce the buffer from a local wait list back to the original list
    2503             :          * after I/O completion, reuse the original list as the wait list.
    2504             :          */
    2505           0 :         xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
    2506             : 
    2507             :         /*
    2508             :          * The buffer is now locked, under I/O and wait listed on the original
    2509             :          * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
    2510             :          * return with the buffer unlocked and on the original queue.
    2511             :          */
    2512           0 :         error = xfs_buf_iowait(bp);
    2513           0 :         bp->b_flags |= _XBF_DELWRI_Q;
    2514           0 :         xfs_buf_unlock(bp);
    2515             : 
    2516           0 :         return error;
    2517             : }
    2518             : 
    2519 12481233258 : void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
    2520             : {
    2521             :         /*
    2522             :          * Set the lru reference count to 0 if the error injection tag is set.
    2523             :          * This allows userspace to disrupt buffer caching for debug/testing
    2524             :          * purposes.
    2525             :          */
    2526 12481233258 :         if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
    2527          20 :                 lru_ref = 0;
    2528             : 
    2529 12483800643 :         atomic_set(&bp->b_lru_ref, lru_ref);
    2530 12483800643 : }
    2531             : 
    2532             : /*
    2533             :  * Verify an on-disk magic value against the magic value specified in the
    2534             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2535             :  * expected to pass the value directly from disk.
    2536             :  */
    2537             : bool
    2538   465626815 : xfs_verify_magic(
    2539             :         struct xfs_buf          *bp,
    2540             :         __be32                  dmagic)
    2541             : {
    2542   465626815 :         struct xfs_mount        *mp = bp->b_mount;
    2543   465626815 :         int                     idx;
    2544             : 
    2545   465626815 :         idx = xfs_has_crc(mp);
    2546   931253671 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
    2547             :                 return false;
    2548   465626815 :         return dmagic == bp->b_ops->magic[idx];
    2549             : }
    2550             : /*
    2551             :  * Verify an on-disk magic value against the magic value specified in the
    2552             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2553             :  * expected to pass the value directly from disk.
    2554             :  */
    2555             : bool
    2556  1506548537 : xfs_verify_magic16(
    2557             :         struct xfs_buf          *bp,
    2558             :         __be16                  dmagic)
    2559             : {
    2560  1506548537 :         struct xfs_mount        *mp = bp->b_mount;
    2561  1506548537 :         int                     idx;
    2562             : 
    2563  1506548537 :         idx = xfs_has_crc(mp);
    2564  3013100246 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
    2565             :                 return false;
    2566  1506548537 :         return dmagic == bp->b_ops->magic16[idx];
    2567             : }
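
Both helpers index the verifier's magic array by xfs_has_crc(), so slot 0 holds the pre-CRC (v4) magic and slot 1 the v5 magic, each stored in disk byte order. A hedged sketch of how a read verifier might wire this up (struct xfs_buf_ops, xfs_verifier_error() and __this_address are real XFS interfaces; the example structure, names and magic constants are invented for illustration):

struct example_hdr {
        __be32          magic;          /* hypothetical on-disk header */
};

static void xfs_example_verify_read(struct xfs_buf *bp)
{
        struct example_hdr *hdr = bp->b_addr;

        /* hdr->magic is passed straight from disk, still big-endian. */
        if (!xfs_verify_magic(bp, hdr->magic))
                xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

const struct xfs_buf_ops xfs_example_buf_ops = {
        .name           = "example",
        .magic          = { cpu_to_be32(0x58455634),    /* v4 magic */
                            cpu_to_be32(0x58455635) },  /* v5 magic */
        .verify_read    = xfs_example_verify_read,
};
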
    2568             : 
    2569             : /* Return the number of sectors for a buffer target. */
    2570             : xfs_daddr_t
    2571  1336966698 : xfs_buftarg_nr_sectors(
    2572             :         struct xfs_buftarg      *btp)
    2573             : {
    2574  1336966698 :         if (btp->bt_flags & XFS_BUFTARG_XFILE)
    2575  1336966698 :                 return xfile_buftarg_nr_sectors(btp);
    2576           0 :         return bdev_nr_sectors(btp->bt_bdev);
    2577             : }

Generated by: LCOV version 1.14