LCOV - code coverage report
Current view: top level - fs/xfs - xfs_buf.c (source / functions)
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023
Date: 2023-07-31 20:08:22
                 Hit     Total    Coverage
Lines:           837     923      90.7 %
Functions:        69      71      97.2 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include <linux/backing-dev.h>
       8             : #include <linux/dax.h>
       9             : 
      10             : #include "xfs_shared.h"
      11             : #include "xfs_format.h"
      12             : #include "xfs_log_format.h"
      13             : #include "xfs_trans_resv.h"
      14             : #include "xfs_mount.h"
      15             : #include "xfs_trace.h"
      16             : #include "xfs_log.h"
      17             : #include "xfs_log_recover.h"
      18             : #include "xfs_log_priv.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_buf_item.h"
      21             : #include "xfs_errortag.h"
      22             : #include "xfs_error.h"
      23             : #include "xfs_ag.h"
      24             : 
      25             : struct kmem_cache *xfs_buf_cache;
      26             : 
      27             : /*
      28             :  * Locking orders
      29             :  *
      30             :  * xfs_buf_ioacct_inc:
      31             :  * xfs_buf_ioacct_dec:
      32             :  *      b_sema (caller holds)
      33             :  *        b_lock
      34             :  *
      35             :  * xfs_buf_stale:
      36             :  *      b_sema (caller holds)
      37             :  *        b_lock
      38             :  *          lru_lock
      39             :  *
      40             :  * xfs_buf_rele:
      41             :  *      b_lock
      42             :  *        pag_buf_lock
      43             :  *          lru_lock
      44             :  *
      45             :  * xfs_buftarg_drain_rele
      46             :  *      lru_lock
      47             :  *        b_lock (trylock due to inversion)
      48             :  *
      49             :  * xfs_buftarg_isolate
      50             :  *      lru_lock
      51             :  *        b_lock (trylock due to inversion)
      52             :  */
      53             : 
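/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. The two LRU walkers
 * named above run with lru_lock already held by the list_lru code, so they
 * may only *trylock* b_lock; taking it unconditionally would invert the
 * b_lock -> lru_lock order used by xfs_buf_rele() and xfs_buf_stale() and
 * could deadlock. Roughly (callback signature as of this kernel's
 * list_lru_walk_cb; the example_isolate name and body are hypothetical):
 */
static enum lru_status
example_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);

	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;	/* come back on a later walk */

	/* ... examine or dispose of bp under b_lock ... */

	spin_unlock(&bp->b_lock);
	return LRU_ROTATE;
}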
      54             : static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
      55             : 
      56             : static inline int
      57    29768477 : xfs_buf_submit(
      58             :         struct xfs_buf          *bp)
      59             : {
      60    29768477 :         return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
      61             : }
      62             : 
      63             : static inline int
      64             : xfs_buf_is_vmapped(
      65             :         struct xfs_buf  *bp)
      66             : {
      67             :         /*
      68             :          * Return true if the buffer is vmapped.
      69             :          *
       70             :          * b_addr is null if the buffer is not mapped. However, a single-page
       71             :          * buffer gets its b_addr straight from page_address() without being
       72             :          * vmapped, so the check has to test both b_addr and bp->b_page_count > 1.
      73             :          */
      74    55765522 :         return bp->b_addr && bp->b_page_count > 1;
      75             : }
      76             : 
      77             : static inline int
      78             : xfs_buf_vmap_len(
      79             :         struct xfs_buf  *bp)
      80             : {
      81             :         return (bp->b_page_count * PAGE_SIZE);
      82             : }
      83             : 
      84             : /*
      85             :  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
      86             :  * this buffer. The count is incremented once per buffer (per hold cycle)
      87             :  * because the corresponding decrement is deferred to buffer release. Buffers
      88             :  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
       89             :  * tracking adds unnecessary overhead. This is used for synchronization purposes
      90             :  * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
      91             :  * in-flight buffers.
      92             :  *
      93             :  * Buffers that are never released (e.g., superblock, iclog buffers) must set
      94             :  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
      95             :  * never reaches zero and unmount hangs indefinitely.
      96             :  */
      97             : static inline void
      98    83497989 : xfs_buf_ioacct_inc(
      99             :         struct xfs_buf  *bp)
     100             : {
     101    83497989 :         if (bp->b_flags & XBF_NO_IOACCT)
     102             :                 return;
     103             : 
     104    83252516 :         ASSERT(bp->b_flags & XBF_ASYNC);
     105    83252516 :         spin_lock(&bp->b_lock);
     106    83252635 :         if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
     107    79741417 :                 bp->b_state |= XFS_BSTATE_IN_FLIGHT;
     108    79741417 :                 percpu_counter_inc(&bp->b_target->bt_io_count);
     109             :         }
     110    83252538 :         spin_unlock(&bp->b_lock);
     111             : }
     112             : 
     113             : /*
     114             :  * Clear the in-flight state on a buffer about to be released to the LRU or
     115             :  * freed and unaccount from the buftarg.
     116             :  */
     117             : static inline void
     118  8350534547 : __xfs_buf_ioacct_dec(
     119             :         struct xfs_buf  *bp)
     120             : {
     121  8350534547 :         lockdep_assert_held(&bp->b_lock);
     122             : 
     123  8350534547 :         if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
     124    79741366 :                 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
     125    79741366 :                 percpu_counter_dec(&bp->b_target->bt_io_count);
     126             :         }
     127  8350534431 : }
     128             : 
     129             : static inline void
     130      489053 : xfs_buf_ioacct_dec(
     131             :         struct xfs_buf  *bp)
     132             : {
     133      489053 :         spin_lock(&bp->b_lock);
     134      489053 :         __xfs_buf_ioacct_dec(bp);
     135      489053 :         spin_unlock(&bp->b_lock);
     136      489053 : }
     137             : 
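/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. Because the
 * in-flight count is kept per buftarg rather than per buffer, a quiesce or
 * unmount path can simply wait for it to reach zero. A simplified waiter
 * (the function name and the msleep() back-off are hypothetical;
 * xfs_buftarg_drain() is the real consumer) might look like:
 */
static void
example_wait_for_buftarg_io(
	struct xfs_buftarg	*btp)
{
	/*
	 * percpu_counter_sum() folds in the per-cpu deltas, so a zero result
	 * means no buffer currently holds XFS_BSTATE_IN_FLIGHT.
	 */
	while (percpu_counter_sum(&btp->bt_io_count) != 0)
		msleep(100);
}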
     138             : /*
     139             :  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
     140             :  * b_lru_ref count so that the buffer is freed immediately when the buffer
     141             :  * reference count falls to zero. If the buffer is already on the LRU, we need
     142             :  * to remove the reference that LRU holds on the buffer.
     143             :  *
     144             :  * This prevents build-up of stale buffers on the LRU.
     145             :  */
     146             : void
     147    26771590 : xfs_buf_stale(
     148             :         struct xfs_buf  *bp)
     149             : {
     150    26771590 :         ASSERT(xfs_buf_islocked(bp));
     151             : 
     152    26771590 :         bp->b_flags |= XBF_STALE;
     153             : 
     154             :         /*
     155             :          * Clear the delwri status so that a delwri queue walker will not
     156             :          * flush this buffer to disk now that it is stale. The delwri queue has
     157             :          * a reference to the buffer, so this is safe to do.
     158             :          */
     159    26771590 :         bp->b_flags &= ~_XBF_DELWRI_Q;
     160             : 
     161             :         /*
     162             :          * Once the buffer is marked stale and unlocked, a subsequent lookup
     163             :          * could reset b_flags. There is no guarantee that the buffer is
     164             :          * unaccounted (released to LRU) before that occurs. Drop in-flight
     165             :          * status now to preserve accounting consistency.
     166             :          */
     167    26771590 :         spin_lock(&bp->b_lock);
     168    26818029 :         __xfs_buf_ioacct_dec(bp);
     169             : 
     170    26791654 :         atomic_set(&bp->b_lru_ref, 0);
     171    53615776 :         if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
     172    26791611 :             (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
     173     6104371 :                 atomic_dec(&bp->b_hold);
     174             : 
     175    26824253 :         ASSERT(atomic_read(&bp->b_hold) >= 1);
     176    26824253 :         spin_unlock(&bp->b_lock);
     177    26824656 : }
     178             : 
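/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. A caller that wants
 * a buffer dropped from cache (a failed read, a freed extent) marks it stale
 * while still holding the buffer lock and then releases it; the zeroed
 * b_lru_ref means the final xfs_buf_rele() frees the buffer instead of
 * parking it on the LRU. This mirrors the error path in xfs_buf_read_map()
 * further down in this file; the function name is hypothetical.
 */
static void
example_invalidate(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags &= ~XBF_DONE;	/* contents are no longer trustworthy */
	xfs_buf_stale(bp);
	xfs_buf_relse(bp);		/* unlock and drop this reference */
}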
     179             : static int
     180    56946235 : xfs_buf_get_maps(
     181             :         struct xfs_buf          *bp,
     182             :         int                     map_count)
     183             : {
     184    56946235 :         ASSERT(bp->b_maps == NULL);
     185    56946235 :         bp->b_map_count = map_count;
     186             : 
     187    56946235 :         if (map_count == 1) {
     188    56946051 :                 bp->b_maps = &bp->__b_map;
     189    56946051 :                 return 0;
     190             :         }
     191             : 
     192         184 :         bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
     193             :                                 KM_NOFS);
     194         184 :         if (!bp->b_maps)
     195           0 :                 return -ENOMEM;
     196             :         return 0;
     197             : }
     198             : 
     199             : /*
      200             :  *      Frees b_maps if it was allocated separately from the embedded __b_map.
     201             :  */
     202             : static void
     203             : xfs_buf_free_maps(
     204             :         struct xfs_buf  *bp)
     205             : {
     206    56920302 :         if (bp->b_maps != &bp->__b_map) {
     207         184 :                 kmem_free(bp->b_maps);
     208         184 :                 bp->b_maps = NULL;
     209             :         }
     210             : }
     211             : 
     212             : static int
     213    56928216 : _xfs_buf_alloc(
     214             :         struct xfs_buftarg      *target,
     215             :         struct xfs_buf_map      *map,
     216             :         int                     nmaps,
     217             :         xfs_buf_flags_t         flags,
     218             :         struct xfs_buf          **bpp)
     219             : {
     220    56928216 :         struct xfs_buf          *bp;
     221    56928216 :         int                     error;
     222    56928216 :         int                     i;
     223             : 
     224    56928216 :         *bpp = NULL;
     225    56928216 :         bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
     226             : 
     227             :         /*
     228             :          * We don't want certain flags to appear in b_flags unless they are
     229             :          * specifically set by later operations on the buffer.
     230             :          */
     231    56958816 :         flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
     232             : 
     233    56958816 :         atomic_set(&bp->b_hold, 1);
     234    56958816 :         atomic_set(&bp->b_lru_ref, 1);
     235    56958816 :         init_completion(&bp->b_iowait);
     236    56948641 :         INIT_LIST_HEAD(&bp->b_lru);
     237    56948641 :         INIT_LIST_HEAD(&bp->b_list);
     238    56948641 :         INIT_LIST_HEAD(&bp->b_li_list);
     239    56948641 :         sema_init(&bp->b_sema, 0); /* held, no waiters */
     240    56948641 :         spin_lock_init(&bp->b_lock);
     241    56946979 :         bp->b_target = target;
     242    56946979 :         bp->b_mount = target->bt_mount;
     243    56946979 :         bp->b_flags = flags;
     244             : 
     245             :         /*
      246             :          * Allocate the buffer maps up front. b_length is accumulated from
      247             :          * the per-map lengths in the loop below; the I/O routines work from
      248             :          * b_length and the individual map lengths.
     249             :          */
     250    56946979 :         error = xfs_buf_get_maps(bp, nmaps);
     251    56932097 :         if (error)  {
     252           0 :                 kmem_cache_free(xfs_buf_cache, bp);
     253           0 :                 return error;
     254             :         }
     255             : 
     256    56932097 :         bp->b_rhash_key = map[0].bm_bn;
     257    56932097 :         bp->b_length = 0;
     258   113861987 :         for (i = 0; i < nmaps; i++) {
     259    56929890 :                 bp->b_maps[i].bm_bn = map[i].bm_bn;
     260    56929890 :                 bp->b_maps[i].bm_len = map[i].bm_len;
     261    56929890 :                 bp->b_length += map[i].bm_len;
     262             :         }
     263             : 
     264    56932097 :         atomic_set(&bp->b_pin_count, 0);
     265    56932097 :         init_waitqueue_head(&bp->b_waiters);
     266             : 
     267    56935783 :         XFS_STATS_INC(bp->b_mount, xb_create);
     268    56937832 :         trace_xfs_buf_init(bp, _RET_IP_);
     269             : 
     270    56935227 :         *bpp = bp;
     271    56935227 :         return 0;
     272             : }
     273             : 
     274             : static void
     275    56685734 : xfs_buf_free_pages(
     276             :         struct xfs_buf  *bp)
     277             : {
     278    56685734 :         uint            i;
     279             : 
     280    56685734 :         ASSERT(bp->b_flags & _XBF_PAGES);
     281             : 
     282    56685734 :         if (xfs_buf_is_vmapped(bp))
     283    12842264 :                 vm_unmap_ram(bp->b_addr, bp->b_page_count);
     284             : 
     285   154583733 :         for (i = 0; i < bp->b_page_count; i++) {
     286    97898286 :                 if (bp->b_pages[i])
     287    97898286 :                         __free_page(bp->b_pages[i]);
     288             :         }
     289    56685447 :         mm_account_reclaimed_pages(bp->b_page_count);
     290             : 
     291    56685482 :         if (bp->b_pages != bp->b_page_array)
     292    13704690 :                 kmem_free(bp->b_pages);
     293    56685483 :         bp->b_pages = NULL;
     294    56685483 :         bp->b_flags &= ~_XBF_PAGES;
     295    56685483 : }
     296             : 
     297             : static void
     298    56920302 : xfs_buf_free_callback(
     299             :         struct callback_head    *cb)
     300             : {
     301    56920302 :         struct xfs_buf          *bp = container_of(cb, struct xfs_buf, b_rcu);
     302             : 
     303    56920302 :         xfs_buf_free_maps(bp);
     304    56920302 :         kmem_cache_free(xfs_buf_cache, bp);
     305    56941683 : }
     306             : 
     307             : static void
     308    56967818 : xfs_buf_free(
     309             :         struct xfs_buf          *bp)
     310             : {
     311    56967818 :         trace_xfs_buf_free(bp, _RET_IP_);
     312             : 
     313    56967453 :         ASSERT(list_empty(&bp->b_lru));
     314             : 
     315    56967453 :         if (bp->b_flags & _XBF_PAGES)
     316    56685803 :                 xfs_buf_free_pages(bp);
     317      281650 :         else if (bp->b_flags & _XBF_KMEM)
     318      281650 :                 kmem_free(bp->b_addr);
     319             : 
     320    56967303 :         call_rcu(&bp->b_rcu, xfs_buf_free_callback);
     321    56967207 : }
     322             : 
     323             : static int
     324      281622 : xfs_buf_alloc_kmem(
     325             :         struct xfs_buf  *bp,
     326             :         xfs_buf_flags_t flags)
     327             : {
     328      281622 :         xfs_km_flags_t  kmflag_mask = KM_NOFS;
     329      281622 :         size_t          size = BBTOB(bp->b_length);
     330             : 
     331             :         /* Assure zeroed buffer for non-read cases. */
     332      281622 :         if (!(flags & XBF_READ))
     333        6067 :                 kmflag_mask |= KM_ZERO;
     334             : 
     335      281622 :         bp->b_addr = kmem_alloc(size, kmflag_mask);
     336      281624 :         if (!bp->b_addr)
     337             :                 return -ENOMEM;
     338             : 
     339      281624 :         if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
     340             :             ((unsigned long)bp->b_addr & PAGE_MASK)) {
     341             :                 /* b_addr spans two pages - use alloc_page instead */
     342           0 :                 kmem_free(bp->b_addr);
     343           0 :                 bp->b_addr = NULL;
     344           0 :                 return -ENOMEM;
     345             :         }
     346      281624 :         bp->b_offset = offset_in_page(bp->b_addr);
     347      281624 :         bp->b_pages = bp->b_page_array;
     348      281624 :         bp->b_pages[0] = kmem_to_page(bp->b_addr);
     349      281595 :         bp->b_page_count = 1;
     350      281595 :         bp->b_flags |= _XBF_KMEM;
     351      281595 :         return 0;
     352             : }
     353             : 
     354             : static int
     355    56650512 : xfs_buf_alloc_pages(
     356             :         struct xfs_buf  *bp,
     357             :         xfs_buf_flags_t flags)
     358             : {
     359    56650512 :         gfp_t           gfp_mask = __GFP_NOWARN;
     360    56650512 :         long            filled = 0;
     361             : 
     362    56650512 :         if (flags & XBF_READ_AHEAD)
     363             :                 gfp_mask |= __GFP_NORETRY;
     364             :         else
     365    37132389 :                 gfp_mask |= GFP_NOFS;
     366             : 
     367             :         /* Make sure that we have a page list */
     368    56650512 :         bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
     369    56650512 :         if (bp->b_page_count <= XB_PAGES) {
     370    42947288 :                 bp->b_pages = bp->b_page_array;
     371             :         } else {
     372    13703224 :                 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
     373             :                                         gfp_mask);
     374    13703146 :                 if (!bp->b_pages)
     375             :                         return -ENOMEM;
     376             :         }
     377    56650434 :         bp->b_flags |= _XBF_PAGES;
     378             : 
     379             :         /* Assure zeroed buffer for non-read cases. */
     380    56650434 :         if (!(flags & XBF_READ))
     381    27395848 :                 gfp_mask |= __GFP_ZERO;
     382             : 
     383             :         /*
     384             :          * Bulk filling of pages can take multiple calls. Not filling the entire
     385             :          * array is not an allocation failure, so don't back off if we get at
     386             :          * least one extra page.
     387             :          */
     388    56650466 :         for (;;) {
     389    56650466 :                 long    last = filled;
     390             : 
     391    56650466 :                 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
     392             :                                                 bp->b_pages);
     393    56656602 :                 if (filled == bp->b_page_count) {
     394    56656570 :                         XFS_STATS_INC(bp->b_mount, xb_page_found);
     395    56658041 :                         break;
     396             :                 }
     397             : 
     398          32 :                 if (filled != last)
     399          32 :                         continue;
     400             : 
     401           0 :                 if (flags & XBF_READ_AHEAD) {
     402           0 :                         xfs_buf_free_pages(bp);
     403           0 :                         return -ENOMEM;
     404             :                 }
     405             : 
     406           0 :                 XFS_STATS_INC(bp->b_mount, xb_page_retries);
     407           0 :                 memalloc_retry_wait(gfp_mask);
     408             :         }
     409    56658041 :         return 0;
     410             : }
     411             : 
     412             : /*
     413             :  *      Map buffer into kernel address-space if necessary.
     414             :  */
     415             : STATIC int
     416   292158670 : _xfs_buf_map_pages(
     417             :         struct xfs_buf          *bp,
     418             :         xfs_buf_flags_t         flags)
     419             : {
     420   292158670 :         ASSERT(bp->b_flags & _XBF_PAGES);
     421   292158670 :         if (bp->b_page_count == 1) {
     422             :                 /* A single page buffer is always mappable */
     423    42854352 :                 bp->b_addr = page_address(bp->b_pages[0]);
     424   249304318 :         } else if (flags & XBF_UNMAPPED) {
     425   236462207 :                 bp->b_addr = NULL;
     426             :         } else {
     427    12842111 :                 int retried = 0;
     428    12842111 :                 unsigned nofs_flag;
     429             : 
     430             :                 /*
     431             :                  * vm_map_ram() will allocate auxiliary structures (e.g.
     432             :                  * pagetables) with GFP_KERNEL, yet we are likely to be under
     433             :                  * GFP_NOFS context here. Hence we need to tell memory reclaim
     434             :                  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
     435             :                  * memory reclaim re-entering the filesystem here and
     436             :                  * potentially deadlocking.
     437             :                  */
     438    12842111 :                 nofs_flag = memalloc_nofs_save();
     439    12842111 :                 do {
     440    12842111 :                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
     441             :                                                 -1);
     442    12842127 :                         if (bp->b_addr)
     443             :                                 break;
     444           0 :                         vm_unmap_aliases();
     445           0 :                 } while (retried++ <= 1);
     446    12842127 :                 memalloc_nofs_restore(nofs_flag);
     447             : 
     448    12842127 :                 if (!bp->b_addr)
     449           0 :                         return -ENOMEM;
     450             :         }
     451             : 
     452             :         return 0;
     453             : }
     454             : 
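/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. The
 * memalloc_nofs_save()/restore() pairing above is the general pattern for
 * calling an interface that allocates with GFP_KERNEL internally while the
 * caller must not re-enter the filesystem. In isolation (function name
 * hypothetical):
 */
static void *
example_nofs_vmalloc(
	size_t			size)
{
	unsigned int		nofs_flag;
	void			*p;

	nofs_flag = memalloc_nofs_save();	/* reclaim must not recurse into the fs */
	p = vmalloc(size);			/* may allocate page tables with GFP_KERNEL */
	memalloc_nofs_restore(nofs_flag);

	return p;
}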
     455             : /*
     456             :  *      Finding and Reading Buffers
     457             :  */
     458             : static int
     459 30535556347 : _xfs_buf_obj_cmp(
     460             :         struct rhashtable_compare_arg   *arg,
     461             :         const void                      *obj)
     462             : {
     463 30535556347 :         const struct xfs_buf_map        *map = arg->key;
     464 30535556347 :         const struct xfs_buf            *bp = obj;
     465             : 
     466             :         /*
     467             :          * The key hashing in the lookup path depends on the key being the
     468             :          * first element of the compare_arg, make sure to assert this.
     469             :          */
     470 30535556347 :         BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
     471             : 
     472 30535556347 :         if (bp->b_rhash_key != map->bm_bn)
     473             :                 return 1;
     474             : 
     475 23858573419 :         if (unlikely(bp->b_length != map->bm_len)) {
     476             :                 /*
     477             :                  * found a block number match. If the range doesn't
     478             :                  * match, the only way this is allowed is if the buffer
     479             :                  * in the cache is stale and the transaction that made
     480             :                  * it stale has not yet committed. i.e. we are
     481             :                  * reallocating a busy extent. Skip this buffer and
     482             :                  * continue searching for an exact match.
     483             :                  */
     484          42 :                 ASSERT(bp->b_flags & XBF_STALE);
     485          42 :                 return 1;
     486             :         }
     487             :         return 0;
     488             : }
     489             : 
     490             : static const struct rhashtable_params xfs_buf_hash_params = {
     491             :         .min_size               = 32,   /* empty AGs have minimal footprint */
     492             :         .nelem_hint             = 16,
     493             :         .key_len                = sizeof(xfs_daddr_t),
     494             :         .key_offset             = offsetof(struct xfs_buf, b_rhash_key),
     495             :         .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
     496             :         .automatic_shrinking    = true,
     497             :         .obj_cmpfn              = _xfs_buf_obj_cmp,
     498             : };
     499             : 
     500             : int
     501      444281 : xfs_buf_hash_init(
     502             :         struct xfs_perag        *pag)
     503             : {
     504      444281 :         spin_lock_init(&pag->pag_buf_lock);
     505      444281 :         return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
     506             : }
     507             : 
     508             : void
     509      444321 : xfs_buf_hash_destroy(
     510             :         struct xfs_perag        *pag)
     511             : {
     512      444321 :         rhashtable_destroy(&pag->pag_buf_hash);
     513      444321 : }
     514             : 
     515             : static int
     516 23918897128 : xfs_buf_map_verify(
     517             :         struct xfs_buftarg      *btp,
     518             :         struct xfs_buf_map      *map)
     519             : {
     520 23918897128 :         xfs_daddr_t             eofs;
     521             : 
     522             :         /* Check for IOs smaller than the sector size / not sector aligned */
     523 23918897128 :         ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
     524 23918897128 :         ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
     525             : 
     526             :         /*
     527             :          * Corrupted block numbers can get through to here, unfortunately, so we
     528             :          * have to check that the buffer falls within the filesystem bounds.
     529             :          */
     530 23918897128 :         eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
     531 23918897128 :         if (map->bm_bn < 0 || map->bm_bn >= eofs) {
     532           0 :                 xfs_alert(btp->bt_mount,
     533             :                           "%s: daddr 0x%llx out of range, EOFS 0x%llx",
     534             :                           __func__, map->bm_bn, eofs);
     535           0 :                 WARN_ON(1);
     536           0 :                 return -EFSCORRUPTED;
     537             :         }
     538             :         return 0;
     539             : }
     540             : 
     541             : static int
     542 23883431118 : xfs_buf_find_lock(
     543             :         struct xfs_buf          *bp,
     544             :         xfs_buf_flags_t         flags)
     545             : {
     546 23883431118 :         if (flags & XBF_TRYLOCK) {
     547  4563306388 :                 if (!xfs_buf_trylock(bp)) {
     548   120288963 :                         XFS_STATS_INC(bp->b_mount, xb_busy_locked);
     549   120288918 :                         return -EAGAIN;
     550             :                 }
     551             :         } else {
     552 19320124730 :                 xfs_buf_lock(bp);
     553 19291356519 :                 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
     554             :         }
     555             : 
     556             :         /*
     557             :          * if the buffer is stale, clear all the external state associated with
     558             :          * it. We need to keep flags such as how we allocated the buffer memory
     559             :          * intact here.
     560             :          */
     561 23735566375 :         if (bp->b_flags & XBF_STALE) {
     562       36537 :                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
     563       36537 :                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
     564       36537 :                 bp->b_ops = NULL;
     565             :         }
     566             :         return 0;
     567             : }
     568             : 
     569             : static inline int
     570 23927336802 : xfs_buf_lookup(
     571             :         struct xfs_perag        *pag,
     572             :         struct xfs_buf_map      *map,
     573             :         xfs_buf_flags_t         flags,
     574             :         struct xfs_buf          **bpp)
     575             : {
     576 23927336802 :         struct xfs_buf          *bp;
     577 23927336802 :         int                     error;
     578             : 
     579 23927336802 :         rcu_read_lock();
     580 23924591626 :         bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
     581 47800868153 :         if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
     582    56462052 :                 rcu_read_unlock();
     583    56462052 :                 return -ENOENT;
     584             :         }
     585 23886077710 :         rcu_read_unlock();
     586             : 
     587 23885588535 :         error = xfs_buf_find_lock(bp, flags);
     588 23852162479 :         if (error) {
     589   120289240 :                 xfs_buf_rele(bp);
     590   120289240 :                 return error;
     591             :         }
     592             : 
     593 23731873239 :         trace_xfs_buf_find(bp, flags, _RET_IP_);
     594 23733080320 :         *bpp = bp;
     595 23733080320 :         return 0;
     596             : }
     597             : 
     598             : /*
     599             :  * Insert the new_bp into the hash table. This consumes the perag reference
     600             :  * taken for the lookup regardless of the result of the insert.
     601             :  */
     602             : static int
     603    56448379 : xfs_buf_find_insert(
     604             :         struct xfs_buftarg      *btp,
     605             :         struct xfs_perag        *pag,
     606             :         struct xfs_buf_map      *cmap,
     607             :         struct xfs_buf_map      *map,
     608             :         int                     nmaps,
     609             :         xfs_buf_flags_t         flags,
     610             :         struct xfs_buf          **bpp)
     611             : {
     612    56448379 :         struct xfs_buf          *new_bp;
     613    56448379 :         struct xfs_buf          *bp;
     614    56448379 :         int                     error;
     615             : 
     616    56448379 :         error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
     617    56449228 :         if (error)
     618           0 :                 goto out_drop_pag;
     619             : 
     620             :         /*
     621             :          * For buffers that fit entirely within a single page, first attempt to
     622             :          * allocate the memory from the heap to minimise memory usage. If we
     623             :          * can't get heap memory for these small buffers, we fall back to using
     624             :          * the page allocator.
     625             :          */
     626    56730827 :         if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
     627      281616 :             xfs_buf_alloc_kmem(new_bp, flags) < 0) {
     628    56167612 :                 error = xfs_buf_alloc_pages(new_bp, flags);
     629    56146138 :                 if (error)
     630           0 :                         goto out_free_buf;
     631             :         }
     632             : 
     633    56427737 :         spin_lock(&pag->pag_buf_lock);
     634    56451397 :         bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
     635             :                         &new_bp->b_rhash_head, xfs_buf_hash_params);
     636    56425640 :         if (IS_ERR(bp)) {
     637           0 :                 error = PTR_ERR(bp);
     638           0 :                 spin_unlock(&pag->pag_buf_lock);
     639           0 :                 goto out_free_buf;
     640             :         }
     641    56425640 :         if (bp) {
     642             :                 /* found an existing buffer */
     643        1337 :                 atomic_inc(&bp->b_hold);
     644        1337 :                 spin_unlock(&pag->pag_buf_lock);
     645        1337 :                 error = xfs_buf_find_lock(bp, flags);
     646        1337 :                 if (error)
     647           1 :                         xfs_buf_rele(bp);
     648             :                 else
     649        1336 :                         *bpp = bp;
     650        1337 :                 goto out_free_buf;
     651             :         }
     652             : 
     653             :         /* The new buffer keeps the perag reference until it is freed. */
     654    56424303 :         new_bp->b_pag = pag;
     655    56424303 :         spin_unlock(&pag->pag_buf_lock);
     656    56466788 :         *bpp = new_bp;
     657    56466788 :         return 0;
     658             : 
     659        1337 : out_free_buf:
     660        1337 :         xfs_buf_free(new_bp);
     661        1337 : out_drop_pag:
     662        1337 :         xfs_perag_put(pag);
     663        1337 :         return error;
     664             : }
     665             : 
     666             : /*
     667             :  * Assembles a buffer covering the specified range. The code is optimised for
     668             :  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
     669             :  * more hits than misses.
     670             :  */
     671             : int
     672 23928799482 : xfs_buf_get_map(
     673             :         struct xfs_buftarg      *btp,
     674             :         struct xfs_buf_map      *map,
     675             :         int                     nmaps,
     676             :         xfs_buf_flags_t         flags,
     677             :         struct xfs_buf          **bpp)
     678             : {
     679 23928799482 :         struct xfs_perag        *pag;
     680 23928799482 :         struct xfs_buf          *bp = NULL;
     681 23928799482 :         struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
     682 23928799482 :         int                     error;
     683 23928799482 :         int                     i;
     684             : 
     685 47851237954 :         for (i = 0; i < nmaps; i++)
     686 23922438472 :                 cmap.bm_len += map[i].bm_len;
     687             : 
     688 23928799482 :         error = xfs_buf_map_verify(btp, &cmap);
     689 23911942284 :         if (error)
     690             :                 return error;
     691             : 
     692 23911929364 :         pag = xfs_perag_get(btp->bt_mount,
     693             :                             xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
     694             : 
     695 23936007829 :         error = xfs_buf_lookup(pag, &cmap, flags, &bp);
     696 23909801178 :         if (error && error != -ENOENT)
     697   120293563 :                 goto out_put_perag;
     698             : 
     699             :         /* cache hits always outnumber misses by at least 10:1 */
     700 23789507615 :         if (unlikely(!bp)) {
     701    56446704 :                 XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
     702             : 
     703    56459216 :                 if (flags & XBF_INCORE)
     704        2255 :                         goto out_put_perag;
     705             : 
     706             :                 /* xfs_buf_find_insert() consumes the perag reference. */
     707    56456961 :                 error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
     708             :                                 flags, &bp);
     709    56465075 :                 if (error)
     710             :                         return error;
     711             :         } else {
     712 23733060911 :                 XFS_STATS_INC(btp->bt_mount, xb_get_locked);
     713 23734861378 :                 xfs_perag_put(pag);
     714             :         }
     715             : 
     716             :         /* We do not hold a perag reference anymore. */
     717 23817453802 :         if (!bp->b_addr) {
     718   292056663 :                 error = _xfs_buf_map_pages(bp, flags);
     719   291563828 :                 if (unlikely(error)) {
     720           0 :                         xfs_warn_ratelimited(btp->bt_mount,
     721             :                                 "%s: failed to map %u pages", __func__,
     722             :                                 bp->b_page_count);
     723           0 :                         xfs_buf_relse(bp);
     724           0 :                         return error;
     725             :                 }
     726             :         }
     727             : 
     728             :         /*
     729             :          * Clear b_error if this is a lookup from a caller that doesn't expect
     730             :          * valid data to be found in the buffer.
     731             :          */
     732 23816960967 :         if (!(flags & XBF_READ))
     733    44827775 :                 xfs_buf_ioerror(bp, 0);
     734             : 
     735 23816877118 :         XFS_STATS_INC(btp->bt_mount, xb_get);
     736 23814641436 :         trace_xfs_buf_get(bp, flags, _RET_IP_);
     737 23808224629 :         *bpp = bp;
     738 23808224629 :         return 0;
     739             : 
     740   120295818 : out_put_perag:
     741   120295818 :         xfs_perag_put(pag);
     742   120295818 :         return error;
     743             : }
     744             : 
     745             : int
     746    29547325 : _xfs_buf_read(
     747             :         struct xfs_buf          *bp,
     748             :         xfs_buf_flags_t         flags)
     749             : {
     750    29547325 :         ASSERT(!(flags & XBF_WRITE));
     751    29547325 :         ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
     752             : 
     753    29547325 :         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
     754    29547325 :         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
     755             : 
     756    29547325 :         return xfs_buf_submit(bp);
     757             : }
     758             : 
     759             : /*
     760             :  * Reverify a buffer found in cache without an attached ->b_ops.
     761             :  *
     762             :  * If the caller passed an ops structure and the buffer doesn't have ops
     763             :  * assigned, set the ops and use it to verify the contents. If verification
     764             :  * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
     765             :  * already in XBF_DONE state on entry.
     766             :  *
     767             :  * Under normal operations, every in-core buffer is verified on read I/O
     768             :  * completion. There are two scenarios that can lead to in-core buffers without
     769             :  * an assigned ->b_ops. The first is during log recovery of buffers on a V4
     770             :  * filesystem, though these buffers are purged at the end of recovery. The
     771             :  * other is online repair, which intentionally reads with a NULL buffer ops to
     772             :  * run several verifiers across an in-core buffer in order to establish buffer
     773             :  * type.  If repair can't establish that, the buffer will be left in memory
     774             :  * with NULL buffer ops.
     775             :  */
     776             : int
     777 24651246947 : xfs_buf_reverify(
     778             :         struct xfs_buf          *bp,
     779             :         const struct xfs_buf_ops *ops)
     780             : {
     781 24651246947 :         ASSERT(bp->b_flags & XBF_DONE);
     782 24651246947 :         ASSERT(bp->b_error == 0);
     783             : 
     784 24651246947 :         if (!ops || bp->b_ops)
     785             :                 return 0;
     786             : 
     787         306 :         bp->b_ops = ops;
     788         306 :         bp->b_ops->verify_read(bp);
     789         306 :         if (bp->b_error)
     790         306 :                 bp->b_flags &= ~XBF_DONE;
     791             :         return bp->b_error;
     792             : }
     793             : 
     794             : int
     795 23881721974 : xfs_buf_read_map(
     796             :         struct xfs_buftarg      *target,
     797             :         struct xfs_buf_map      *map,
     798             :         int                     nmaps,
     799             :         xfs_buf_flags_t         flags,
     800             :         struct xfs_buf          **bpp,
     801             :         const struct xfs_buf_ops *ops,
     802             :         xfs_failaddr_t          fa)
     803             : {
     804 23881721974 :         struct xfs_buf          *bp;
     805 23881721974 :         int                     error;
     806             : 
     807 23881721974 :         flags |= XBF_READ;
     808 23881721974 :         *bpp = NULL;
     809             : 
     810 23881721974 :         error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
     811 23884551789 :         if (error)
     812             :                 return error;
     813             : 
     814 23764899881 :         trace_xfs_buf_read(bp, flags, _RET_IP_);
     815             : 
     816 23757049316 :         if (!(bp->b_flags & XBF_DONE)) {
     817             :                 /* Initiate the buffer read and wait. */
     818    29534335 :                 XFS_STATS_INC(target->bt_mount, xb_get_read);
     819    29535107 :                 bp->b_ops = ops;
     820    29535107 :                 error = _xfs_buf_read(bp, flags);
     821             : 
     822             :                 /* Readahead iodone already dropped the buffer, so exit. */
     823    28520908 :                 if (flags & XBF_ASYNC)
     824             :                         return 0;
     825             :         } else {
     826             :                 /* Buffer already read; all we need to do is check it. */
     827 23727514981 :                 error = xfs_buf_reverify(bp, ops);
     828             : 
     829             :                 /* Readahead already finished; drop the buffer and exit. */
     830 23727011355 :                 if (flags & XBF_ASYNC) {
     831  4363979629 :                         xfs_buf_relse(bp);
     832  4362757263 :                         return 0;
     833             :                 }
     834             : 
     835             :                 /* We do not want read in the flags */
     836 19363031726 :                 bp->b_flags &= ~XBF_READ;
     837 19363031726 :                 ASSERT(bp->b_ops != NULL || ops == NULL);
     838             :         }
     839             : 
     840             :         /*
     841             :          * If we've had a read error, then the contents of the buffer are
     842             :          * invalid and should not be used. To ensure that a followup read tries
     843             :          * to pull the buffer from disk again, we clear the XBF_DONE flag and
     844             :          * mark the buffer stale. This ensures that anyone who has a current
      845             :  * reference to the buffer will interpret its contents correctly and
     846             :          * future cache lookups will also treat it as an empty, uninitialised
     847             :          * buffer.
     848             :          */
     849 19371897363 :         if (error) {
     850             :                 /*
     851             :                  * Check against log shutdown for error reporting because
     852             :                  * metadata writeback may require a read first and we need to
     853             :                  * report errors in metadata writeback until the log is shut
     854             :                  * down. High level transaction read functions already check
     855             :                  * against mount shutdown, anyway, so we only need to be
     856             :                  * concerned about low level IO interactions here.
     857             :                  */
     858       70176 :                 if (!xlog_is_shutdown(target->bt_mount->m_log))
     859       19381 :                         xfs_buf_ioerror_alert(bp, fa);
     860             : 
     861       35104 :                 bp->b_flags &= ~XBF_DONE;
     862       35104 :                 xfs_buf_stale(bp);
     863       35108 :                 xfs_buf_relse(bp);
     864             : 
     865             :                 /* bad CRC means corrupted metadata */
     866       35105 :                 if (error == -EFSBADCRC)
     867        2281 :                         error = -EFSCORRUPTED;
     868       35105 :                 return error;
     869             :         }
     870             : 
     871 19371862275 :         *bpp = bp;
     872 19371862275 :         return 0;
     873             : }
     874             : 
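/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. A typical cached,
 * blocking metadata read pairs xfs_buf_read_map() with xfs_buf_relse() once
 * the caller is done with the contents. The verifier is whatever matches the
 * on-disk structure being read; xfs_sb_buf_ops is only an example here, and
 * the function name is hypothetical.
 */
static int
example_read_one(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	size_t			numblks)
{
	DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_read_map(btp, &map, 1, 0, &bp, &xfs_sb_buf_ops,
			__this_address);
	if (error)
		return error;

	/* ... use bp->b_addr / bp->b_maps ... */

	xfs_buf_relse(bp);
	return 0;
}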
     875             : /*
      876             :  *      Issue readahead in a deadlock-safe manner: trylock the buffer and
      877             :  *      let the page allocation back off rather than block when memory is low.
     878             :  */
     879             : void
     880  4497039163 : xfs_buf_readahead_map(
     881             :         struct xfs_buftarg      *target,
     882             :         struct xfs_buf_map      *map,
     883             :         int                     nmaps,
     884             :         const struct xfs_buf_ops *ops)
     885             : {
     886  4497039163 :         struct xfs_buf          *bp;
     887             : 
     888  8993989803 :         xfs_buf_read_map(target, map, nmaps,
     889             :                      XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
     890  4497039163 :                      __this_address);
     891  4498544140 : }
     892             : 
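/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. Readahead is
 * fire-and-forget: the caller never sees the buffer, and a later blocking
 * read of the same range finds it already XBF_DONE in the cache. Roughly
 * (function name hypothetical):
 */
static void
example_prefetch(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	size_t			numblks,
	const struct xfs_buf_ops *ops)
{
	DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);

	xfs_buf_readahead_map(btp, &map, 1, ops);
}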
     893             : /*
     894             :  * Read an uncached buffer from disk. Allocates and returns a locked
     895             :  * buffer containing the disk contents or nothing. Uncached buffers always have
     896             :  * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
     897             :  * is cached or uncached during fault diagnosis.
     898             :  */
     899             : int
     900      214079 : xfs_buf_read_uncached(
     901             :         struct xfs_buftarg      *target,
     902             :         xfs_daddr_t             daddr,
     903             :         size_t                  numblks,
     904             :         xfs_buf_flags_t         flags,
     905             :         struct xfs_buf          **bpp,
     906             :         const struct xfs_buf_ops *ops)
     907             : {
     908      214079 :         struct xfs_buf          *bp;
     909      214079 :         int                     error;
     910             : 
     911      214079 :         *bpp = NULL;
     912             : 
     913      214079 :         error = xfs_buf_get_uncached(target, numblks, flags, &bp);
     914      214079 :         if (error)
     915             :                 return error;
     916             : 
     917             :         /* set up the buffer for a read IO */
     918      214079 :         ASSERT(bp->b_map_count == 1);
     919      214079 :         bp->b_rhash_key = XFS_BUF_DADDR_NULL;
     920      214079 :         bp->b_maps[0].bm_bn = daddr;
     921      214079 :         bp->b_flags |= XBF_READ;
     922      214079 :         bp->b_ops = ops;
     923             : 
     924      214079 :         xfs_buf_submit(bp);
     925      214079 :         if (bp->b_error) {
     926         585 :                 error = bp->b_error;
     927         585 :                 xfs_buf_relse(bp);
     928         585 :                 return error;
     929             :         }
     930             : 
     931      213494 :         *bpp = bp;
     932      213494 :         return 0;
     933             : }
     934             : 
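/*
 * Editor's illustrative sketch -- not part of xfs_buf.c. Uncached reads are
 * for one-off accesses that should bypass the per-AG cache, such as probing
 * a specific sector of the device. The buffer comes back locked and is
 * released like any other; the function name is hypothetical.
 */
static int
example_probe_daddr(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_read_uncached(btp, daddr, 1, 0, &bp, NULL);
	if (error)
		return error;

	/* ... inspect bp->b_addr ... */

	xfs_buf_relse(bp);
	return 0;
}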
     935             : int
     936      489046 : xfs_buf_get_uncached(
     937             :         struct xfs_buftarg      *target,
     938             :         size_t                  numblks,
     939             :         xfs_buf_flags_t         flags,
     940             :         struct xfs_buf          **bpp)
     941             : {
     942      489046 :         int                     error;
     943      489046 :         struct xfs_buf          *bp;
     944      489046 :         DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
     945             : 
     946      489046 :         *bpp = NULL;
     947             : 
     948             :         /* flags might contain irrelevant bits, pass only what we care about */
     949      489046 :         error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
     950      489046 :         if (error)
     951             :                 return error;
     952             : 
     953      489046 :         error = xfs_buf_alloc_pages(bp, flags);
     954      489046 :         if (error)
     955           0 :                 goto fail_free_buf;
     956             : 
     957      489046 :         error = _xfs_buf_map_pages(bp, 0);
     958      489046 :         if (unlikely(error)) {
     959           0 :                 xfs_warn(target->bt_mount,
     960             :                         "%s: failed to map pages", __func__);
     961           0 :                 goto fail_free_buf;
     962             :         }
     963             : 
     964      489046 :         trace_xfs_buf_get_uncached(bp, _RET_IP_);
     965      489046 :         *bpp = bp;
     966      489046 :         return 0;
     967             : 
     968           0 : fail_free_buf:
     969           0 :         xfs_buf_free(bp);
     970           0 :         return error;
     971             : }
     972             : 
     973             : /*
     974             :  *      Increment reference count on buffer, to hold the buffer concurrently
     975             :  *      with another thread which may release (free) the buffer asynchronously.
     976             :  *      Must hold the buffer already to call this function.
     977             :  */
     978             : void
     979  7209898572 : xfs_buf_hold(
     980             :         struct xfs_buf          *bp)
     981             : {
     982  7209898572 :         trace_xfs_buf_hold(bp, _RET_IP_);
     983  7214931241 :         atomic_inc(&bp->b_hold);
     984  7225396211 : }
     985             : 
     986             : /*
     987             :  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
     988             :  * placed on LRU or freed (depending on b_lru_ref).
     989             :  */
     990             : void
     991 31245111983 : xfs_buf_rele(
     992             :         struct xfs_buf          *bp)
     993             : {
     994 31245111983 :         struct xfs_perag        *pag = bp->b_pag;
     995 31245111983 :         bool                    release;
     996 31245111983 :         bool                    freebuf = false;
     997             : 
     998 31245111983 :         trace_xfs_buf_rele(bp, _RET_IP_);
     999             : 
    1000 31242480173 :         if (!pag) {
    1001    85103264 :                 ASSERT(list_empty(&bp->b_lru));
    1002    85103264 :                 if (atomic_dec_and_test(&bp->b_hold)) {
    1003      489053 :                         xfs_buf_ioacct_dec(bp);
    1004      489053 :                         xfs_buf_free(bp);
    1005             :                 }
    1006    85103276 :                 return;
    1007             :         }
    1008             : 
    1009 31157376909 :         ASSERT(atomic_read(&bp->b_hold) > 0);
    1010             : 
    1011             :         /*
    1012             :          * We grab the b_lock here first to serialise racing xfs_buf_rele()
    1013             :          * calls. The pag_buf_lock being taken on the last reference only
    1014             :          * serialises against racing lookups in xfs_buf_find(). IOWs, the second
    1015             :          * to last reference we drop here is not serialised against the last
    1016             :          * reference until we take bp->b_lock. Hence if we don't grab b_lock
    1017             :          * first, the last "release" reference can win the race to the lock and
    1018             :          * free the buffer before the second-to-last reference is processed,
    1019             :          * leading to a use-after-free scenario.
    1020             :          */
    1021 31157376909 :         spin_lock(&bp->b_lock);
    1022 31206067950 :         release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
    1023 31204663856 :         if (!release) {
    1024             :                 /*
    1025             :                  * Drop the in-flight state if the buffer is already on the LRU
    1026             :                  * and it holds the only reference. This is racy because we
     1027             :          * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT
     1028             :          * ensures the decrement occurs only once per buffer.
    1029             :                  */
    1030 31110036446 :                 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
    1031  8229190104 :                         __xfs_buf_ioacct_dec(bp);
    1032 31105212753 :                 goto out_unlock;
    1033             :         }
    1034             : 
    1035             :         /* the last reference has been dropped ... */
    1036    94627410 :         __xfs_buf_ioacct_dec(bp);
    1037    94626901 :         if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
    1038             :                 /*
    1039             :                  * If the buffer is added to the LRU take a new reference to the
    1040             :                  * buffer for the LRU and clear the (now stale) dispose list
    1041             :                  * state flag
    1042             :                  */
    1043    38149895 :                 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
    1044    38153933 :                         bp->b_state &= ~XFS_BSTATE_DISPOSE;
    1045    38153933 :                         atomic_inc(&bp->b_hold);
    1046             :                 }
    1047    38153950 :                 spin_unlock(&pag->pag_buf_lock);
    1048             :         } else {
    1049             :                 /*
    1050             :                  * most of the time buffers will already be removed from the
    1051             :                  * LRU, so optimise that case by checking for the
    1052             :                  * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
    1053             :                  * was on was the disposal list
    1054             :                  */
    1055    56477006 :                 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
    1056    24429815 :                         list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
    1057             :                 } else {
    1058    32047191 :                         ASSERT(list_empty(&bp->b_lru));
    1059             :                 }
    1060             : 
    1061    56477842 :                 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1062    56477842 :                 rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
    1063             :                                        xfs_buf_hash_params);
    1064    56477544 :                 spin_unlock(&pag->pag_buf_lock);
    1065    56477703 :                 xfs_perag_put(pag);
    1066    56477703 :                 freebuf = true;
    1067             :         }
    1068             : 
    1069 31199844334 : out_unlock:
    1070 31199844334 :         spin_unlock(&bp->b_lock);
    1071             : 
    1072 31210558254 :         if (freebuf)
    1073    56477732 :                 xfs_buf_free(bp);
    1074             : }
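
The ordering requirement described above (take b_lock before dropping b_hold) is the general "serialise the last two releases" idiom. As a minimal sketch of that idiom, using C11 atomics and a pthread mutex in place of the kernel's spinlock and perag structures, with illustrative names that are not kernel code:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdlib.h>
	#include <time.h>

	struct obj {
		atomic_int	hold;		/* like bp->b_hold */
		time_t		last_put;	/* touched on non-final releases */
		pthread_mutex_t	lock;		/* stands in for bp->b_lock */
	};

	static void obj_rele(struct obj *o)
	{
		bool freeit = false;

		/*
		 * Take the lock before the decrement.  A releaser that is not
		 * dropping the final reference still inspects the object after
		 * its decrement (below); holding the lock across both steps
		 * stops the final releaser from freeing the object in between.
		 */
		pthread_mutex_lock(&o->lock);
		if (atomic_fetch_sub(&o->hold, 1) == 1) {
			freeit = true;		/* final reference dropped */
		} else {
			/* non-final release: bookkeeping that still touches o */
			o->last_put = time(NULL);
		}
		pthread_mutex_unlock(&o->lock);

		if (freeit) {
			pthread_mutex_destroy(&o->lock);
			free(o);
		}
	}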
    1075             : 
    1076             : 
    1077             : /*
    1078             :  *      Lock a buffer object, if it is not already locked.
    1079             :  *
    1080             :  *      If we come across a stale, pinned, locked buffer, we know that we are
    1081             :  *      being asked to lock a buffer that has been reallocated. Because it is
    1082             :  *      pinned, we know that the log has not been pushed to disk and hence it
    1083             :  *      will still be locked.  Rather than continuing to have trylock attempts
    1084             :  *      fail until someone else pushes the log, push it ourselves before
    1085             :  *      returning.  This means that the xfsaild will not get stuck trying
    1086             :  *      to push on stale inode buffers.
    1087             :  */
    1088             : int
    1089  4690918376 : xfs_buf_trylock(
    1090             :         struct xfs_buf          *bp)
    1091             : {
    1092  4690918376 :         int                     locked;
    1093             : 
    1094  4690918376 :         locked = down_trylock(&bp->b_sema) == 0;
    1095  4691281466 :         if (locked)
    1096  4570202432 :                 trace_xfs_buf_trylock(bp, _RET_IP_);
    1097             :         else
    1098   121079034 :                 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
    1099  4689315989 :         return locked;
    1100             : }
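
Callers that scan many buffers, such as the AIL push worker, normally use the trylock variant and simply skip contended buffers rather than sleeping. A minimal caller-side sketch (the helper name is hypothetical, not kernel code):

	static bool try_process_buffer(struct xfs_buf *bp)
	{
		if (!xfs_buf_trylock(bp))
			return false;	/* contended: revisit on a later pass */

		/* ... examine or dispatch the locked buffer here ... */

		xfs_buf_unlock(bp);
		return true;
	}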
    1101             : 
    1102             : /*
    1103             :  *      Lock a buffer object.
    1104             :  *
    1105             :  *      If we come across a stale, pinned, locked buffer, we know that we
    1106             :  *      are being asked to lock a buffer that has been reallocated. Because
    1107             :  *      it is pinned, we know that the log has not been pushed to disk and
    1108             :  *      hence it will still be locked. Rather than sleeping until someone
    1109             :  *      else pushes the log, push it ourselves before trying to get the lock.
    1110             :  */
    1111             : void
    1112 19422285852 : xfs_buf_lock(
    1113             :         struct xfs_buf          *bp)
    1114             : {
    1115 19422285852 :         trace_xfs_buf_lock(bp, _RET_IP_);
    1116             : 
    1117 19417440051 :         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
    1118       27747 :                 xfs_log_force(bp->b_mount, 0);
    1119 19417440051 :         down(&bp->b_sema);
    1120             : 
    1121 19421133151 :         trace_xfs_buf_lock_done(bp, _RET_IP_);
    1122 19405476376 : }
    1123             : 
    1124             : void
    1125 24045746525 : xfs_buf_unlock(
    1126             :         struct xfs_buf          *bp)
    1127             : {
    1128 24045746525 :         ASSERT(xfs_buf_islocked(bp));
    1129             : 
    1130 24045746525 :         up(&bp->b_sema);
    1131 24047880728 :         trace_xfs_buf_unlock(bp, _RET_IP_);
    1132 24026959672 : }
    1133             : 
    1134             : STATIC void
    1135    89901298 : xfs_buf_wait_unpin(
    1136             :         struct xfs_buf          *bp)
    1137             : {
    1138    89901298 :         DECLARE_WAITQUEUE       (wait, current);
    1139             : 
    1140    89901298 :         if (atomic_read(&bp->b_pin_count) == 0)
    1141    89894388 :                 return;
    1142             : 
    1143        6910 :         add_wait_queue(&bp->b_waiters, &wait);
    1144       20730 :         for (;;) {
    1145       13820 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1146       13820 :                 if (atomic_read(&bp->b_pin_count) == 0)
    1147             :                         break;
    1148        6910 :                 io_schedule();
    1149             :         }
    1150        6910 :         remove_wait_queue(&bp->b_waiters, &wait);
    1151        6910 :         set_current_state(TASK_RUNNING);
    1152             : }
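
Assuming b_waiters is an ordinary wait_queue_head_t, the loop above is the standard explicit prepare-to-wait pattern; aside from using io_schedule() so the sleep is accounted as I/O wait, it behaves broadly like the single wait_event() call sketched below (an illustration, not a proposed change):

	wait_event(bp->b_waiters, atomic_read(&bp->b_pin_count) == 0);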
    1153             : 
    1154             : static void
    1155       21920 : xfs_buf_ioerror_alert_ratelimited(
    1156             :         struct xfs_buf          *bp)
    1157             : {
    1158       21920 :         static unsigned long    lasttime;
    1159       21920 :         static struct xfs_buftarg *lasttarg;
    1160             : 
    1161       21920 :         if (bp->b_target != lasttarg ||
    1162       21833 :             time_after(jiffies, (lasttime + 5*HZ))) {
    1163          98 :                 lasttime = jiffies;
    1164          98 :                 xfs_buf_ioerror_alert(bp, __this_address);
    1165             :         }
    1166       21920 :         lasttarg = bp->b_target;
    1167       21920 : }
    1168             : 
    1169             : /*
    1170             :  * Account for this latest trip around the retry handler, and decide if
    1171             :  * we've failed enough times to constitute a permanent failure.
    1172             :  */
    1173             : static bool
    1174       21365 : xfs_buf_ioerror_permanent(
    1175             :         struct xfs_buf          *bp,
    1176             :         struct xfs_error_cfg    *cfg)
    1177             : {
    1178       21365 :         struct xfs_mount        *mp = bp->b_mount;
    1179             : 
    1180       21365 :         if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
    1181          54 :             ++bp->b_retries > cfg->max_retries)
    1182             :                 return true;
    1183       21311 :         if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1184           0 :             time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
    1185             :                 return true;
    1186             : 
    1187             :         /* At unmount we may treat errors differently */
    1188       42622 :         if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
    1189           6 :                 return true;
    1190             : 
    1191             :         return false;
    1192             : }
    1193             : 
    1194             : /*
    1195             :  * On a sync write or shutdown we just want to stale the buffer and let the
    1196             :  * caller handle the error in bp->b_error appropriately.
    1197             :  *
    1198             :  * If the write was asynchronous then no one will be looking for the error.  If
    1199             :  * this is the first failure of this type, clear the error state and write the
    1200             :  * buffer out again. This means we always retry an async write failure at least
    1201             :  * once, but we also need to set the buffer up to behave correctly now for
    1202             :  * repeated failures.
    1203             :  *
    1204             :  * If we get repeated async write failures, then we take action according to the
    1205             :  * error configuration we have been set up to use.
    1206             :  *
    1207             :  * Returns true if this function took care of error handling and the caller must
    1208             :  * not touch the buffer again.  Return false if the caller should proceed with
     1209             :  * not touch the buffer again.  Returns false if the caller should proceed with
    1210             :  */
    1211             : static bool
    1212     2962067 : xfs_buf_ioend_handle_error(
    1213             :         struct xfs_buf          *bp)
    1214             : {
    1215     2962067 :         struct xfs_mount        *mp = bp->b_mount;
    1216     2962067 :         struct xfs_error_cfg    *cfg;
    1217             : 
    1218             :         /*
    1219             :          * If we've already shutdown the journal because of I/O errors, there's
    1220             :          * no point in giving this a retry.
    1221             :          */
    1222     5924134 :         if (xlog_is_shutdown(mp->m_log))
    1223     2940147 :                 goto out_stale;
    1224             : 
    1225       21920 :         xfs_buf_ioerror_alert_ratelimited(bp);
    1226             : 
    1227             :         /*
     1228             :          * We're not going to bother retrying this during recovery.
    1229             :          * One strike!
    1230             :          */
    1231       21920 :         if (bp->b_flags & _XBF_LOGRECOVERY) {
    1232           0 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1233           0 :                 return false;
    1234             :         }
    1235             : 
    1236             :         /*
    1237             :          * Synchronous writes will have callers process the error.
    1238             :          */
    1239       21920 :         if (!(bp->b_flags & XBF_ASYNC))
    1240           0 :                 goto out_stale;
    1241             : 
    1242       21920 :         trace_xfs_buf_iodone_async(bp, _RET_IP_);
    1243             : 
    1244       21920 :         cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
    1245       21920 :         if (bp->b_last_error != bp->b_error ||
    1246       21365 :             !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
    1247         555 :                 bp->b_last_error = bp->b_error;
    1248         555 :                 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1249         415 :                     !bp->b_first_retry_time)
    1250         415 :                         bp->b_first_retry_time = jiffies;
    1251         555 :                 goto resubmit;
    1252             :         }
    1253             : 
    1254             :         /*
    1255             :          * Permanent error - we need to trigger a shutdown if we haven't already
    1256             :          * to indicate that inconsistency will result from this action.
    1257             :          */
    1258       21365 :         if (xfs_buf_ioerror_permanent(bp, cfg)) {
    1259          60 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1260          60 :                 goto out_stale;
    1261             :         }
    1262             : 
    1263             :         /* Still considered a transient error. Caller will schedule retries. */
    1264       21305 :         if (bp->b_flags & _XBF_INODES)
    1265         177 :                 xfs_buf_inode_io_fail(bp);
    1266       21128 :         else if (bp->b_flags & _XBF_DQUOTS)
    1267         108 :                 xfs_buf_dquot_io_fail(bp);
    1268             :         else
    1269       21020 :                 ASSERT(list_empty(&bp->b_li_list));
    1270       21305 :         xfs_buf_ioerror(bp, 0);
    1271       21305 :         xfs_buf_relse(bp);
    1272       21305 :         return true;
    1273             : 
    1274             : resubmit:
    1275         555 :         xfs_buf_ioerror(bp, 0);
    1276         555 :         bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
    1277         555 :         xfs_buf_submit(bp);
    1278         555 :         return true;
    1279     2940207 : out_stale:
    1280     2940207 :         xfs_buf_stale(bp);
    1281     2940207 :         bp->b_flags |= XBF_DONE;
    1282     2940207 :         bp->b_flags &= ~XBF_WRITE;
    1283     2940207 :         trace_xfs_buf_error_relse(bp, _RET_IP_);
    1284     2940207 :         return false;
    1285             : }
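
Reduced to its outcomes, the policy above has three branches for an async write failure: resubmit immediately on the first failure, leave the buffer for caller-scheduled retries while the error is still considered transient, and shut down once the failure is judged permanent. A compressed sketch of that decision, with invented names used purely for illustration:

	enum write_failure_action {
		WF_RESUBMIT_NOW,	/* first failure: clear error, write again */
		WF_RETRY_LATER,		/* transient: leave for the next retry pass */
		WF_GIVE_UP,		/* permanent: shut down, stale the buffer */
	};

	static enum write_failure_action
	classify_async_write_failure(bool first_failure, bool permanent)
	{
		if (first_failure)
			return WF_RESUBMIT_NOW;
		if (permanent)
			return WF_GIVE_UP;
		return WF_RETRY_LATER;
	}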
    1286             : 
    1287             : static void
    1288   122611076 : xfs_buf_ioend(
    1289             :         struct xfs_buf  *bp)
    1290             : {
    1291   122611076 :         trace_xfs_buf_iodone(bp, _RET_IP_);
    1292             : 
    1293             :         /*
    1294             :          * Pull in IO completion errors now. We are guaranteed to be running
    1295             :          * single threaded, so we don't need the lock to read b_io_error.
    1296             :          */
    1297   122611077 :         if (!bp->b_error && bp->b_io_error)
    1298       74283 :                 xfs_buf_ioerror(bp, bp->b_io_error);
    1299             : 
    1300   122611077 :         if (bp->b_flags & XBF_READ) {
    1301    29770020 :                 if (!bp->b_error && bp->b_ops)
    1302    26968415 :                         bp->b_ops->verify_read(bp);
    1303    29770020 :                 if (!bp->b_error)
    1304    29695345 :                         bp->b_flags |= XBF_DONE;
    1305             :         } else {
    1306    92841057 :                 if (!bp->b_error) {
    1307    89878991 :                         bp->b_flags &= ~XBF_WRITE_FAIL;
    1308    89878991 :                         bp->b_flags |= XBF_DONE;
    1309             :                 }
    1310             : 
    1311    92841057 :                 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
    1312             :                         return;
    1313             : 
    1314             :                 /* clear the retry state */
    1315    92819197 :                 bp->b_last_error = 0;
    1316    92819197 :                 bp->b_retries = 0;
    1317    92819197 :                 bp->b_first_retry_time = 0;
    1318             : 
    1319             :                 /*
    1320             :                  * Note that for things like remote attribute buffers, there may
    1321             :                  * not be a buffer log item here, so processing the buffer log
    1322             :                  * item must remain optional.
    1323             :                  */
    1324    92819197 :                 if (bp->b_log_item)
    1325    52515915 :                         xfs_buf_item_done(bp);
    1326             : 
    1327    92819198 :                 if (bp->b_flags & _XBF_INODES)
    1328    24100458 :                         xfs_buf_inode_iodone(bp);
    1329    68718740 :                 else if (bp->b_flags & _XBF_DQUOTS)
    1330    10263687 :                         xfs_buf_dquot_iodone(bp);
    1331             : 
    1332             :         }
    1333             : 
    1334   122589218 :         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
    1335             :                          _XBF_LOGRECOVERY);
    1336             : 
    1337   122589218 :         if (bp->b_flags & XBF_ASYNC)
    1338    86416329 :                 xfs_buf_relse(bp);
    1339             :         else
    1340    36172889 :                 complete(&bp->b_iowait);
    1341             : }
    1342             : 
    1343             : static void
    1344   119644199 : xfs_buf_ioend_work(
    1345             :         struct work_struct      *work)
    1346             : {
    1347   119644199 :         struct xfs_buf          *bp =
    1348   119644199 :                 container_of(work, struct xfs_buf, b_ioend_work);
    1349             : 
    1350   119644199 :         xfs_buf_ioend(bp);
    1351   119644198 : }
    1352             : 
    1353             : static void
    1354   119644198 : xfs_buf_ioend_async(
    1355             :         struct xfs_buf  *bp)
    1356             : {
    1357   119644198 :         INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
    1358   119644198 :         queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
    1359   119644199 : }
    1360             : 
    1361             : void
    1362    47876782 : __xfs_buf_ioerror(
    1363             :         struct xfs_buf          *bp,
    1364             :         int                     error,
    1365             :         xfs_failaddr_t          failaddr)
    1366             : {
    1367    47876782 :         ASSERT(error <= 0 && error >= -1000);
    1368    47876782 :         bp->b_error = error;
    1369    47876782 :         trace_xfs_buf_ioerror(bp, error, failaddr);
    1370    47800921 : }
    1371             : 
    1372             : void
    1373       19474 : xfs_buf_ioerror_alert(
    1374             :         struct xfs_buf          *bp,
    1375             :         xfs_failaddr_t          func)
    1376             : {
    1377       19474 :         xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
    1378             :                 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
    1379             :                                   func, (uint64_t)xfs_buf_daddr(bp),
    1380       19474 :                                   bp->b_length, -bp->b_error);
    1381       19497 : }
    1382             : 
    1383             : /*
    1384             :  * To simulate an I/O failure, the buffer must be locked and held with at least
    1385             :  * three references. The LRU reference is dropped by the stale call. The buf
    1386             :  * item reference is dropped via ioend processing. The third reference is owned
    1387             :  * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
    1388             :  */
    1389             : void
    1390     2954357 : xfs_buf_ioend_fail(
    1391             :         struct xfs_buf  *bp)
    1392             : {
    1393     2954357 :         bp->b_flags &= ~XBF_DONE;
    1394     2954357 :         xfs_buf_stale(bp);
    1395     2954357 :         xfs_buf_ioerror(bp, -EIO);
    1396     2954357 :         xfs_buf_ioend(bp);
    1397     2954357 : }
    1398             : 
    1399             : int
    1400        4445 : xfs_bwrite(
    1401             :         struct xfs_buf          *bp)
    1402             : {
    1403        4445 :         int                     error;
    1404             : 
    1405        4445 :         ASSERT(xfs_buf_islocked(bp));
    1406             : 
    1407        4445 :         bp->b_flags |= XBF_WRITE;
    1408        4445 :         bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
    1409             :                          XBF_DONE);
    1410             : 
    1411        4445 :         error = xfs_buf_submit(bp);
    1412        4445 :         if (error)
    1413           0 :                 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
    1414        4445 :         return error;
    1415             : }
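
A typical synchronous write from a caller's point of view: the buffer must be locked before the call, and on return it is still locked and still referenced, so the caller drops both with xfs_buf_relse(). A hedged sketch, not taken from the kernel tree:

	static int write_buffer_sync(struct xfs_buf *bp)
	{
		int	error;

		xfs_buf_lock(bp);
		/* ... update the buffer contents here ... */
		error = xfs_bwrite(bp);	/* submits and waits for completion */
		xfs_buf_relse(bp);	/* unlock and drop our reference */
		return error;
	}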
    1416             : 
    1417             : static void
    1418   119657355 : xfs_buf_bio_end_io(
    1419             :         struct bio              *bio)
    1420             : {
    1421   119657355 :         struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
    1422             : 
    1423   119657355 :         if (!bio->bi_status &&
    1424   183518994 :             (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
    1425    63935922 :             XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
    1426           0 :                 bio->bi_status = BLK_STS_IOERR;
    1427             : 
    1428             :         /*
    1429             :          * don't overwrite existing errors - otherwise we can lose errors on
    1430             :          * buffers that require multiple bios to complete.
    1431             :          */
    1432   119657355 :         if (bio->bi_status) {
    1433       74283 :                 int error = blk_status_to_errno(bio->bi_status);
    1434             : 
    1435       74283 :                 cmpxchg(&bp->b_io_error, 0, error);
    1436             :         }
    1437             : 
    1438   119657355 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1439             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1440             : 
    1441   119657355 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1442   119596461 :                 xfs_buf_ioend_async(bp);
    1443   119657356 :         bio_put(bio);
    1444   119657354 : }
    1445             : 
    1446             : static void
    1447   119646456 : xfs_buf_ioapply_map(
    1448             :         struct xfs_buf  *bp,
    1449             :         int             map,
    1450             :         int             *buf_offset,
    1451             :         int             *count,
    1452             :         blk_opf_t       op)
    1453             : {
    1454   119646456 :         int             page_index;
    1455   119646456 :         unsigned int    total_nr_pages = bp->b_page_count;
    1456   119646456 :         int             nr_pages;
    1457   119646456 :         struct bio      *bio;
    1458   119646456 :         sector_t        sector =  bp->b_maps[map].bm_bn;
    1459   119646456 :         int             size;
    1460   119646456 :         int             offset;
    1461             : 
    1462             :         /* skip the pages in the buffer before the start offset */
    1463   119646456 :         page_index = 0;
    1464   119646456 :         offset = *buf_offset;
    1465   119649322 :         while (offset >= PAGE_SIZE) {
    1466        2866 :                 page_index++;
    1467        2866 :                 offset -= PAGE_SIZE;
    1468             :         }
    1469             : 
    1470             :         /*
    1471             :          * Limit the IO size to the length of the current vector, and update the
    1472             :          * remaining IO count for the next time around.
    1473             :          */
    1474   119646456 :         size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
    1475   119646456 :         *count -= size;
    1476   119646456 :         *buf_offset += size;
    1477             : 
    1478   119646456 : next_chunk:
    1479   119646456 :         atomic_inc(&bp->b_io_remaining);
    1480   119652946 :         nr_pages = bio_max_segs(total_nr_pages);
    1481             : 
    1482   119652946 :         bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
    1483   119651458 :         bio->bi_iter.bi_sector = sector;
    1484   119651458 :         bio->bi_end_io = xfs_buf_bio_end_io;
    1485   119651458 :         bio->bi_private = bp;
    1486             : 
    1487   360122796 :         for (; size && nr_pages; nr_pages--, page_index++) {
    1488   240478423 :                 int     rbytes, nbytes = PAGE_SIZE - offset;
    1489             : 
    1490   240478423 :                 if (nbytes > size)
    1491             :                         nbytes = size;
    1492             : 
    1493   240478423 :                 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
    1494             :                                       offset);
    1495   240471338 :                 if (rbytes < nbytes)
    1496             :                         break;
    1497             : 
    1498   240471338 :                 offset = 0;
    1499   240471338 :                 sector += BTOBB(nbytes);
    1500   240471338 :                 size -= nbytes;
    1501   240471338 :                 total_nr_pages--;
    1502             :         }
    1503             : 
    1504   119644373 :         if (likely(bio->bi_iter.bi_size)) {
    1505   119644373 :                 if (xfs_buf_is_vmapped(bp)) {
    1506             :                         flush_kernel_vmap_range(bp->b_addr,
    1507             :                                                 xfs_buf_vmap_len(bp));
    1508             :                 }
    1509   119644373 :                 submit_bio(bio);
    1510   119643400 :                 if (size)
    1511           0 :                         goto next_chunk;
    1512             :         } else {
    1513             :                 /*
    1514             :                  * This is guaranteed not to be the last io reference count
    1515             :                  * because the caller (xfs_buf_submit) holds a count itself.
    1516             :                  */
    1517           0 :                 atomic_dec(&bp->b_io_remaining);
    1518           0 :                 xfs_buf_ioerror(bp, -EIO);
    1519           0 :                 bio_put(bio);
    1520             :         }
    1521             : 
    1522   119643400 : }
    1523             : 
    1524             : STATIC void
    1525   119649122 : _xfs_buf_ioapply(
    1526             :         struct xfs_buf  *bp)
    1527             : {
    1528   119649122 :         struct blk_plug plug;
    1529   119649122 :         blk_opf_t       op;
    1530   119649122 :         int             offset;
    1531   119649122 :         int             size;
    1532   119649122 :         int             i;
    1533             : 
    1534             :         /*
    1535             :          * Make sure we capture only current IO errors rather than stale errors
    1536             :          * left over from previous use of the buffer (e.g. failed readahead).
    1537             :          */
    1538   119649122 :         bp->b_error = 0;
    1539             : 
    1540   119649122 :         if (bp->b_flags & XBF_WRITE) {
    1541    89901297 :                 op = REQ_OP_WRITE;
    1542             : 
    1543             :                 /*
    1544             :                  * Run the write verifier callback function if it exists. If
    1545             :                  * this function fails it will mark the buffer with an error and
    1546             :                  * the IO should not be dispatched.
    1547             :                  */
    1548    89901297 :                 if (bp->b_ops) {
    1549    89901297 :                         bp->b_ops->verify_write(bp);
    1550    89901298 :                         if (bp->b_error) {
    1551          22 :                                 xfs_force_shutdown(bp->b_mount,
    1552             :                                                    SHUTDOWN_CORRUPT_INCORE);
    1553          22 :                                 return;
    1554             :                         }
    1555           0 :                 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
    1556           0 :                         struct xfs_mount *mp = bp->b_mount;
    1557             : 
    1558             :                         /*
    1559             :                          * non-crc filesystems don't attach verifiers during
    1560             :                          * log recovery, so don't warn for such filesystems.
    1561             :                          */
    1562           0 :                         if (xfs_has_crc(mp)) {
    1563           0 :                                 xfs_warn(mp,
    1564             :                                         "%s: no buf ops on daddr 0x%llx len %d",
    1565             :                                         __func__, xfs_buf_daddr(bp),
    1566             :                                         bp->b_length);
    1567           0 :                                 xfs_hex_dump(bp->b_addr,
    1568             :                                                 XFS_CORRUPTION_DUMP_LEN);
    1569           0 :                                 dump_stack();
    1570             :                         }
    1571             :                 }
    1572             :         } else {
    1573    29747825 :                 op = REQ_OP_READ;
    1574    29747825 :                 if (bp->b_flags & XBF_READ_AHEAD)
    1575    19540096 :                         op |= REQ_RAHEAD;
    1576             :         }
    1577             : 
    1578             :         /* we only use the buffer cache for meta-data */
    1579   119649101 :         op |= REQ_META;
    1580             : 
    1581             :         /*
    1582             :          * Walk all the vectors issuing IO on them. Set up the initial offset
    1583             :          * into the buffer and the desired IO size before we start -
     1584             :          * xfs_buf_ioapply_map() will modify them appropriately for each
    1585             :          * subsequent call.
    1586             :          */
    1587   119649101 :         offset = bp->b_offset;
    1588   119649101 :         size = BBTOB(bp->b_length);
    1589   119649101 :         blk_start_plug(&plug);
    1590   239293960 :         for (i = 0; i < bp->b_map_count; i++) {
    1591   119644859 :                 xfs_buf_ioapply_map(bp, i, &offset, &size, op);
    1592   119637745 :                 if (bp->b_error)
    1593             :                         break;
    1594   119637745 :                 if (size <= 0)
    1595             :                         break;  /* all done */
    1596             :         }
    1597   119637091 :         blk_finish_plug(&plug);
    1598             : }
    1599             : 
    1600             : /*
    1601             :  * Wait for I/O completion of a sync buffer and return the I/O error code.
    1602             :  */
    1603             : static int
    1604    36156488 : xfs_buf_iowait(
    1605             :         struct xfs_buf  *bp)
    1606             : {
    1607    36156488 :         ASSERT(!(bp->b_flags & XBF_ASYNC));
    1608             : 
    1609    36156488 :         trace_xfs_buf_iowait(bp, _RET_IP_);
    1610    36154540 :         wait_for_completion(&bp->b_iowait);
    1611    36156272 :         trace_xfs_buf_iowait_done(bp, _RET_IP_);
    1612             : 
    1613    36154246 :         return bp->b_error;
    1614             : }
    1615             : 
    1616             : /*
    1617             :  * Buffer I/O submission path, read or write. Asynchronous submission transfers
    1618             :  * the buffer lock ownership and the current reference to the IO. It is not
    1619             :  * safe to reference the buffer after a call to this function unless the caller
    1620             :  * holds an additional reference itself.
    1621             :  */
    1622             : static int
    1623   121463468 : __xfs_buf_submit(
    1624             :         struct xfs_buf  *bp,
    1625             :         bool            wait)
    1626             : {
    1627   121463468 :         int             error = 0;
    1628             : 
    1629   121463468 :         trace_xfs_buf_submit(bp, _RET_IP_);
    1630             : 
    1631   121459624 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1632             : 
    1633             :         /*
    1634             :          * On log shutdown we stale and complete the buffer immediately. We can
    1635             :          * be called to read the superblock before the log has been set up, so
    1636             :          * be careful checking the log state.
    1637             :          *
    1638             :          * Checking the mount shutdown state here can result in the log tail
    1639             :          * moving inappropriately on disk as the log may not yet be shut down.
    1640             :          * i.e. failing this buffer on mount shutdown can remove it from the AIL
    1641             :          * and move the tail of the log forwards without having written this
    1642             :          * buffer to disk. This corrupts the log tail state in memory, and
    1643             :          * because the log may not be shut down yet, it can then be propagated
    1644             :          * to disk before the log is shutdown. Hence we check log shutdown
    1645             :          * state here rather than mount state to avoid corrupting the log tail
    1646             :          * on shutdown.
    1647             :          */
    1648   242705689 :         if (bp->b_mount->m_log &&
    1649             :             xlog_is_shutdown(bp->b_mount->m_log)) {
    1650     1814997 :                 xfs_buf_ioend_fail(bp);
    1651     1814997 :                 return -EIO;
    1652             :         }
    1653             : 
    1654             :         /*
    1655             :          * Grab a reference so the buffer does not go away underneath us. For
    1656             :          * async buffers, I/O completion drops the callers reference, which
    1657             :          * could occur before submission returns.
    1658             :          */
    1659   119644627 :         xfs_buf_hold(bp);
    1660             : 
    1661   119652887 :         if (bp->b_flags & XBF_WRITE)
    1662    89901297 :                 xfs_buf_wait_unpin(bp);
    1663             : 
    1664             :         /* clear the internal error state to avoid spurious errors */
    1665   119652888 :         bp->b_io_error = 0;
    1666             : 
    1667             :         /*
     1668             :          * Set the count to 1 initially; this stops an I/O completion
     1669             :          * callout that happens before we have started all the I/O from calling
    1670             :          * xfs_buf_ioend too early.
    1671             :          */
    1672   119652888 :         atomic_set(&bp->b_io_remaining, 1);
    1673   119652888 :         if (bp->b_flags & XBF_ASYNC)
    1674    83498017 :                 xfs_buf_ioacct_inc(bp);
    1675   119652892 :         _xfs_buf_ioapply(bp);
    1676             : 
    1677             :         /*
    1678             :          * If _xfs_buf_ioapply failed, we can get back here with only the IO
    1679             :          * reference we took above. If we drop it to zero, run completion so
    1680             :          * that we don't return to the caller with completion still pending.
    1681             :          */
    1682   119653925 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
    1683       60262 :                 if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
    1684       12524 :                         xfs_buf_ioend(bp);
    1685             :                 else
    1686       47738 :                         xfs_buf_ioend_async(bp);
    1687             :         }
    1688             : 
    1689   119656126 :         if (wait)
    1690    10218175 :                 error = xfs_buf_iowait(bp);
    1691             : 
    1692             :         /*
    1693             :          * Release the hold that keeps the buffer referenced for the entire
    1694             :          * I/O. Note that if the buffer is async, it is not safe to reference
    1695             :          * after this release.
    1696             :          */
    1697   119653713 :         xfs_buf_rele(bp);
    1698   119653713 :         return error;
    1699             : }
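
The b_io_remaining handling in __xfs_buf_submit() and xfs_buf_bio_end_io() is the usual completion-counter bias: start the counter at 1 before issuing any bios, decrement once per finished bio, and have the submitter drop its own unit last so completion cannot run while chunks are still being issued. A reduced, self-contained sketch with C11 atomics (all names invented for illustration):

	#include <stdatomic.h>

	struct io_ctx {
		atomic_int	remaining;	/* like bp->b_io_remaining */
	};

	static void io_complete(struct io_ctx *ctx)
	{
		/* runs exactly once, after the last outstanding chunk */
	}

	static void chunk_done(struct io_ctx *ctx)
	{
		if (atomic_fetch_sub(&ctx->remaining, 1) == 1)
			io_complete(ctx);
	}

	static void submit_all(struct io_ctx *ctx, int nr_chunks)
	{
		int	i;

		atomic_store(&ctx->remaining, 1);	/* submitter's bias */
		for (i = 0; i < nr_chunks; i++) {
			atomic_fetch_add(&ctx->remaining, 1);
			/* issue chunk i; its completion calls chunk_done() */
		}
		chunk_done(ctx);	/* drop the bias, maybe complete now */
	}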
    1700             : 
    1701             : void *
    1702 20382121975 : xfs_buf_offset(
    1703             :         struct xfs_buf          *bp,
    1704             :         size_t                  offset)
    1705             : {
    1706 20382121975 :         struct page             *page;
    1707             : 
    1708 20382121975 :         if (bp->b_addr)
    1709 16109283921 :                 return bp->b_addr + offset;
    1710             : 
    1711  4272838054 :         page = bp->b_pages[offset >> PAGE_SHIFT];
    1712  4272838054 :         return page_address(page) + (offset & (PAGE_SIZE-1));
    1713             : }
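
For the unmapped case, a quick worked example of the arithmetic with 4 KiB pages: an offset of 9000 bytes gives page index 9000 >> PAGE_SHIFT = 2 and in-page offset 9000 & (PAGE_SIZE - 1) = 808, so the returned address is 808 bytes into the third backing page.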
    1714             : 
    1715             : void
    1716     1957084 : xfs_buf_zero(
    1717             :         struct xfs_buf          *bp,
    1718             :         size_t                  boff,
    1719             :         size_t                  bsize)
    1720             : {
    1721     1957084 :         size_t                  bend;
    1722             : 
    1723     1957084 :         bend = boff + bsize;
    1724     8613975 :         while (boff < bend) {
    1725     6655705 :                 struct page     *page;
    1726     6655705 :                 int             page_index, page_offset, csize;
    1727             : 
    1728     6655705 :                 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
    1729     6655705 :                 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
    1730     6655705 :                 page = bp->b_pages[page_index];
    1731     6655705 :                 csize = min_t(size_t, PAGE_SIZE - page_offset,
    1732             :                                       BBTOB(bp->b_length) - boff);
    1733             : 
    1734     6655705 :                 ASSERT((csize + page_offset) <= PAGE_SIZE);
    1735             : 
    1736     6656891 :                 memset(page_address(page) + page_offset, 0, csize);
    1737             : 
    1738     6656891 :                 boff += csize;
    1739             :         }
    1740     1958270 : }
    1741             : 
    1742             : /*
    1743             :  * Log a message about and stale a buffer that a caller has decided is corrupt.
    1744             :  *
    1745             :  * This function should be called for the kinds of metadata corruption that
     1746             :  * cannot be detected by a verifier, such as incorrect inter-block relationship
    1747             :  * data.  Do /not/ call this function from a verifier function.
    1748             :  *
    1749             :  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
    1750             :  * be marked stale, but b_error will not be set.  The caller is responsible for
    1751             :  * releasing the buffer or fixing it.
    1752             :  */
    1753             : void
    1754           0 : __xfs_buf_mark_corrupt(
    1755             :         struct xfs_buf          *bp,
    1756             :         xfs_failaddr_t          fa)
    1757             : {
    1758           0 :         ASSERT(bp->b_flags & XBF_DONE);
    1759             : 
    1760           0 :         xfs_buf_corruption_error(bp, fa);
    1761           0 :         xfs_buf_stale(bp);
    1762           0 : }
    1763             : 
    1764             : /*
    1765             :  *      Handling of buffer targets (buftargs).
    1766             :  */
    1767             : 
    1768             : /*
    1769             :  * Wait for any bufs with callbacks that have been submitted but have not yet
    1770             :  * returned. These buffers will have an elevated hold count, so wait on those
    1771             :  * while freeing all the buffers only held by the LRU.
    1772             :  */
    1773             : static enum lru_status
    1774    31267399 : xfs_buftarg_drain_rele(
    1775             :         struct list_head        *item,
    1776             :         struct list_lru_one     *lru,
    1777             :         spinlock_t              *lru_lock,
    1778             :         void                    *arg)
    1779             : 
    1780             : {
    1781    31267399 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1782    31267399 :         struct list_head        *dispose = arg;
    1783             : 
    1784    31267399 :         if (atomic_read(&bp->b_hold) > 1) {
    1785             :                 /* need to wait, so skip it this pass */
    1786           2 :                 trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
    1787           2 :                 return LRU_SKIP;
    1788             :         }
    1789    31267397 :         if (!spin_trylock(&bp->b_lock))
    1790             :                 return LRU_SKIP;
    1791             : 
    1792             :         /*
    1793             :          * clear the LRU reference count so the buffer doesn't get
    1794             :          * ignored in xfs_buf_rele().
    1795             :          */
    1796    31267397 :         atomic_set(&bp->b_lru_ref, 0);
    1797    31267397 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1798    31267397 :         list_lru_isolate_move(lru, item, dispose);
    1799    31267397 :         spin_unlock(&bp->b_lock);
    1800    31267397 :         return LRU_REMOVED;
    1801             : }
    1802             : 
    1803             : /*
    1804             :  * Wait for outstanding I/O on the buftarg to complete.
    1805             :  */
    1806             : void
    1807      243076 : xfs_buftarg_wait(
    1808             :         struct xfs_buftarg      *btp)
    1809             : {
    1810             :         /*
    1811             :          * First wait on the buftarg I/O count for all in-flight buffers to be
    1812             :          * released. This is critical as new buffers do not make the LRU until
    1813             :          * they are released.
    1814             :          *
    1815             :          * Next, flush the buffer workqueue to ensure all completion processing
    1816             :          * has finished. Just waiting on buffer locks is not sufficient for
    1817             :          * async IO as the reference count held over IO is not released until
    1818             :          * after the buffer lock is dropped. Hence we need to ensure here that
    1819             :          * all reference counts have been dropped before we start walking the
    1820             :          * LRU list.
    1821             :          */
    1822      246692 :         while (percpu_counter_sum(&btp->bt_io_count))
    1823        3616 :                 delay(100);
    1824      243076 :         flush_workqueue(btp->bt_mount->m_buf_workqueue);
    1825      243076 : }
    1826             : 
    1827             : void
    1828      118655 : xfs_buftarg_drain(
    1829             :         struct xfs_buftarg      *btp)
    1830             : {
    1831      118655 :         LIST_HEAD(dispose);
    1832      118655 :         int                     loop = 0;
    1833      118655 :         bool                    write_fail = false;
    1834             : 
    1835      118655 :         xfs_buftarg_wait(btp);
    1836             : 
    1837             :         /* loop until there is nothing left on the lru list. */
    1838      234581 :         while (list_lru_count(&btp->bt_lru)) {
    1839      115926 :                 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
    1840             :                               &dispose, LONG_MAX);
    1841             : 
    1842    31383323 :                 while (!list_empty(&dispose)) {
    1843    31267397 :                         struct xfs_buf *bp;
    1844    31267397 :                         bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1845    31267397 :                         list_del_init(&bp->b_lru);
    1846    31267397 :                         if (bp->b_flags & XBF_WRITE_FAIL) {
    1847           0 :                                 write_fail = true;
    1848           0 :                                 xfs_buf_alert_ratelimited(bp,
    1849             :                                         "XFS: Corruption Alert",
    1850             : "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
    1851             :                                         (long long)xfs_buf_daddr(bp));
    1852             :                         }
    1853    31267397 :                         xfs_buf_rele(bp);
    1854             :                 }
    1855      115926 :                 if (loop++ != 0)
    1856           1 :                         delay(100);
    1857             :         }
    1858             : 
    1859             :         /*
    1860             :          * If one or more failed buffers were freed, that means dirty metadata
    1861             :          * was thrown away. This should only ever happen after I/O completion
    1862             :          * handling has elevated I/O error(s) to permanent failures and shuts
    1863             :          * down the journal.
    1864             :          */
    1865      118655 :         if (write_fail) {
    1866           0 :                 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
    1867           0 :                 xfs_alert(btp->bt_mount,
    1868             :               "Please run xfs_repair to determine the extent of the problem.");
    1869             :         }
    1870      118655 : }
    1871             : 
    1872             : static enum lru_status
    1873     2386841 : xfs_buftarg_isolate(
    1874             :         struct list_head        *item,
    1875             :         struct list_lru_one     *lru,
    1876             :         spinlock_t              *lru_lock,
    1877             :         void                    *arg)
    1878             : {
    1879     2386841 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1880     2386841 :         struct list_head        *dispose = arg;
    1881             : 
    1882             :         /*
    1883             :          * we are inverting the lru lock/bp->b_lock here, so use a trylock.
    1884             :          * If we fail to get the lock, just skip it.
    1885             :          */
    1886     2386841 :         if (!spin_trylock(&bp->b_lock))
    1887             :                 return LRU_SKIP;
    1888             :         /*
    1889             :          * Decrement the b_lru_ref count unless the value is already
    1890             :          * zero. If the value is already zero, we need to reclaim the
    1891             :          * buffer, otherwise it gets another trip through the LRU.
    1892             :          */
    1893     4773680 :         if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
    1894     1604625 :                 spin_unlock(&bp->b_lock);
    1895     1604625 :                 return LRU_ROTATE;
    1896             :         }
    1897             : 
    1898      782215 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1899      782215 :         list_lru_isolate_move(lru, item, dispose);
    1900      782215 :         spin_unlock(&bp->b_lock);
    1901      782215 :         return LRU_REMOVED;
    1902             : }
    1903             : 
    1904             : static unsigned long
    1905       20694 : xfs_buftarg_shrink_scan(
    1906             :         struct shrinker         *shrink,
    1907             :         struct shrink_control   *sc)
    1908             : {
    1909       20694 :         struct xfs_buftarg      *btp = container_of(shrink,
    1910             :                                         struct xfs_buftarg, bt_shrinker);
    1911       20694 :         LIST_HEAD(dispose);
    1912       20694 :         unsigned long           freed;
    1913             : 
    1914       20694 :         freed = list_lru_shrink_walk(&btp->bt_lru, sc,
    1915             :                                      xfs_buftarg_isolate, &dispose);
    1916             : 
    1917      802909 :         while (!list_empty(&dispose)) {
    1918      782215 :                 struct xfs_buf *bp;
    1919      782215 :                 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1920      782215 :                 list_del_init(&bp->b_lru);
    1921      782215 :                 xfs_buf_rele(bp);
    1922             :         }
    1923             : 
    1924       20694 :         return freed;
    1925             : }
    1926             : 
    1927             : static unsigned long
    1928       12802 : xfs_buftarg_shrink_count(
    1929             :         struct shrinker         *shrink,
    1930             :         struct shrink_control   *sc)
    1931             : {
    1932       12802 :         struct xfs_buftarg      *btp = container_of(shrink,
    1933             :                                         struct xfs_buftarg, bt_shrinker);
    1934       12802 :         return list_lru_shrink_count(&btp->bt_lru, sc);
    1935             : }
    1936             : 
    1937             : void
    1938       94230 : xfs_free_buftarg(
    1939             :         struct xfs_buftarg      *btp)
    1940             : {
    1941       94230 :         unregister_shrinker(&btp->bt_shrinker);
    1942       94230 :         ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
    1943       94230 :         percpu_counter_destroy(&btp->bt_io_count);
    1944       94230 :         list_lru_destroy(&btp->bt_lru);
    1945             : 
    1946       94230 :         blkdev_issue_flush(btp->bt_bdev);
    1947       94230 :         invalidate_bdev(btp->bt_bdev);
    1948       94230 :         fs_put_dax(btp->bt_daxdev, btp->bt_mount);
    1949             : 
    1950       94230 :         kmem_free(btp);
    1951       94230 : }
    1952             : 
    1953             : int
    1954      187732 : xfs_setsize_buftarg(
    1955             :         xfs_buftarg_t           *btp,
    1956             :         unsigned int            sectorsize)
    1957             : {
    1958             :         /* Set up metadata sector size info */
    1959      187732 :         btp->bt_meta_sectorsize = sectorsize;
    1960      187732 :         btp->bt_meta_sectormask = sectorsize - 1;
    1961             : 
    1962      187732 :         if (set_blocksize(btp->bt_bdev, sectorsize)) {
    1963           0 :                 xfs_warn(btp->bt_mount,
    1964             :                         "Cannot set_blocksize to %u on device %pg",
    1965             :                         sectorsize, btp->bt_bdev);
    1966           0 :                 return -EINVAL;
    1967             :         }
    1968             : 
    1969             :         /* Set up device logical sector size mask */
    1970      187732 :         btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
    1971      187732 :         btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
    1972             : 
    1973      187732 :         return 0;
    1974             : }
    1975             : 
    1976             : /*
    1977             :  * When allocating the initial buffer target we have not yet
    1978             :  * read in the superblock, so don't know what sized sectors
    1979             :  * are being used at this early stage.  Play safe.
    1980             :  */
    1981             : STATIC int
    1982       94220 : xfs_setsize_buftarg_early(
    1983             :         xfs_buftarg_t           *btp,
    1984             :         struct block_device     *bdev)
    1985             : {
    1986      188440 :         return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
    1987             : }
    1988             : 
    1989             : struct xfs_buftarg *
    1990       94220 : xfs_alloc_buftarg(
    1991             :         struct xfs_mount        *mp,
    1992             :         struct block_device     *bdev)
    1993             : {
    1994       94220 :         xfs_buftarg_t           *btp;
    1995       94220 :         const struct dax_holder_operations *ops = NULL;
    1996             : 
    1997             : #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
    1998       94220 :         ops = &xfs_dax_holder_operations;
    1999             : #endif
    2000       94220 :         btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
    2001             : 
    2002       94220 :         btp->bt_mount = mp;
    2003       94220 :         btp->bt_dev =  bdev->bd_dev;
    2004       94220 :         btp->bt_bdev = bdev;
    2005       94220 :         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
    2006             :                                             mp, ops);
    2007             : 
    2008             :         /*
    2009             :          * Buffer IO error rate limiting. Limit it to no more than 10 messages
    2010             :          * per 30 seconds so as to not spam logs too much on repeated errors.
    2011             :          */
    2012       94220 :         ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
    2013             :                              DEFAULT_RATELIMIT_BURST);
    2014             : 
    2015       94220 :         if (xfs_setsize_buftarg_early(btp, bdev))
    2016           0 :                 goto error_free;
    2017             : 
    2018       94220 :         if (list_lru_init(&btp->bt_lru))
    2019           0 :                 goto error_free;
    2020             : 
    2021       94220 :         if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
    2022           0 :                 goto error_lru;
    2023             : 
    2024       94220 :         btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
    2025       94220 :         btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
    2026       94220 :         btp->bt_shrinker.seeks = DEFAULT_SEEKS;
    2027       94220 :         btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
    2028       94220 :         if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
    2029       94220 :                               mp->m_super->s_id))
    2030           0 :                 goto error_pcpu;
    2031             :         return btp;
    2032             : 
    2033             : error_pcpu:
    2034           0 :         percpu_counter_destroy(&btp->bt_io_count);
    2035           0 : error_lru:
    2036           0 :         list_lru_destroy(&btp->bt_lru);
    2037           0 : error_free:
    2038           0 :         kmem_free(btp);
    2039           0 :         return NULL;
    2040             : }
    2041             : 
    2042             : /*
    2043             :  * Cancel a delayed write list.
    2044             :  *
    2045             :  * Remove each buffer from the list, clear the delwri queue flag and drop the
    2046             :  * associated buffer reference.
    2047             :  */
    2048             : void
    2049       66350 : xfs_buf_delwri_cancel(
    2050             :         struct list_head        *list)
    2051             : {
    2052       66350 :         struct xfs_buf          *bp;
    2053             : 
    2054       66350 :         while (!list_empty(list)) {
    2055           0 :                 bp = list_first_entry(list, struct xfs_buf, b_list);
    2056             : 
    2057           0 :                 xfs_buf_lock(bp);
    2058           0 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2059           0 :                 list_del_init(&bp->b_list);
    2060           0 :                 xfs_buf_relse(bp);
    2061             :         }
    2062       66350 : }
    2063             : 
    2064             : /*
    2065             :  * Add a buffer to the delayed write list.
    2066             :  *
     2067             :  * This queues a buffer for writeout if it hasn't already been queued.  Note that
    2068             :  * neither this routine nor the buffer list submission functions perform
    2069             :  * any internal synchronization.  It is expected that the lists are thread-local
    2070             :  * to the callers.
    2071             :  *
    2072             :  * Returns true if we queued up the buffer, or false if it was already on
    2073             :  * the buffer list.
    2074             :  */
    2075             : bool
    2076   103539846 : xfs_buf_delwri_queue(
    2077             :         struct xfs_buf          *bp,
    2078             :         struct list_head        *list)
    2079             : {
    2080   103539846 :         ASSERT(xfs_buf_islocked(bp));
    2081   103539846 :         ASSERT(!(bp->b_flags & XBF_READ));
    2082             : 
    2083             :         /*
    2084             :          * If the buffer is already marked delwri it has already been queued
    2085             :          * up by someone else for immediate writeout.  Just ignore it in that
    2086             :          * case.
    2087             :          */
    2088   103539846 :         if (bp->b_flags & _XBF_DELWRI_Q) {
    2089    11813899 :                 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
    2090    11813899 :                 return false;
    2091             :         }
    2092             : 
    2093    91725947 :         trace_xfs_buf_delwri_queue(bp, _RET_IP_);
    2094             : 
    2095             :         /*
    2096             :          * If a buffer gets written out synchronously or marked stale while it
    2097             :          * is on a delwri list we lazily remove it. To do this, the other party
    2098             :          * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
    2099             :          * It remains referenced and on the list.  In a rare corner case it
    2100             :          * might get re-added to a delwri list after the synchronous writeout,
    2101             :          * in which case we just need to re-set the flag here.
    2102             :          */
    2103    91725946 :         bp->b_flags |= _XBF_DELWRI_Q;
    2104    91725946 :         if (list_empty(&bp->b_list)) {
    2105    91725944 :                 atomic_inc(&bp->b_hold);
    2106    91725945 :                 list_add_tail(&bp->b_list, list);
    2107             :         }
    2108             : 
    2109             :         return true;
    2110             : }
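
A hedged sketch of the caller side of queueing (illustrative only): the buffer must be locked when queued, the list head is private to the caller, and because the queue takes its own hold the caller can unlock immediately afterwards.

    static void example_queue_one(struct xfs_buf *bp, struct list_head *buffer_list)
    {
            xfs_buf_lock(bp);
            /* a false return only means the buffer is already on some delwri
             * queue and will be written out from there */
            xfs_buf_delwri_queue(bp, buffer_list);
            xfs_buf_unlock(bp);
    }
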
    2111             : 
    2112             : /*
    2113             :  * The compare function is more complex than it needs to be because the
    2114             :  * return value is only 32 bits wide while we are comparing 64-bit block
    2115             :  * numbers, so the difference cannot simply be returned as-is.
    2116             :  */
    2117             : static int
    2118   735895379 : xfs_buf_cmp(
    2119             :         void                    *priv,
    2120             :         const struct list_head  *a,
    2121             :         const struct list_head  *b)
    2122             : {
    2123   735895379 :         struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
    2124   735895379 :         struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
    2125   735895379 :         xfs_daddr_t             diff;
    2126             : 
    2127   735895379 :         diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
    2128   735895379 :         if (diff < 0)
    2129             :                 return -1;
    2130   362762096 :         if (diff > 0)
    2131   362759832 :                 return 1;
    2132             :         return 0;
    2133             : }
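
An illustration, not part of the source, of the truncation problem the explicit -1/0/1 mapping above avoids: pushing the 64-bit difference through the 32-bit return value can drop the high bits and mis-sort the list.

    /* WRONG, for illustration only: with a = 1LL << 32 and b = 0 the 64-bit
     * difference is 0x100000000, which truncates to 0 in the int return
     * value, so the two buffers would incorrectly compare as equal. */
    static int example_bad_cmp(xfs_daddr_t a, xfs_daddr_t b)
    {
            return a - b;
    }
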
    2134             : 
    2135             : /*
    2136             :  * Submit buffers for write. If wait_list is specified, the buffers are
    2137             :  * submitted using sync I/O and placed on the wait list such that the caller can
    2138             :  * iowait each buffer. Otherwise async I/O is used and the buffers are released
    2139             :  * at I/O completion time. In either case, buffers remain locked until I/O
    2140             :  * completes and the buffer is released from the queue.
    2141             :  */
    2142             : static int
    2143     4181305 : xfs_buf_delwri_submit_buffers(
    2144             :         struct list_head        *buffer_list,
    2145             :         struct list_head        *wait_list)
    2146             : {
    2147     4181305 :         struct xfs_buf          *bp, *n;
    2148     4181305 :         int                     pinned = 0;
    2149     4181305 :         struct blk_plug         plug;
    2150             : 
    2151     4181305 :         list_sort(NULL, buffer_list, xfs_buf_cmp);
    2152             : 
    2153     4181041 :         blk_start_plug(&plug);
    2154    96799790 :         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
    2155    92618573 :                 if (!wait_list) {
    2156    66679402 :                         if (!xfs_buf_trylock(bp))
    2157       42383 :                                 continue;
    2158    66637019 :                         if (xfs_buf_ispinned(bp)) {
    2159      850245 :                                 xfs_buf_unlock(bp);
    2160      850245 :                                 pinned++;
    2161      850245 :                                 continue;
    2162             :                         }
    2163             :                 } else {
    2164    25939171 :                         xfs_buf_lock(bp);
    2165             :                 }
    2166             : 
    2167             :                 /*
    2168             :                  * Someone else might have written the buffer synchronously or
    2169             :                  * marked it stale in the meantime.  In that case only the
    2170             :                  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
    2171             :                  * reference and remove it from the list here.
    2172             :                  */
    2173    91725945 :                 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
    2174       29246 :                         list_del_init(&bp->b_list);
    2175       29246 :                         xfs_buf_relse(bp);
    2176       29246 :                         continue;
    2177             :                 }
    2178             : 
    2179    91696699 :                 trace_xfs_buf_delwri_split(bp, _RET_IP_);
    2180             : 
    2181             :                 /*
    2182             :                  * If we have a wait list, each buffer (and associated delwri
    2183             :                  * queue reference) transfers to it and is submitted
    2184             :                  * synchronously. Otherwise, drop the buffer from the delwri
    2185             :                  * queue and submit async.
    2186             :                  */
    2187    91696699 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2188    91696699 :                 bp->b_flags |= XBF_WRITE;
    2189    91696699 :                 if (wait_list) {
    2190    25939171 :                         bp->b_flags &= ~XBF_ASYNC;
    2191    25939171 :                         list_move_tail(&bp->b_list, wait_list);
    2192             :                 } else {
    2193    65757528 :                         bp->b_flags |= XBF_ASYNC;
    2194    65757528 :                         list_del_init(&bp->b_list);
    2195             :                 }
    2196    91696699 :                 __xfs_buf_submit(bp, false);
    2197             :         }
    2198     4181217 :         blk_finish_plug(&plug);
    2199             : 
    2200     4181052 :         return pinned;
    2201             : }
    2202             : 
    2203             : /*
    2204             :  * Write out a buffer list asynchronously.
    2205             :  *
    2206             :  * This will take the @buffer_list, write all non-locked and non-pinned buffers
    2207             :  * out and not wait for I/O completion on any of the buffers.  This interface
    2208             :  * is only safely usable for callers that can track I/O completion by higher
    2209             :  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
    2210             :  * function.
    2211             :  *
    2212             :  * Note: this function will skip buffers it would block on, and in doing so
    2213             :  * leaves them on @buffer_list so they can be retried on a later pass. As such,
    2214             :  * it is up to the caller to ensure that the buffer list is fully submitted or
    2215             :  * cancelled appropriately when they are finished with the list. Failure to
    2216             :  * cancel or resubmit the list until it is empty will result in leaked buffers
    2217             :  * at unmount time.
    2218             :  */
    2219             : int
    2220     3249126 : xfs_buf_delwri_submit_nowait(
    2221             :         struct list_head        *buffer_list)
    2222             : {
    2223     3249126 :         return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
    2224             : }
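
A hedged sketch of the resubmission pattern the comment above calls for (the loop structure and backoff value are illustrative, not lifted from a real caller such as the AIL): buffers that were skipped because they were locked or pinned stay on the list, so the caller keeps pushing until the list drains.

    static void example_push_until_empty(struct list_head *buffer_list)
    {
            while (!list_empty(buffer_list)) {
                    /* the return value is the number of pinned buffers skipped;
                     * a real caller would force the log to unpin them */
                    xfs_buf_delwri_submit_nowait(buffer_list);
                    msleep(50);     /* illustrative backoff before retrying */
            }
    }
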
    2225             : 
    2226             : /*
    2227             :  * Write out a buffer list synchronously.
    2228             :  *
    2229             :  * This will take the @buffer_list, write all buffers out and wait for I/O
    2230             :  * completion on all of the buffers. @buffer_list is consumed by the function,
    2231             :  * so callers must have some other way of tracking buffers if they require such
    2232             :  * functionality.
    2233             :  */
    2234             : int
    2235      932211 : xfs_buf_delwri_submit(
    2236             :         struct list_head        *buffer_list)
    2237             : {
    2238      932211 :         LIST_HEAD               (wait_list);
    2239      932211 :         int                     error = 0, error2;
    2240      932211 :         struct xfs_buf          *bp;
    2241             : 
    2242      932211 :         xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
    2243             : 
    2244             :         /* Wait for IO to complete. */
    2245    26871382 :         while (!list_empty(&wait_list)) {
    2246    25939171 :                 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
    2247             : 
    2248    25939171 :                 list_del_init(&bp->b_list);
    2249             : 
    2250             :                 /*
    2251             :                  * Wait on the locked buffer, check for errors and unlock and
    2252             :                  * release the delwri queue reference.
    2253             :                  */
    2254    25939171 :                 error2 = xfs_buf_iowait(bp);
    2255    25939171 :                 xfs_buf_relse(bp);
    2256    25939171 :                 if (!error)
    2257    25939171 :                         error = error2;
    2258             :         }
    2259             : 
    2260      932211 :         return error;
    2261             : }
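
A hedged fragment illustrating the error semantics of the synchronous variant (the shutdown call is a made-up response, not this file's policy): the list is fully consumed whether or not writes fail, and the first error seen is what the caller gets back.

    error = xfs_buf_delwri_submit(&buffer_list);
    ASSERT(list_empty(&buffer_list));   /* consumed even on error */
    if (error)
            xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);  /* illustrative */
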
    2262             : 
    2263             : /*
    2264             :  * Push a single buffer on a delwri queue.
    2265             :  *
    2266             :  * The purpose of this function is to submit a single buffer of a delwri queue
    2267             :  * and return with the buffer still on the original queue. The waiting delwri
    2268             :  * buffer submission infrastructure guarantees transfer of the delwri queue
    2269             :  * buffer reference to a temporary wait list. We reuse this infrastructure to
    2270             :  * transfer the buffer back to the original queue.
    2271             :  *
    2272             :  * Note the buffer transitions from the queued state to the submitted and
    2273             :  * wait-listed state and back to the queued state during this call. The buffer
    2274             :  * locking and queue management logic between _delwri_pushbuf() and
    2275             :  * _delwri_queue() guarantee that the buffer cannot be queued to another list
    2276             :  * before returning.
    2277             :  */
    2278             : int
    2279           0 : xfs_buf_delwri_pushbuf(
    2280             :         struct xfs_buf          *bp,
    2281             :         struct list_head        *buffer_list)
    2282             : {
    2283           0 :         LIST_HEAD               (submit_list);
    2284           0 :         int                     error;
    2285             : 
    2286           0 :         ASSERT(bp->b_flags & _XBF_DELWRI_Q);
    2287             : 
    2288           0 :         trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
    2289             : 
    2290             :         /*
    2291             :          * Isolate the buffer to a new local list so we can submit it for I/O
    2292             :          * independently from the rest of the original list.
    2293             :          */
    2294           0 :         xfs_buf_lock(bp);
    2295           0 :         list_move(&bp->b_list, &submit_list);
    2296           0 :         xfs_buf_unlock(bp);
    2297             : 
    2298             :         /*
    2299             :          * Delwri submission clears the DELWRI_Q buffer flag and returns with
    2300             :          * the buffer on the wait list with the original reference. Rather than
    2301             :          * bounce the buffer from a local wait list back to the original list
    2302             :          * after I/O completion, reuse the original list as the wait list.
    2303             :          */
    2304           0 :         xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
    2305             : 
    2306             :         /*
    2307             :          * The buffer is now locked, under I/O and wait listed on the original
    2308             :          * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
    2309             :          * return with the buffer unlocked and on the original queue.
    2310             :          */
    2311           0 :         error = xfs_buf_iowait(bp);
    2312           0 :         bp->b_flags |= _XBF_DELWRI_Q;
    2313           0 :         xfs_buf_unlock(bp);
    2314             : 
    2315           0 :         return error;
    2316             : }
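
A hedged usage sketch (the wrapper and its error code choice are hypothetical): _delwri_pushbuf() is only valid for a buffer that is already on the caller's delwri queue, and the buffer is back on that queue, unlocked, when the call returns.

    static int example_force_one_buffer(struct xfs_buf *bp,
                                        struct list_head *buffer_list)
    {
            if (!(bp->b_flags & _XBF_DELWRI_Q))
                    return -EAGAIN;     /* illustrative error choice */

            /* write it out now; on return it is unlocked and queued again */
            return xfs_buf_delwri_pushbuf(bp, buffer_list);
    }
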
    2317             : 
    2318 16381079072 : void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
    2319             : {
    2320             :         /*
    2321             :          * Set the lru reference count to 0 based on the error injection tag.
    2322             :          * This allows userspace to disrupt buffer caching for debug/testing
    2323             :          * purposes.
    2324             :          */
    2325 16381079072 :         if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
    2326          38 :                 lru_ref = 0;
    2327             : 
    2328 16377975949 :         atomic_set(&bp->b_lru_ref, lru_ref);
    2329 16377975949 : }
    2330             : 
    2331             : /*
    2332             :  * Verify an on-disk magic value against the magic value specified in the
    2333             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2334             :  * expected to pass the value directly from disk.
    2335             :  */
    2336             : bool
    2337   190206866 : xfs_verify_magic(
    2338             :         struct xfs_buf          *bp,
    2339             :         __be32                  dmagic)
    2340             : {
    2341   190206866 :         struct xfs_mount        *mp = bp->b_mount;
    2342   190206866 :         int                     idx;
    2343             : 
    2344   190206866 :         idx = xfs_has_crc(mp);
    2345   190206866 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
    2346             :                 return false;
    2347   190223376 :         return dmagic == bp->b_ops->magic[idx];
    2348             : }
    2349             : /*
    2350             :  * Verify an on-disk magic value against the magic value specified in the
    2351             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2352             :  * expected to pass the value directly from disk.
    2353             :  */
    2354             : bool
    2355  1311257848 : xfs_verify_magic16(
    2356             :         struct xfs_buf          *bp,
    2357             :         __be16                  dmagic)
    2358             : {
    2359  1311257848 :         struct xfs_mount        *mp = bp->b_mount;
    2360  1311257848 :         int                     idx;
    2361             : 
    2362  1311257848 :         idx = xfs_has_crc(mp);
    2363  1311257848 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
    2364             :                 return false;
    2365  1311257853 :         return dmagic == bp->b_ops->magic16[idx];
    2366             : }
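
A hedged sketch of how these helpers are typically consumed by a buffer verifier (the header layout, ops table, magic values and callback below are illustrative, not actual XFS verifiers): the magic array holds the non-CRC value at index 0 and the CRC-enabled value at index 1, matching the xfs_has_crc() index used above.

    struct example_hdr {
            __be32                  magic;          /* hypothetical on-disk header */
    };

    static void example_verify_read(struct xfs_buf *bp)
    {
            struct example_hdr      *hdr = bp->b_addr;

            if (!xfs_verify_magic(bp, hdr->magic))
                    xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
    }

    static const struct xfs_buf_ops example_buf_ops = {
            .name           = "example",
            .magic          = { cpu_to_be32(0x58454731),    /* illustrative */
                                cpu_to_be32(0x58454733) },  /* illustrative */
            .verify_read    = example_verify_read,
    };
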

Generated by: LCOV version 1.14