LCOV - code coverage report
Current view: top level - fs/xfs - xfs_buf.c (source / functions)
Test: fstests of 6.5.0-rc3-djwa @ Mon Jul 31 20:08:17 PDT 2023
Date: 2023-07-31 20:08:17
Coverage: Lines: 822 of 922 hit (89.2 %)    Functions: 69 of 71 hit (97.2 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include <linux/backing-dev.h>
       8             : #include <linux/dax.h>
       9             : 
      10             : #include "xfs_shared.h"
      11             : #include "xfs_format.h"
      12             : #include "xfs_log_format.h"
      13             : #include "xfs_trans_resv.h"
      14             : #include "xfs_mount.h"
      15             : #include "xfs_trace.h"
      16             : #include "xfs_log.h"
      17             : #include "xfs_log_recover.h"
      18             : #include "xfs_log_priv.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_buf_item.h"
      21             : #include "xfs_errortag.h"
      22             : #include "xfs_error.h"
      23             : #include "xfs_ag.h"
      24             : 
      25             : struct kmem_cache *xfs_buf_cache;
      26             : 
      27             : /*
      28             :  * Locking orders
      29             :  *
      30             :  * xfs_buf_ioacct_inc:
      31             :  * xfs_buf_ioacct_dec:
      32             :  *      b_sema (caller holds)
      33             :  *        b_lock
      34             :  *
      35             :  * xfs_buf_stale:
      36             :  *      b_sema (caller holds)
      37             :  *        b_lock
      38             :  *          lru_lock
      39             :  *
      40             :  * xfs_buf_rele:
      41             :  *      b_lock
      42             :  *        pag_buf_lock
      43             :  *          lru_lock
      44             :  *
      45             :  * xfs_buftarg_drain_rele
      46             :  *      lru_lock
      47             :  *        b_lock (trylock due to inversion)
      48             :  *
      49             :  * xfs_buftarg_isolate
      50             :  *      lru_lock
      51             :  *        b_lock (trylock due to inversion)
      52             :  */
      53             : 
      54             : static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
      55             : 
      56             : static inline int
      57    25901253 : xfs_buf_submit(
      58             :         struct xfs_buf          *bp)
      59             : {
      60    25901253 :         return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
      61             : }
      62             : 
      63             : static inline int
      64             : xfs_buf_is_vmapped(
      65             :         struct xfs_buf  *bp)
      66             : {
      67             :         /*
      68             :          * Return true if the buffer is vmapped.
      69             :          *
      70             :          * b_addr is null if the buffer is not mapped, but the code is clever
      71             :          * enough to know it doesn't have to map a single page, so the check has
      72             :          * to be both for b_addr and bp->b_page_count > 1.
      73             :          */
      74      138446 :         return bp->b_addr && bp->b_page_count > 1;
      75             : }
      76             : 
      77             : static inline int
      78             : xfs_buf_vmap_len(
      79             :         struct xfs_buf  *bp)
      80             : {
      81             :         return (bp->b_page_count * PAGE_SIZE);
      82             : }
      83             : 
      84             : /*
      85             :  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
      86             :  * this buffer. The count is incremented once per buffer (per hold cycle)
      87             :  * because the corresponding decrement is deferred to buffer release. Buffers
      88             :  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
       89             :  * tracking adds unnecessary overhead. This is used for synchronization purposes
      90             :  * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
      91             :  * in-flight buffers.
      92             :  *
      93             :  * Buffers that are never released (e.g., superblock, iclog buffers) must set
      94             :  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
      95             :  * never reaches zero and unmount hangs indefinitely.
      96             :  */
      97             : static inline void
      98    67517842 : xfs_buf_ioacct_inc(
      99             :         struct xfs_buf  *bp)
     100             : {
     101    67517842 :         if (bp->b_flags & XBF_NO_IOACCT)
     102             :                 return;
     103             : 
     104    67382086 :         ASSERT(bp->b_flags & XBF_ASYNC);
     105    67382086 :         spin_lock(&bp->b_lock);
     106    67382156 :         if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
     107    64783616 :                 bp->b_state |= XFS_BSTATE_IN_FLIGHT;
     108    64783616 :                 percpu_counter_inc(&bp->b_target->bt_io_count);
     109             :         }
     110    67382149 :         spin_unlock(&bp->b_lock);
     111             : }
     112             : 
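/*
 * Illustrative sketch, not part of xfs_buf.c: per the comment above, a buffer
 * that is never released (e.g. superblock, iclog buffers) must opt out of
 * in-flight accounting before any I/O is submitted, otherwise bt_io_count
 * never drains and xfs_buftarg_drain() waits forever at unmount. The helper
 * name below is hypothetical; only the XBF_NO_IOACCT flag is real.
 */
static inline void example_mark_buf_no_ioacct(struct xfs_buf *bp)
{
	bp->b_flags |= XBF_NO_IOACCT;	/* skip bt_io_count accounting */
}
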
     113             : /*
     114             :  * Clear the in-flight state on a buffer about to be released to the LRU or
     115             :  * freed and unaccount from the buftarg.
     116             :  */
     117             : static inline void
     118  5430642246 : __xfs_buf_ioacct_dec(
     119             :         struct xfs_buf  *bp)
     120             : {
     121  5430642246 :         lockdep_assert_held(&bp->b_lock);
     122             : 
     123  5430642246 :         if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
     124    64783428 :                 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
     125    64783428 :                 percpu_counter_dec(&bp->b_target->bt_io_count);
     126             :         }
     127  5430642131 : }
     128             : 
     129             : static inline void
     130      137874 : xfs_buf_ioacct_dec(
     131             :         struct xfs_buf  *bp)
     132             : {
     133      137874 :         spin_lock(&bp->b_lock);
     134      137874 :         __xfs_buf_ioacct_dec(bp);
     135      137874 :         spin_unlock(&bp->b_lock);
     136      137874 : }
     137             : 
     138             : /*
     139             :  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
     140             :  * b_lru_ref count so that the buffer is freed immediately when the buffer
     141             :  * reference count falls to zero. If the buffer is already on the LRU, we need
     142             :  * to remove the reference that LRU holds on the buffer.
     143             :  *
     144             :  * This prevents build-up of stale buffers on the LRU.
     145             :  */
     146             : void
     147    25037198 : xfs_buf_stale(
     148             :         struct xfs_buf  *bp)
     149             : {
     150    25037198 :         ASSERT(xfs_buf_islocked(bp));
     151             : 
     152    25037198 :         bp->b_flags |= XBF_STALE;
     153             : 
     154             :         /*
     155             :          * Clear the delwri status so that a delwri queue walker will not
     156             :          * flush this buffer to disk now that it is stale. The delwri queue has
     157             :          * a reference to the buffer, so this is safe to do.
     158             :          */
     159    25037198 :         bp->b_flags &= ~_XBF_DELWRI_Q;
     160             : 
     161             :         /*
     162             :          * Once the buffer is marked stale and unlocked, a subsequent lookup
     163             :          * could reset b_flags. There is no guarantee that the buffer is
     164             :          * unaccounted (released to LRU) before that occurs. Drop in-flight
     165             :          * status now to preserve accounting consistency.
     166             :          */
     167    25037198 :         spin_lock(&bp->b_lock);
     168    25037521 :         __xfs_buf_ioacct_dec(bp);
     169             : 
     170    25037532 :         atomic_set(&bp->b_lru_ref, 0);
     171    50075746 :         if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
     172    25037077 :             (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
     173     4325556 :                 atomic_dec(&bp->b_hold);
     174             : 
     175    25038670 :         ASSERT(atomic_read(&bp->b_hold) >= 1);
     176    25038670 :         spin_unlock(&bp->b_lock);
     177    25038047 : }
     178             : 
     179             : static int
     180    50137449 : xfs_buf_get_maps(
     181             :         struct xfs_buf          *bp,
     182             :         int                     map_count)
     183             : {
     184    50137449 :         ASSERT(bp->b_maps == NULL);
     185    50137449 :         bp->b_map_count = map_count;
     186             : 
     187    50137449 :         if (map_count == 1) {
     188    50137409 :                 bp->b_maps = &bp->__b_map;
     189    50137409 :                 return 0;
     190             :         }
     191             : 
     192          40 :         bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
     193             :                                 KM_NOFS);
     194          40 :         if (!bp->b_maps)
     195           0 :                 return -ENOMEM;
     196             :         return 0;
     197             : }
     198             : 
     199             : /*
      200             :  *      Frees b_maps if it was allocated separately.
     201             :  */
     202             : static void
     203             : xfs_buf_free_maps(
     204             :         struct xfs_buf  *bp)
     205             : {
     206    50093770 :         if (bp->b_maps != &bp->__b_map) {
     207          40 :                 kmem_free(bp->b_maps);
     208          40 :                 bp->b_maps = NULL;
     209             :         }
     210             : }
     211             : 
     212             : static int
     213    50138889 : _xfs_buf_alloc(
     214             :         struct xfs_buftarg      *target,
     215             :         struct xfs_buf_map      *map,
     216             :         int                     nmaps,
     217             :         xfs_buf_flags_t         flags,
     218             :         struct xfs_buf          **bpp)
     219             : {
     220    50138889 :         struct xfs_buf          *bp;
     221    50138889 :         int                     error;
     222    50138889 :         int                     i;
     223             : 
     224    50138889 :         *bpp = NULL;
     225    50138889 :         bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
     226             : 
     227             :         /*
     228             :          * We don't want certain flags to appear in b_flags unless they are
     229             :          * specifically set by later operations on the buffer.
     230             :          */
     231    50136889 :         flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
     232             : 
     233    50136889 :         atomic_set(&bp->b_hold, 1);
     234    50136889 :         atomic_set(&bp->b_lru_ref, 1);
     235    50136889 :         init_completion(&bp->b_iowait);
     236    50133921 :         INIT_LIST_HEAD(&bp->b_lru);
     237    50133921 :         INIT_LIST_HEAD(&bp->b_list);
     238    50133921 :         INIT_LIST_HEAD(&bp->b_li_list);
     239    50133921 :         sema_init(&bp->b_sema, 0); /* held, no waiters */
     240    50133921 :         spin_lock_init(&bp->b_lock);
     241    50137323 :         bp->b_target = target;
     242    50137323 :         bp->b_mount = target->bt_mount;
     243    50137323 :         bp->b_flags = flags;
     244             : 
     245             :         /*
      246             :          * Set the buffer length from the supplied maps. I/O routines
      247             :          * operate on b_length, which is the sum of the lengths of all
      248             :          * the maps.
     249             :          */
     250    50137323 :         error = xfs_buf_get_maps(bp, nmaps);
     251    50137968 :         if (error)  {
     252           0 :                 kmem_cache_free(xfs_buf_cache, bp);
     253           0 :                 return error;
     254             :         }
     255             : 
     256    50137968 :         bp->b_rhash_key = map[0].bm_bn;
     257    50137968 :         bp->b_length = 0;
     258   100279026 :         for (i = 0; i < nmaps; i++) {
     259    50141058 :                 bp->b_maps[i].bm_bn = map[i].bm_bn;
     260    50141058 :                 bp->b_maps[i].bm_len = map[i].bm_len;
     261    50141058 :                 bp->b_length += map[i].bm_len;
     262             :         }
     263             : 
     264    50137968 :         atomic_set(&bp->b_pin_count, 0);
     265    50137968 :         init_waitqueue_head(&bp->b_waiters);
     266             : 
     267    50140177 :         XFS_STATS_INC(bp->b_mount, xb_create);
     268    50140445 :         trace_xfs_buf_init(bp, _RET_IP_);
     269             : 
     270    50139813 :         *bpp = bp;
     271    50139813 :         return 0;
     272             : }
     273             : 
     274             : static void
     275      138446 : xfs_buf_free_pages(
     276             :         struct xfs_buf  *bp)
     277             : {
     278      138446 :         uint            i;
     279             : 
     280      138446 :         ASSERT(bp->b_flags & _XBF_PAGES);
     281             : 
     282      138446 :         if (xfs_buf_is_vmapped(bp))
     283          62 :                 vm_unmap_ram(bp->b_addr, bp->b_page_count);
     284             : 
     285      276954 :         for (i = 0; i < bp->b_page_count; i++) {
     286      138508 :                 if (bp->b_pages[i])
     287      138508 :                         __free_page(bp->b_pages[i]);
     288             :         }
     289      138446 :         mm_account_reclaimed_pages(bp->b_page_count);
     290             : 
     291      138446 :         if (bp->b_pages != bp->b_page_array)
     292           0 :                 kmem_free(bp->b_pages);
     293      138446 :         bp->b_pages = NULL;
     294      138446 :         bp->b_flags &= ~_XBF_PAGES;
     295      138446 : }
     296             : 
     297             : static void
     298    50093770 : xfs_buf_free_callback(
     299             :         struct callback_head    *cb)
     300             : {
     301    50093770 :         struct xfs_buf          *bp = container_of(cb, struct xfs_buf, b_rcu);
     302             : 
     303    50093770 :         xfs_buf_free_maps(bp);
     304    50093770 :         kmem_cache_free(xfs_buf_cache, bp);
     305    50099633 : }
     306             : 
     307             : static void
     308    50142387 : xfs_buf_free(
     309             :         struct xfs_buf          *bp)
     310             : {
     311    50142387 :         trace_xfs_buf_free(bp, _RET_IP_);
     312             : 
     313    50142394 :         ASSERT(list_empty(&bp->b_lru));
     314             : 
     315    50142394 :         if (bp->b_flags & _XBF_PAGES)
     316      138446 :                 xfs_buf_free_pages(bp);
     317    50003948 :         else if (bp->b_flags & _XBF_KMEM)
     318    50003934 :                 kmem_free(bp->b_addr);
     319             : 
     320    50142396 :         call_rcu(&bp->b_rcu, xfs_buf_free_callback);
     321    50142364 : }
     322             : 
     323             : static int
     324    50001505 : xfs_buf_alloc_kmem(
     325             :         struct xfs_buf  *bp,
     326             :         xfs_buf_flags_t flags)
     327             : {
     328    50001505 :         xfs_km_flags_t  kmflag_mask = KM_NOFS;
     329    50001505 :         size_t          size = BBTOB(bp->b_length);
     330             : 
     331             :         /* Assure zeroed buffer for non-read cases. */
     332    50001505 :         if (!(flags & XBF_READ))
     333    24195065 :                 kmflag_mask |= KM_ZERO;
     334             : 
     335    50001505 :         bp->b_addr = kmem_alloc(size, kmflag_mask);
     336    50000583 :         if (!bp->b_addr)
     337             :                 return -ENOMEM;
     338             : 
     339    50000583 :         if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
     340             :             ((unsigned long)bp->b_addr & PAGE_MASK)) {
     341             :                 /* b_addr spans two pages - use alloc_page instead */
     342           0 :                 kmem_free(bp->b_addr);
     343           0 :                 bp->b_addr = NULL;
     344           0 :                 return -ENOMEM;
     345             :         }
     346    50000583 :         bp->b_offset = offset_in_page(bp->b_addr);
     347    50000583 :         bp->b_pages = bp->b_page_array;
     348    50000583 :         bp->b_pages[0] = kmem_to_page(bp->b_addr);
     349    50001097 :         bp->b_page_count = 1;
     350    50001097 :         bp->b_flags |= _XBF_KMEM;
     351    50001097 :         return 0;
     352             : }
     353             : 
     354             : static int
     355      138443 : xfs_buf_alloc_pages(
     356             :         struct xfs_buf  *bp,
     357             :         xfs_buf_flags_t flags)
     358             : {
     359      138443 :         gfp_t           gfp_mask = __GFP_NOWARN;
     360      138443 :         long            filled = 0;
     361             : 
     362      138443 :         if (flags & XBF_READ_AHEAD)
     363             :                 gfp_mask |= __GFP_NORETRY;
     364             :         else
     365      138435 :                 gfp_mask |= GFP_NOFS;
     366             : 
     367             :         /* Make sure that we have a page list */
     368      138443 :         bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
     369      138443 :         if (bp->b_page_count <= XB_PAGES) {
     370      138443 :                 bp->b_pages = bp->b_page_array;
     371             :         } else {
     372           0 :                 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
     373             :                                         gfp_mask);
     374           0 :                 if (!bp->b_pages)
     375             :                         return -ENOMEM;
     376             :         }
     377      138443 :         bp->b_flags |= _XBF_PAGES;
     378             : 
     379             :         /* Assure zeroed buffer for non-read cases. */
     380      138443 :         if (!(flags & XBF_READ))
     381      138098 :                 gfp_mask |= __GFP_ZERO;
     382             : 
     383             :         /*
     384             :          * Bulk filling of pages can take multiple calls. Not filling the entire
     385             :          * array is not an allocation failure, so don't back off if we get at
     386             :          * least one extra page.
     387             :          */
     388      138443 :         for (;;) {
     389      138443 :                 long    last = filled;
     390             : 
     391      138443 :                 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
     392             :                                                 bp->b_pages);
     393      138443 :                 if (filled == bp->b_page_count) {
     394      138443 :                         XFS_STATS_INC(bp->b_mount, xb_page_found);
     395      138443 :                         break;
     396             :                 }
     397             : 
     398           0 :                 if (filled != last)
     399           0 :                         continue;
     400             : 
     401           0 :                 if (flags & XBF_READ_AHEAD) {
     402           0 :                         xfs_buf_free_pages(bp);
     403           0 :                         return -ENOMEM;
     404             :                 }
     405             : 
     406           0 :                 XFS_STATS_INC(bp->b_mount, xb_page_retries);
     407           0 :                 memalloc_retry_wait(gfp_mask);
     408             :         }
     409      138443 :         return 0;
     410             : }
     411             : 
     412             : /*
     413             :  *      Map buffer into kernel address-space if necessary.
     414             :  */
     415             : STATIC int
     416      138443 : _xfs_buf_map_pages(
     417             :         struct xfs_buf          *bp,
     418             :         xfs_buf_flags_t         flags)
     419             : {
     420      138443 :         ASSERT(bp->b_flags & _XBF_PAGES);
     421      138443 :         if (bp->b_page_count == 1) {
     422             :                 /* A single page buffer is always mappable */
     423      138381 :                 bp->b_addr = page_address(bp->b_pages[0]);
     424          62 :         } else if (flags & XBF_UNMAPPED) {
     425           0 :                 bp->b_addr = NULL;
     426             :         } else {
     427          62 :                 int retried = 0;
     428          62 :                 unsigned nofs_flag;
     429             : 
     430             :                 /*
     431             :                  * vm_map_ram() will allocate auxiliary structures (e.g.
     432             :                  * pagetables) with GFP_KERNEL, yet we are likely to be under
     433             :                  * GFP_NOFS context here. Hence we need to tell memory reclaim
     434             :                  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
     435             :                  * memory reclaim re-entering the filesystem here and
     436             :                  * potentially deadlocking.
     437             :                  */
     438          62 :                 nofs_flag = memalloc_nofs_save();
     439          62 :                 do {
     440          62 :                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
     441             :                                                 -1);
     442          62 :                         if (bp->b_addr)
     443             :                                 break;
     444           0 :                         vm_unmap_aliases();
     445           0 :                 } while (retried++ <= 1);
     446          62 :                 memalloc_nofs_restore(nofs_flag);
     447             : 
     448          62 :                 if (!bp->b_addr)
     449           0 :                         return -ENOMEM;
     450             :         }
     451             : 
     452             :         return 0;
     453             : }
     454             : 
     455             : /*
     456             :  *      Finding and Reading Buffers
     457             :  */
     458             : static int
     459 20743386446 : _xfs_buf_obj_cmp(
     460             :         struct rhashtable_compare_arg   *arg,
     461             :         const void                      *obj)
     462             : {
     463 20743386446 :         const struct xfs_buf_map        *map = arg->key;
     464 20743386446 :         const struct xfs_buf            *bp = obj;
     465             : 
     466             :         /*
     467             :          * The key hashing in the lookup path depends on the key being the
     468             :          * first element of the compare_arg, make sure to assert this.
     469             :          */
     470 20743386446 :         BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
     471             : 
     472 20743386446 :         if (bp->b_rhash_key != map->bm_bn)
     473             :                 return 1;
     474             : 
     475 15859671386 :         if (unlikely(bp->b_length != map->bm_len)) {
     476             :                 /*
     477             :                  * found a block number match. If the range doesn't
     478             :                  * match, the only way this is allowed is if the buffer
     479             :                  * in the cache is stale and the transaction that made
     480             :                  * it stale has not yet committed. i.e. we are
     481             :                  * reallocating a busy extent. Skip this buffer and
     482             :                  * continue searching for an exact match.
     483             :                  */
     484           4 :                 ASSERT(bp->b_flags & XBF_STALE);
     485           4 :                 return 1;
     486             :         }
     487             :         return 0;
     488             : }
     489             : 
     490             : static const struct rhashtable_params xfs_buf_hash_params = {
     491             :         .min_size               = 32,   /* empty AGs have minimal footprint */
     492             :         .nelem_hint             = 16,
     493             :         .key_len                = sizeof(xfs_daddr_t),
     494             :         .key_offset             = offsetof(struct xfs_buf, b_rhash_key),
     495             :         .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
     496             :         .automatic_shrinking    = true,
     497             :         .obj_cmpfn              = _xfs_buf_obj_cmp,
     498             : };
     499             : 
     500             : int
     501      126902 : xfs_buf_hash_init(
     502             :         struct xfs_perag        *pag)
     503             : {
     504      126902 :         spin_lock_init(&pag->pag_buf_lock);
     505      126902 :         return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
     506             : }
     507             : 
     508             : void
     509      126914 : xfs_buf_hash_destroy(
     510             :         struct xfs_perag        *pag)
     511             : {
     512      126914 :         rhashtable_destroy(&pag->pag_buf_hash);
     513      126914 : }
     514             : 
     515             : static int
     516 15907990164 : xfs_buf_map_verify(
     517             :         struct xfs_buftarg      *btp,
     518             :         struct xfs_buf_map      *map)
     519             : {
     520 15907990164 :         xfs_daddr_t             eofs;
     521             : 
     522             :         /* Check for IOs smaller than the sector size / not sector aligned */
     523 15907990164 :         ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
     524 15907990164 :         ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
     525             : 
     526             :         /*
     527             :          * Corrupted block numbers can get through to here, unfortunately, so we
     528             :          * have to check that the buffer falls within the filesystem bounds.
     529             :          */
     530 15907990164 :         eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
     531 15907990164 :         if (map->bm_bn < 0 || map->bm_bn >= eofs) {
     532           0 :                 xfs_alert(btp->bt_mount,
     533             :                           "%s: daddr 0x%llx out of range, EOFS 0x%llx",
     534             :                           __func__, map->bm_bn, eofs);
     535           0 :                 WARN_ON(1);
     536           0 :                 return -EFSCORRUPTED;
     537             :         }
     538             :         return 0;
     539             : }
     540             : 
     541             : static int
     542 15880108707 : xfs_buf_find_lock(
     543             :         struct xfs_buf          *bp,
     544             :         xfs_buf_flags_t         flags)
     545             : {
     546 15880108707 :         if (flags & XBF_TRYLOCK) {
     547  3618143817 :                 if (!xfs_buf_trylock(bp)) {
     548    75959770 :                         XFS_STATS_INC(bp->b_mount, xb_busy_locked);
     549    75959895 :                         return -EAGAIN;
     550             :                 }
     551             :         } else {
     552 12261964890 :                 xfs_buf_lock(bp);
     553 12259913074 :                 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
     554             :         }
     555             : 
     556             :         /*
     557             :          * if the buffer is stale, clear all the external state associated with
     558             :          * it. We need to keep flags such as how we allocated the buffer memory
     559             :          * intact here.
     560             :          */
     561 15805315757 :         if (bp->b_flags & XBF_STALE) {
     562       30992 :                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
     563       30992 :                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
     564       30992 :                 bp->b_ops = NULL;
     565             :         }
     566             :         return 0;
     567             : }
     568             : 
     569             : static inline int
     570 15911680121 : xfs_buf_lookup(
     571             :         struct xfs_perag        *pag,
     572             :         struct xfs_buf_map      *map,
     573             :         xfs_buf_flags_t         flags,
     574             :         struct xfs_buf          **bpp)
     575             : {
     576 15911680121 :         struct xfs_buf          *bp;
     577 15911680121 :         int                     error;
     578             : 
     579 15911680121 :         rcu_read_lock();
     580 15913828542 :         bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
     581 31784352029 :         if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
     582    50003937 :                 rcu_read_unlock();
     583    50003937 :                 return -ENOENT;
     584             :         }
     585 15876192630 :         rcu_read_unlock();
     586             : 
     587 15876733090 :         error = xfs_buf_find_lock(bp, flags);
     588 15873499846 :         if (error) {
     589    75959608 :                 xfs_buf_rele(bp);
     590    75959608 :                 return error;
     591             :         }
     592             : 
     593 15797540238 :         trace_xfs_buf_find(bp, flags, _RET_IP_);
     594 15800824013 :         *bpp = bp;
     595 15800824013 :         return 0;
     596             : }
     597             : 
     598             : /*
     599             :  * Insert the new_bp into the hash table. This consumes the perag reference
     600             :  * taken for the lookup regardless of the result of the insert.
     601             :  */
     602             : static int
     603    50002493 : xfs_buf_find_insert(
     604             :         struct xfs_buftarg      *btp,
     605             :         struct xfs_perag        *pag,
     606             :         struct xfs_buf_map      *cmap,
     607             :         struct xfs_buf_map      *map,
     608             :         int                     nmaps,
     609             :         xfs_buf_flags_t         flags,
     610             :         struct xfs_buf          **bpp)
     611             : {
     612    50002493 :         struct xfs_buf          *new_bp;
     613    50002493 :         struct xfs_buf          *bp;
     614    50002493 :         int                     error;
     615             : 
     616    50002493 :         error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
     617    50002049 :         if (error)
     618           0 :                 goto out_drop_pag;
     619             : 
     620             :         /*
     621             :          * For buffers that fit entirely within a single page, first attempt to
     622             :          * allocate the memory from the heap to minimise memory usage. If we
     623             :          * can't get heap memory for these small buffers, we fall back to using
     624             :          * the page allocator.
     625             :          */
     626   100003084 :         if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
     627    50001572 :             xfs_buf_alloc_kmem(new_bp, flags) < 0) {
     628         747 :                 error = xfs_buf_alloc_pages(new_bp, flags);
     629         572 :                 if (error)
     630           0 :                         goto out_free_buf;
     631             :         }
     632             : 
     633    50001607 :         spin_lock(&pag->pag_buf_lock);
     634    50003483 :         bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
     635             :                         &new_bp->b_rhash_head, xfs_buf_hash_params);
     636    50003298 :         if (IS_ERR(bp)) {
     637           0 :                 error = PTR_ERR(bp);
     638           0 :                 spin_unlock(&pag->pag_buf_lock);
     639           0 :                 goto out_free_buf;
     640             :         }
     641    50003298 :         if (bp) {
     642             :                 /* found an existing buffer */
     643        1028 :                 atomic_inc(&bp->b_hold);
     644        1028 :                 spin_unlock(&pag->pag_buf_lock);
     645        1028 :                 error = xfs_buf_find_lock(bp, flags);
     646        1028 :                 if (error)
     647           2 :                         xfs_buf_rele(bp);
     648             :                 else
     649        1026 :                         *bpp = bp;
     650        1028 :                 goto out_free_buf;
     651             :         }
     652             : 
     653             :         /* The new buffer keeps the perag reference until it is freed. */
     654    50002270 :         new_bp->b_pag = pag;
     655    50002270 :         spin_unlock(&pag->pag_buf_lock);
     656    50001777 :         *bpp = new_bp;
     657    50001777 :         return 0;
     658             : 
     659        1028 : out_free_buf:
     660        1028 :         xfs_buf_free(new_bp);
     661        1028 : out_drop_pag:
     662        1028 :         xfs_perag_put(pag);
     663        1028 :         return error;
     664             : }
     665             : 
     666             : /*
     667             :  * Assembles a buffer covering the specified range. The code is optimised for
     668             :  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
     669             :  * more hits than misses.
     670             :  */
     671             : int
     672 15902511314 : xfs_buf_get_map(
     673             :         struct xfs_buftarg      *btp,
     674             :         struct xfs_buf_map      *map,
     675             :         int                     nmaps,
     676             :         xfs_buf_flags_t         flags,
     677             :         struct xfs_buf          **bpp)
     678             : {
     679 15902511314 :         struct xfs_perag        *pag;
     680 15902511314 :         struct xfs_buf          *bp = NULL;
     681 15902511314 :         struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
     682 15902511314 :         int                     error;
     683 15902511314 :         int                     i;
     684             : 
     685 31811774570 :         for (i = 0; i < nmaps; i++)
     686 15909263256 :                 cmap.bm_len += map[i].bm_len;
     687             : 
     688 15902511314 :         error = xfs_buf_map_verify(btp, &cmap);
     689 15910379410 :         if (error)
     690             :                 return error;
     691             : 
     692 15910958762 :         pag = xfs_perag_get(btp->bt_mount,
     693             :                             xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
     694             : 
     695 15907315714 :         error = xfs_buf_lookup(pag, &cmap, flags, &bp);
     696 15926600425 :         if (error && error != -ENOENT)
     697    75959990 :                 goto out_put_perag;
     698             : 
     699             :         /* cache hits always outnumber misses by at least 10:1 */
     700 15850640435 :         if (unlikely(!bp)) {
     701    50003243 :                 XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
     702             : 
     703    50002938 :                 if (flags & XBF_INCORE)
     704         410 :                         goto out_put_perag;
     705             : 
     706             :                 /* xfs_buf_find_insert() consumes the perag reference. */
     707    50002528 :                 error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
     708             :                                 flags, &bp);
     709    50002953 :                 if (error)
     710             :                         return error;
     711             :         } else {
     712 15800637192 :                 XFS_STATS_INC(btp->bt_mount, xb_get_locked);
     713 15800998365 :                 xfs_perag_put(pag);
     714             :         }
     715             : 
     716             :         /* We do not hold a perag reference anymore. */
     717 15846108439 :         if (!bp->b_addr) {
     718         572 :                 error = _xfs_buf_map_pages(bp, flags);
     719         572 :                 if (unlikely(error)) {
     720           0 :                         xfs_warn_ratelimited(btp->bt_mount,
     721             :                                 "%s: failed to map %u pages", __func__,
     722             :                                 bp->b_page_count);
     723           0 :                         xfs_buf_relse(bp);
     724           0 :                         return error;
     725             :                 }
     726             :         }
     727             : 
     728             :         /*
     729             :          * Clear b_error if this is a lookup from a caller that doesn't expect
     730             :          * valid data to be found in the buffer.
     731             :          */
     732 15846108439 :         if (!(flags & XBF_READ))
     733    42866844 :                 xfs_buf_ioerror(bp, 0);
     734             : 
     735 15846108677 :         XFS_STATS_INC(btp->bt_mount, xb_get);
     736 15846576121 :         trace_xfs_buf_get(bp, flags, _RET_IP_);
     737 15851935640 :         *bpp = bp;
     738 15851935640 :         return 0;
     739             : 
     740    75960400 : out_put_perag:
     741    75960400 :         xfs_perag_put(pag);
     742    75960400 :         return error;
     743             : }
     744             : 
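/*
 * Illustrative sketch, not part of xfs_buf.c: a typical cached lookup builds a
 * single-extent map and lets xfs_buf_get_map() find or insert the buffer. The
 * wrapper name is hypothetical; DEFINE_SINGLE_BUF_MAP and xfs_buf_relse() are
 * the existing helpers.
 */
static int example_get_single_buf(struct xfs_buftarg *btp, xfs_daddr_t daddr,
		size_t numblks)
{
	DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);
	struct xfs_buf	*bp;
	int		error;

	error = xfs_buf_get_map(btp, &map, 1, 0, &bp);
	if (error)
		return error;
	/* ... use bp->b_addr while the buffer is held and locked ... */
	xfs_buf_relse(bp);	/* unlock and drop the hold */
	return 0;
}
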
     745             : int
     746    25832621 : _xfs_buf_read(
     747             :         struct xfs_buf          *bp,
     748             :         xfs_buf_flags_t         flags)
     749             : {
     750    25832621 :         ASSERT(!(flags & XBF_WRITE));
     751    25832621 :         ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
     752             : 
     753    25832621 :         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
     754    25832621 :         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
     755             : 
     756    25832621 :         return xfs_buf_submit(bp);
     757             : }
     758             : 
     759             : /*
     760             :  * Reverify a buffer found in cache without an attached ->b_ops.
     761             :  *
     762             :  * If the caller passed an ops structure and the buffer doesn't have ops
     763             :  * assigned, set the ops and use it to verify the contents. If verification
     764             :  * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
     765             :  * already in XBF_DONE state on entry.
     766             :  *
     767             :  * Under normal operations, every in-core buffer is verified on read I/O
     768             :  * completion. There are two scenarios that can lead to in-core buffers without
     769             :  * an assigned ->b_ops. The first is during log recovery of buffers on a V4
     770             :  * filesystem, though these buffers are purged at the end of recovery. The
     771             :  * other is online repair, which intentionally reads with a NULL buffer ops to
     772             :  * run several verifiers across an in-core buffer in order to establish buffer
     773             :  * type.  If repair can't establish that, the buffer will be left in memory
     774             :  * with NULL buffer ops.
     775             :  */
     776             : int
     777 16343951179 : xfs_buf_reverify(
     778             :         struct xfs_buf          *bp,
     779             :         const struct xfs_buf_ops *ops)
     780             : {
     781 16343951179 :         ASSERT(bp->b_flags & XBF_DONE);
     782 16343951179 :         ASSERT(bp->b_error == 0);
     783             : 
     784 16343951179 :         if (!ops || bp->b_ops)
     785             :                 return 0;
     786             : 
     787          79 :         bp->b_ops = ops;
     788          79 :         bp->b_ops->verify_read(bp);
     789          79 :         if (bp->b_error)
     790          77 :                 bp->b_flags &= ~XBF_DONE;
     791             :         return bp->b_error;
     792             : }
     793             : 
     794             : int
     795 15858944147 : xfs_buf_read_map(
     796             :         struct xfs_buftarg      *target,
     797             :         struct xfs_buf_map      *map,
     798             :         int                     nmaps,
     799             :         xfs_buf_flags_t         flags,
     800             :         struct xfs_buf          **bpp,
     801             :         const struct xfs_buf_ops *ops,
     802             :         xfs_failaddr_t          fa)
     803             : {
     804 15858944147 :         struct xfs_buf          *bp;
     805 15858944147 :         int                     error;
     806             : 
     807 15858944147 :         flags |= XBF_READ;
     808 15858944147 :         *bpp = NULL;
     809             : 
     810 15858944147 :         error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
     811 15882543960 :         if (error)
     812             :                 return error;
     813             : 
     814 15806135838 :         trace_xfs_buf_read(bp, flags, _RET_IP_);
     815             : 
     816 15809184952 :         if (!(bp->b_flags & XBF_DONE)) {
     817             :                 /* Initiate the buffer read and wait. */
     818    25822418 :                 XFS_STATS_INC(target->bt_mount, xb_get_read);
     819    25822429 :                 bp->b_ops = ops;
     820    25822429 :                 error = _xfs_buf_read(bp, flags);
     821             : 
     822             :                 /* Readahead iodone already dropped the buffer, so exit. */
     823    25417983 :                 if (flags & XBF_ASYNC)
     824             :                         return 0;
     825             :         } else {
     826             :                 /* Buffer already read; all we need to do is check it. */
     827 15783362534 :                 error = xfs_buf_reverify(bp, ops);
     828             : 
     829             :                 /* Readahead already finished; drop the buffer and exit. */
     830 15785449566 :                 if (flags & XBF_ASYNC) {
     831  3491096161 :                         xfs_buf_relse(bp);
     832  3491073225 :                         return 0;
     833             :                 }
     834             : 
     835             :                 /* We do not want read in the flags */
     836 12294353405 :                 bp->b_flags &= ~XBF_READ;
     837 12294353405 :                 ASSERT(bp->b_ops != NULL || ops == NULL);
     838             :         }
     839             : 
     840             :         /*
     841             :          * If we've had a read error, then the contents of the buffer are
     842             :          * invalid and should not be used. To ensure that a followup read tries
     843             :          * to pull the buffer from disk again, we clear the XBF_DONE flag and
     844             :          * mark the buffer stale. This ensures that anyone who has a current
      845             :  * reference to the buffer will interpret its contents correctly and
     846             :          * future cache lookups will also treat it as an empty, uninitialised
     847             :          * buffer.
     848             :          */
     849 12303057941 :         if (error) {
     850             :                 /*
     851             :                  * Check against log shutdown for error reporting because
     852             :                  * metadata writeback may require a read first and we need to
     853             :                  * report errors in metadata writeback until the log is shut
     854             :                  * down. High level transaction read functions already check
     855             :                  * against mount shutdown, anyway, so we only need to be
     856             :                  * concerned about low level IO interactions here.
     857             :                  */
     858       76482 :                 if (!xlog_is_shutdown(target->bt_mount->m_log))
     859       26396 :                         xfs_buf_ioerror_alert(bp, fa);
     860             : 
     861       38288 :                 bp->b_flags &= ~XBF_DONE;
     862       38288 :                 xfs_buf_stale(bp);
     863       38281 :                 xfs_buf_relse(bp);
     864             : 
     865             :                 /* bad CRC means corrupted metadata */
     866       38280 :                 if (error == -EFSBADCRC)
     867         668 :                         error = -EFSCORRUPTED;
     868       38280 :                 return error;
     869             :         }
     870             : 
     871 12303019700 :         *bpp = bp;
     872 12303019700 :         return 0;
     873             : }
     874             : 
     875             : /*
     876             :  *      If we are not low on memory then do the readahead in a deadlock
     877             :  *      safe manner.
     878             :  */
     879             : void
     880  3579990791 : xfs_buf_readahead_map(
     881             :         struct xfs_buftarg      *target,
     882             :         struct xfs_buf_map      *map,
     883             :         int                     nmaps,
     884             :         const struct xfs_buf_ops *ops)
     885             : {
     886  3579990791 :         struct xfs_buf          *bp;
     887             : 
     888  7160242127 :         xfs_buf_read_map(target, map, nmaps,
     889             :                      XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
     890  3579990791 :                      __this_address);
     891  3581485087 : }
     892             : 
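/*
 * Illustrative sketch, not part of xfs_buf.c: single-extent readahead is just
 * an async, trylock read with no waiter, akin to the single-map convenience
 * wrapper declared in xfs_buf.h. The function name here is hypothetical.
 */
static inline void example_readahead_single(struct xfs_buftarg *target,
		xfs_daddr_t blkno, size_t numblks, const struct xfs_buf_ops *ops)
{
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	xfs_buf_readahead_map(target, &map, 1, ops);
}
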
     893             : /*
     894             :  * Read an uncached buffer from disk. Allocates and returns a locked
     895             :  * buffer containing the disk contents or nothing. Uncached buffers always have
     896             :  * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
     897             :  * is cached or uncached during fault diagnosis.
     898             :  */
     899             : int
     900       67817 : xfs_buf_read_uncached(
     901             :         struct xfs_buftarg      *target,
     902             :         xfs_daddr_t             daddr,
     903             :         size_t                  numblks,
     904             :         xfs_buf_flags_t         flags,
     905             :         struct xfs_buf          **bpp,
     906             :         const struct xfs_buf_ops *ops)
     907             : {
     908       67817 :         struct xfs_buf          *bp;
     909       67817 :         int                     error;
     910             : 
     911       67817 :         *bpp = NULL;
     912             : 
     913       67817 :         error = xfs_buf_get_uncached(target, numblks, flags, &bp);
     914       67817 :         if (error)
     915             :                 return error;
     916             : 
     917             :         /* set up the buffer for a read IO */
     918       67817 :         ASSERT(bp->b_map_count == 1);
     919       67817 :         bp->b_rhash_key = XFS_BUF_DADDR_NULL;
     920       67817 :         bp->b_maps[0].bm_bn = daddr;
     921       67817 :         bp->b_flags |= XBF_READ;
     922       67817 :         bp->b_ops = ops;
     923             : 
     924       67817 :         xfs_buf_submit(bp);
     925       67817 :         if (bp->b_error) {
     926          16 :                 error = bp->b_error;
     927          16 :                 xfs_buf_relse(bp);
     928          16 :                 return error;
     929             :         }
     930             : 
     931       67801 :         *bpp = bp;
     932       67801 :         return 0;
     933             : }
     934             : 
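/*
 * Illustrative sketch, not part of xfs_buf.c: mount-time sanity checks use the
 * uncached read path to probe whether a daddr is readable at all, then drop
 * the buffer straight away. The helper name is hypothetical; passing a NULL
 * ops pointer skips verification.
 */
static int example_probe_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr)
{
	struct xfs_buf	*bp;
	int		error;

	error = xfs_buf_read_uncached(btp, daddr,
			XFS_FSB_TO_BB(btp->bt_mount, 1), 0, &bp, NULL);
	if (error)
		return error;
	xfs_buf_relse(bp);
	return 0;
}
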
     935             : int
     936      137871 : xfs_buf_get_uncached(
     937             :         struct xfs_buftarg      *target,
     938             :         size_t                  numblks,
     939             :         xfs_buf_flags_t         flags,
     940             :         struct xfs_buf          **bpp)
     941             : {
     942      137871 :         int                     error;
     943      137871 :         struct xfs_buf          *bp;
     944      137871 :         DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
     945             : 
     946      137871 :         *bpp = NULL;
     947             : 
     948             :         /* flags might contain irrelevant bits, pass only what we care about */
     949      137871 :         error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
     950      137871 :         if (error)
     951             :                 return error;
     952             : 
     953      137871 :         error = xfs_buf_alloc_pages(bp, flags);
     954      137871 :         if (error)
     955           0 :                 goto fail_free_buf;
     956             : 
     957      137871 :         error = _xfs_buf_map_pages(bp, 0);
     958      137871 :         if (unlikely(error)) {
     959           0 :                 xfs_warn(target->bt_mount,
     960             :                         "%s: failed to map pages", __func__);
     961           0 :                 goto fail_free_buf;
     962             :         }
     963             : 
     964      137871 :         trace_xfs_buf_get_uncached(bp, _RET_IP_);
     965      137871 :         *bpp = bp;
     966      137871 :         return 0;
     967             : 
     968           0 : fail_free_buf:
     969           0 :         xfs_buf_free(bp);
     970           0 :         return error;
     971             : }
     972             : 
     973             : /*
     974             :  *      Increment reference count on buffer, to hold the buffer concurrently
     975             :  *      with another thread which may release (free) the buffer asynchronously.
     976             :  *      Must hold the buffer already to call this function.
     977             :  */
     978             : void
     979  4287788910 : xfs_buf_hold(
     980             :         struct xfs_buf          *bp)
     981             : {
     982  4287788910 :         trace_xfs_buf_hold(bp, _RET_IP_);
     983  4287953690 :         atomic_inc(&bp->b_hold);
     984  4287978429 : }
     985             : 
     986             : /*
     987             :  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
     988             :  * placed on LRU or freed (depending on b_lru_ref).
     989             :  */
     990             : void
     991 20281985977 : xfs_buf_rele(
     992             :         struct xfs_buf          *bp)
     993             : {
     994 20281985977 :         struct xfs_perag        *pag = bp->b_pag;
     995 20281985977 :         bool                    release;
     996 20281985977 :         bool                    freebuf = false;
     997             : 
     998 20281985977 :         trace_xfs_buf_rele(bp, _RET_IP_);
     999             : 
    1000 20285365676 :         if (!pag) {
    1001     9860204 :                 ASSERT(list_empty(&bp->b_lru));
    1002    19720432 :                 if (atomic_dec_and_test(&bp->b_hold)) {
    1003      137874 :                         xfs_buf_ioacct_dec(bp);
    1004      137874 :                         xfs_buf_free(bp);
    1005             :                 }
    1006     9860228 :                 return;
    1007             :         }
    1008             : 
    1009 20275505472 :         ASSERT(atomic_read(&bp->b_hold) > 0);
    1010             : 
    1011             :         /*
    1012             :          * We grab the b_lock here first to serialise racing xfs_buf_rele()
    1013             :          * calls. The pag_buf_lock being taken on the last reference only
    1014             :          * serialises against racing lookups in xfs_buf_find(). IOWs, the second
    1015             :          * to last reference we drop here is not serialised against the last
    1016             :          * reference until we take bp->b_lock. Hence if we don't grab b_lock
    1017             :          * first, the last "release" reference can win the race to the lock and
    1018             :          * free the buffer before the second-to-last reference is processed,
    1019             :          * leading to a use-after-free scenario.
    1020             :          */
    1021 20275505472 :         spin_lock(&bp->b_lock);
    1022 20291711812 :         release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
    1023 20299740431 :         if (!release) {
    1024             :                 /*
    1025             :                  * Drop the in-flight state if the buffer is already on the LRU
    1026             :                  * and it holds the only reference. This is racy because we
     1027             :  * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT
    1028             :                  * ensures the decrement occurs only once per-buf.
    1029             :                  */
    1030 20218362925 :                 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
    1031  5323041425 :                         __xfs_buf_ioacct_dec(bp);
    1032 20218532202 :                 goto out_unlock;
    1033             :         }
    1034             : 
    1035             :         /* the last reference has been dropped ... */
    1036    81377506 :         __xfs_buf_ioacct_dec(bp);
    1037    81377512 :         if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
    1038             :                 /*
    1039             :                  * If the buffer is added to the LRU take a new reference to the
    1040             :                  * buffer for the LRU and clear the (now stale) dispose list
    1041             :                  * state flag
    1042             :                  */
    1043    31374036 :                 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
    1044    31374220 :                         bp->b_state &= ~XFS_BSTATE_DISPOSE;
    1045    31374220 :                         atomic_inc(&bp->b_hold);
    1046             :                 }
    1047    31374220 :                 spin_unlock(&pag->pag_buf_lock);
    1048             :         } else {
    1049             :                 /*
    1050             :                  * most of the time buffers will already be removed from the
    1051             :                  * LRU, so optimise that case by checking for the
    1052             :                  * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
    1053             :                  * was on was the disposal list
    1054             :                  */
    1055    50003476 :                 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
    1056    23255415 :                         list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
    1057             :                 } else {
    1058    26748061 :                         ASSERT(list_empty(&bp->b_lru));
    1059             :                 }
    1060             : 
    1061    50003504 :                 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1062    50003504 :                 rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
    1063             :                                        xfs_buf_hash_params);
    1064    50003447 :                 spin_unlock(&pag->pag_buf_lock);
    1065    50003489 :                 xfs_perag_put(pag);
    1066    50003489 :                 freebuf = true;
    1067             :         }
    1068             : 
    1069 20299909891 : out_unlock:
    1070 20299909891 :         spin_unlock(&bp->b_lock);
    1071             : 
    1072 20298543846 :         if (freebuf)
    1073    50003491 :                 xfs_buf_free(bp);
    1074             : }
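                      : /*
                      :  * Editor's note: the sketch below is not part of xfs_buf.c.  It is a
                      :  * hedged illustration of the reference-drop pattern described in the
                      :  * comment above, using a hypothetical object type; only
                      :  * atomic_dec_and_lock(), the spinlock calls and kfree() are real
                      :  * kernel interfaces, everything else is assumed for illustration.
                      :  */
                      : #if 0   /* illustrative sketch only */
                      : static void obj_put(struct obj *o)
                      : {
                      :         /* serialise racing puts on the per-object lock first */
                      :         spin_lock(&o->lock);
                      : 
                      :         /* drops to zero => returns true with the cache-wide lock held */
                      :         if (!atomic_dec_and_lock(&o->refcount, &o->cache->lock)) {
                      :                 spin_unlock(&o->lock);
                      :                 return;
                      :         }
                      : 
                      :         remove_from_cache(o);                   /* hypothetical helper */
                      :         spin_unlock(&o->cache->lock);
                      :         spin_unlock(&o->lock);
                      :         kfree(o);
                      : }
                      : #endif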
    1075             : 
    1076             : 
    1077             : /*
    1078             :  *      Lock a buffer object, if it is not already locked.
    1079             :  *
    1080             :  *      If we come across a stale, pinned, locked buffer, we know that we are
    1081             :  *      being asked to lock a buffer that has been reallocated. Because it is
    1082             :  *      pinned, we know that the log has not been pushed to disk and hence it
    1083             :  *      will still be locked.  Rather than continuing to have trylock attempts
    1084             :  *      fail until someone else pushes the log, push it ourselves before
    1085             :  *      returning.  This means that the xfsaild will not get stuck trying
    1086             :  *      to push on stale inode buffers.
    1087             :  */
    1088             : int
    1089  3720321438 : xfs_buf_trylock(
    1090             :         struct xfs_buf          *bp)
    1091             : {
    1092  3720321438 :         int                     locked;
    1093             : 
    1094  3720321438 :         locked = down_trylock(&bp->b_sema) == 0;
    1095  3720381329 :         if (locked)
    1096  3644050350 :                 trace_xfs_buf_trylock(bp, _RET_IP_);
    1097             :         else
    1098    76330979 :                 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
    1099  3720695715 :         return locked;
    1100             : }
    1101             : 
    1102             : /*
    1103             :  *      Lock a buffer object.
    1104             :  *
    1105             :  *      If we come across a stale, pinned, locked buffer, we know that we
    1106             :  *      are being asked to lock a buffer that has been reallocated. Because
    1107             :  *      it is pinned, we know that the log has not been pushed to disk and
    1108             :  *      hence it will still be locked. Rather than sleeping until someone
    1109             :  *      else pushes the log, push it ourselves before trying to get the lock.
    1110             :  */
    1111             : void
    1112 12289594483 : xfs_buf_lock(
    1113             :         struct xfs_buf          *bp)
    1114             : {
    1115 12289594483 :         trace_xfs_buf_lock(bp, _RET_IP_);
    1116             : 
    1117 12286297987 :         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
    1118       22067 :                 xfs_log_force(bp->b_mount, 0);
    1119 12286297987 :         down(&bp->b_sema);
    1120             : 
    1121 12286184056 :         trace_xfs_buf_lock_done(bp, _RET_IP_);
    1122 12290114647 : }
    1123             : 
    1124             : void
    1125 15962777708 : xfs_buf_unlock(
    1126             :         struct xfs_buf          *bp)
    1127             : {
    1128 15962777708 :         ASSERT(xfs_buf_islocked(bp));
    1129             : 
    1130 15962777708 :         up(&bp->b_sema);
    1131 15970193678 :         trace_xfs_buf_unlock(bp, _RET_IP_);
    1132 15974124912 : }
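                      : /*
                      :  * Editor's note: a short hedged usage sketch, not part of xfs_buf.c,
                      :  * showing how callers typically pair the locking interfaces above.  A
                      :  * caller that must not block tries the lock and backs off; a blocking
                      :  * caller simply takes it with xfs_buf_lock().
                      :  */
                      : #if 0   /* illustrative sketch only */
                      :         if (!xfs_buf_trylock(bp)) {
                      :                 /* buffer busy - requeue this work and retry later */
                      :                 return -EAGAIN;
                      :         }
                      :         /* ... modify the locked buffer ... */
                      :         xfs_buf_unlock(bp);
                      : #endif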
    1133             : 
    1134             : STATIC void
    1135    69258299 : xfs_buf_wait_unpin(
    1136             :         struct xfs_buf          *bp)
    1137             : {
    1138    69258299 :         DECLARE_WAITQUEUE       (wait, current);
    1139             : 
    1140    69258299 :         if (atomic_read(&bp->b_pin_count) == 0)
    1141    69255457 :                 return;
    1142             : 
    1143        2842 :         add_wait_queue(&bp->b_waiters, &wait);
    1144        8526 :         for (;;) {
    1145        5684 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1146        5684 :                 if (atomic_read(&bp->b_pin_count) == 0)
    1147             :                         break;
    1148        2842 :                 io_schedule();
    1149             :         }
    1150        2842 :         remove_wait_queue(&bp->b_waiters, &wait);
    1151        2842 :         set_current_state(TASK_RUNNING);
    1152             : }
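                      : /*
                      :  * Editor's note (hedged): the open-coded loop above amounts to "sleep
                      :  * until b_pin_count reaches zero".  A minimal equivalent using the
                      :  * generic wait_event() helper is sketched below; the open-coded form is
                      :  * presumably kept so the sleep goes through io_schedule() and is
                      :  * accounted as I/O wait.
                      :  */
                      : #if 0   /* illustrative sketch only */
                      :         wait_event(bp->b_waiters, atomic_read(&bp->b_pin_count) == 0);
                      : #endif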
    1153             : 
    1154             : static void
    1155        4199 : xfs_buf_ioerror_alert_ratelimited(
    1156             :         struct xfs_buf          *bp)
    1157             : {
    1158        4199 :         static unsigned long    lasttime;
    1159        4199 :         static struct xfs_buftarg *lasttarg;
    1160             : 
    1161        4199 :         if (bp->b_target != lasttarg ||
    1162        4178 :             time_after(jiffies, (lasttime + 5*HZ))) {
    1163          23 :                 lasttime = jiffies;
    1164          23 :                 xfs_buf_ioerror_alert(bp, __this_address);
    1165             :         }
    1166        4199 :         lasttarg = bp->b_target;
    1167        4199 : }
    1168             : 
    1169             : /*
    1170             :  * Account for this latest trip around the retry handler, and decide if
    1171             :  * we've failed enough times to constitute a permanent failure.
    1172             :  */
    1173             : static bool
    1174        3977 : xfs_buf_ioerror_permanent(
    1175             :         struct xfs_buf          *bp,
    1176             :         struct xfs_error_cfg    *cfg)
    1177             : {
    1178        3977 :         struct xfs_mount        *mp = bp->b_mount;
    1179             : 
    1180        3977 :         if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
    1181          16 :             ++bp->b_retries > cfg->max_retries)
    1182             :                 return true;
    1183        3961 :         if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1184           0 :             time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
    1185             :                 return true;
    1186             : 
    1187             :         /* At unmount we may treat errors differently */
    1188        7922 :         if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
    1189           3 :                 return true;
    1190             : 
    1191             :         return false;
    1192             : }
    1193             : 
    1194             : /*
    1195             :  * On a sync write or shutdown we just want to stale the buffer and let the
    1196             :  * caller handle the error in bp->b_error appropriately.
    1197             :  *
    1198             :  * If the write was asynchronous then no one will be looking for the error.  If
    1199             :  * this is the first failure of this type, clear the error state and write the
    1200             :  * buffer out again. This means we always retry an async write failure at least
    1201             :  * once, but we also need to set the buffer up to behave correctly now for
    1202             :  * repeated failures.
    1203             :  *
    1204             :  * If we get repeated async write failures, then we take action according to the
    1205             :  * error configuration we have been set up to use.
    1206             :  *
    1207             :  * Returns true if this function took care of error handling and the caller must
     1208             :  * not touch the buffer again.  Returns false if the caller should proceed with
    1209             :  * normal I/O completion handling.
    1210             :  */
    1211             : static bool
    1212     2186392 : xfs_buf_ioend_handle_error(
    1213             :         struct xfs_buf          *bp)
    1214             : {
    1215     2186392 :         struct xfs_mount        *mp = bp->b_mount;
    1216     2186392 :         struct xfs_error_cfg    *cfg;
    1217             : 
    1218             :         /*
    1219             :          * If we've already shutdown the journal because of I/O errors, there's
    1220             :          * no point in giving this a retry.
    1221             :          */
    1222     4372784 :         if (xlog_is_shutdown(mp->m_log))
    1223     2182193 :                 goto out_stale;
    1224             : 
    1225        4199 :         xfs_buf_ioerror_alert_ratelimited(bp);
    1226             : 
    1227             :         /*
    1228             :          * We're not going to bother about retrying this during recovery.
    1229             :          * One strike!
    1230             :          */
    1231        4199 :         if (bp->b_flags & _XBF_LOGRECOVERY) {
    1232           0 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1233           0 :                 return false;
    1234             :         }
    1235             : 
    1236             :         /*
    1237             :          * Synchronous writes will have callers process the error.
    1238             :          */
    1239        4199 :         if (!(bp->b_flags & XBF_ASYNC))
    1240           0 :                 goto out_stale;
    1241             : 
    1242        4199 :         trace_xfs_buf_iodone_async(bp, _RET_IP_);
    1243             : 
    1244        4199 :         cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
    1245        4199 :         if (bp->b_last_error != bp->b_error ||
    1246        3977 :             !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
    1247         222 :                 bp->b_last_error = bp->b_error;
    1248         222 :                 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1249         158 :                     !bp->b_first_retry_time)
    1250         158 :                         bp->b_first_retry_time = jiffies;
    1251         222 :                 goto resubmit;
    1252             :         }
    1253             : 
    1254             :         /*
    1255             :          * Permanent error - we need to trigger a shutdown if we haven't already
    1256             :          * to indicate that inconsistency will result from this action.
    1257             :          */
    1258        3977 :         if (xfs_buf_ioerror_permanent(bp, cfg)) {
    1259          19 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1260          19 :                 goto out_stale;
    1261             :         }
    1262             : 
    1263             :         /* Still considered a transient error. Caller will schedule retries. */
    1264        3958 :         if (bp->b_flags & _XBF_INODES)
    1265           0 :                 xfs_buf_inode_io_fail(bp);
    1266        3958 :         else if (bp->b_flags & _XBF_DQUOTS)
    1267          42 :                 xfs_buf_dquot_io_fail(bp);
    1268             :         else
    1269        3916 :                 ASSERT(list_empty(&bp->b_li_list));
    1270        3958 :         xfs_buf_ioerror(bp, 0);
    1271        3958 :         xfs_buf_relse(bp);
    1272        3958 :         return true;
    1273             : 
    1274             : resubmit:
    1275         222 :         xfs_buf_ioerror(bp, 0);
    1276         222 :         bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
    1277         222 :         xfs_buf_submit(bp);
    1278         222 :         return true;
    1279     2182212 : out_stale:
    1280     2182212 :         xfs_buf_stale(bp);
    1281     2182212 :         bp->b_flags |= XBF_DONE;
    1282     2182212 :         bp->b_flags &= ~XBF_WRITE;
    1283     2182212 :         trace_xfs_buf_error_relse(bp, _RET_IP_);
    1284     2182212 :         return false;
    1285             : }
    1286             : 
    1287             : static void
    1288    97340985 : xfs_buf_ioend(
    1289             :         struct xfs_buf  *bp)
    1290             : {
    1291    97340985 :         trace_xfs_buf_iodone(bp, _RET_IP_);
    1292             : 
    1293             :         /*
    1294             :          * Pull in IO completion errors now. We are guaranteed to be running
    1295             :          * single threaded, so we don't need the lock to read b_io_error.
    1296             :          */
    1297    97340989 :         if (!bp->b_error && bp->b_io_error)
    1298       57798 :                 xfs_buf_ioerror(bp, bp->b_io_error);
    1299             : 
    1300    97340986 :         if (bp->b_flags & XBF_READ) {
    1301    25900672 :                 if (!bp->b_error && bp->b_ops)
    1302    23755704 :                         bp->b_ops->verify_read(bp);
    1303    25900673 :                 if (!bp->b_error)
    1304    25833102 :                         bp->b_flags |= XBF_DONE;
    1305             :         } else {
    1306    71440314 :                 if (!bp->b_error) {
    1307    69253921 :                         bp->b_flags &= ~XBF_WRITE_FAIL;
    1308    69253921 :                         bp->b_flags |= XBF_DONE;
    1309             :                 }
    1310             : 
    1311    71440314 :                 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
    1312             :                         return;
    1313             : 
    1314             :                 /* clear the retry state */
    1315    71436133 :                 bp->b_last_error = 0;
    1316    71436133 :                 bp->b_retries = 0;
    1317    71436133 :                 bp->b_first_retry_time = 0;
    1318             : 
    1319             :                 /*
    1320             :                  * Note that for things like remote attribute buffers, there may
    1321             :                  * not be a buffer log item here, so processing the buffer log
    1322             :                  * item must remain optional.
    1323             :                  */
    1324    71436133 :                 if (bp->b_log_item)
    1325    38804293 :                         xfs_buf_item_done(bp);
    1326             : 
    1327    71436132 :                 if (bp->b_flags & _XBF_INODES)
    1328    20473874 :                         xfs_buf_inode_iodone(bp);
    1329    50962258 :                 else if (bp->b_flags & _XBF_DQUOTS)
    1330     6914937 :                         xfs_buf_dquot_iodone(bp);
    1331             : 
    1332             :         }
    1333             : 
    1334    97336805 :         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
    1335             :                          _XBF_LOGRECOVERY);
    1336             : 
    1337    97336805 :         if (bp->b_flags & XBF_ASYNC)
    1338    69695946 :                 xfs_buf_relse(bp);
    1339             :         else
    1340    27640859 :                 complete(&bp->b_iowait);
    1341             : }
    1342             : 
    1343             : static void
    1344    95099615 : xfs_buf_ioend_work(
    1345             :         struct work_struct      *work)
    1346             : {
    1347    95099615 :         struct xfs_buf          *bp =
    1348    95099615 :                 container_of(work, struct xfs_buf, b_ioend_work);
    1349             : 
    1350    95099615 :         xfs_buf_ioend(bp);
    1351    95099614 : }
    1352             : 
    1353             : static void
    1354    95099615 : xfs_buf_ioend_async(
    1355             :         struct xfs_buf  *bp)
    1356             : {
    1357    95099615 :         INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
    1358    95099615 :         queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
    1359    95099615 : }
    1360             : 
    1361             : void
    1362    45125403 : __xfs_buf_ioerror(
    1363             :         struct xfs_buf          *bp,
    1364             :         int                     error,
    1365             :         xfs_failaddr_t          failaddr)
    1366             : {
    1367    45125403 :         ASSERT(error <= 0 && error >= -1000);
    1368    45125403 :         bp->b_error = error;
    1369    45125403 :         trace_xfs_buf_ioerror(bp, error, failaddr);
    1370    45126356 : }
    1371             : 
    1372             : void
    1373       26431 : xfs_buf_ioerror_alert(
    1374             :         struct xfs_buf          *bp,
    1375             :         xfs_failaddr_t          func)
    1376             : {
    1377       26431 :         xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
    1378             :                 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
    1379             :                                   func, (uint64_t)xfs_buf_daddr(bp),
    1380       26431 :                                   bp->b_length, -bp->b_error);
    1381       26466 : }
    1382             : 
    1383             : /*
    1384             :  * To simulate an I/O failure, the buffer must be locked and held with at least
    1385             :  * three references. The LRU reference is dropped by the stale call. The buf
    1386             :  * item reference is dropped via ioend processing. The third reference is owned
    1387             :  * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
    1388             :  */
    1389             : void
    1390     2192801 : xfs_buf_ioend_fail(
    1391             :         struct xfs_buf  *bp)
    1392             : {
    1393     2192801 :         bp->b_flags &= ~XBF_DONE;
    1394     2192801 :         xfs_buf_stale(bp);
    1395     2192800 :         xfs_buf_ioerror(bp, -EIO);
    1396     2192800 :         xfs_buf_ioend(bp);
    1397     2192801 : }
    1398             : 
    1399             : int
    1400         562 : xfs_bwrite(
    1401             :         struct xfs_buf          *bp)
    1402             : {
    1403         562 :         int                     error;
    1404             : 
    1405         562 :         ASSERT(xfs_buf_islocked(bp));
    1406             : 
    1407         562 :         bp->b_flags |= XBF_WRITE;
    1408         562 :         bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
    1409             :                          XBF_DONE);
    1410             : 
    1411         562 :         error = xfs_buf_submit(bp);
    1412         562 :         if (error)
    1413           0 :                 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
    1414         562 :         return error;
     1415             : }
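                      : /*
                      :  * Editor's note: a hedged caller-side sketch, not part of xfs_buf.c, of
                      :  * a synchronous write through xfs_bwrite().  The buffer must already be
                      :  * locked; the caller keeps its lock and reference across the call and
                      :  * releases both with xfs_buf_relse().  Error handling is illustrative.
                      :  */
                      : #if 0   /* illustrative sketch only */
                      :         int     error;
                      : 
                      :         xfs_buf_lock(bp);
                      :         /* ... update the metadata in bp->b_addr ... */
                      :         error = xfs_bwrite(bp);
                      :         xfs_buf_relse(bp);
                      :         if (error)
                      :                 return error;
                      : #endif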
    1416             : 
    1417             : static void
    1418    95148260 : xfs_buf_bio_end_io(
    1419             :         struct bio              *bio)
    1420             : {
    1421    95148260 :         struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
    1422             : 
    1423    95148260 :         if (!bio->bi_status &&
    1424   146051795 :             (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
    1425    50961333 :             XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
    1426           0 :                 bio->bi_status = BLK_STS_IOERR;
    1427             : 
    1428             :         /*
    1429             :          * don't overwrite existing errors - otherwise we can lose errors on
    1430             :          * buffers that require multiple bios to complete.
    1431             :          */
    1432    95148260 :         if (bio->bi_status) {
    1433       57798 :                 int error = blk_status_to_errno(bio->bi_status);
    1434             : 
    1435       57796 :                 cmpxchg(&bp->b_io_error, 0, error);
    1436             :         }
    1437             : 
    1438    95148258 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1439             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1440             : 
    1441   190296517 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1442    95073275 :                 xfs_buf_ioend_async(bp);
    1443    95148259 :         bio_put(bio);
    1444    95148259 : }
    1445             : 
    1446             : static void
    1447    95147891 : xfs_buf_ioapply_map(
    1448             :         struct xfs_buf  *bp,
    1449             :         int             map,
    1450             :         int             *buf_offset,
    1451             :         int             *count,
    1452             :         blk_opf_t       op)
    1453             : {
    1454    95147891 :         int             page_index;
    1455    95147891 :         unsigned int    total_nr_pages = bp->b_page_count;
    1456    95147891 :         int             nr_pages;
    1457    95147891 :         struct bio      *bio;
    1458    95147891 :         sector_t        sector =  bp->b_maps[map].bm_bn;
    1459    95147891 :         int             size;
    1460    95147891 :         int             offset;
    1461             : 
    1462             :         /* skip the pages in the buffer before the start offset */
    1463    95147891 :         page_index = 0;
    1464    95147891 :         offset = *buf_offset;
    1465    95147891 :         while (offset >= PAGE_SIZE) {
    1466           0 :                 page_index++;
    1467           0 :                 offset -= PAGE_SIZE;
    1468             :         }
    1469             : 
    1470             :         /*
    1471             :          * Limit the IO size to the length of the current vector, and update the
    1472             :          * remaining IO count for the next time around.
    1473             :          */
    1474    95147891 :         size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
    1475    95147891 :         *count -= size;
    1476    95147891 :         *buf_offset += size;
    1477             : 
    1478    95147891 : next_chunk:
    1479    95147891 :         atomic_inc(&bp->b_io_remaining);
    1480    95147974 :         nr_pages = bio_max_segs(total_nr_pages);
    1481             : 
    1482    95147974 :         bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
    1483    95147773 :         bio->bi_iter.bi_sector = sector;
    1484    95147773 :         bio->bi_end_io = xfs_buf_bio_end_io;
    1485    95147773 :         bio->bi_private = bp;
    1486             : 
    1487   190295416 :         for (; size && nr_pages; nr_pages--, page_index++) {
    1488    95147807 :                 int     rbytes, nbytes = PAGE_SIZE - offset;
    1489             : 
    1490    95147807 :                 if (nbytes > size)
    1491             :                         nbytes = size;
    1492             : 
    1493    95147807 :                 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
    1494             :                                       offset);
    1495    95147643 :                 if (rbytes < nbytes)
    1496             :                         break;
    1497             : 
    1498    95147643 :                 offset = 0;
    1499    95147643 :                 sector += BTOBB(nbytes);
    1500    95147643 :                 size -= nbytes;
    1501    95147643 :                 total_nr_pages--;
    1502             :         }
    1503             : 
    1504    95147609 :         if (likely(bio->bi_iter.bi_size)) {
    1505    95147609 :                 if (xfs_buf_is_vmapped(bp)) {
    1506             :                         flush_kernel_vmap_range(bp->b_addr,
    1507             :                                                 xfs_buf_vmap_len(bp));
    1508             :                 }
    1509    95147609 :                 submit_bio(bio);
    1510    95147576 :                 if (size)
    1511           0 :                         goto next_chunk;
    1512             :         } else {
    1513             :                 /*
    1514             :                  * This is guaranteed not to be the last io reference count
    1515             :                  * because the caller (xfs_buf_submit) holds a count itself.
    1516             :                  */
    1517           0 :                 atomic_dec(&bp->b_io_remaining);
    1518           0 :                 xfs_buf_ioerror(bp, -EIO);
    1519           0 :                 bio_put(bio);
    1520             :         }
    1521             : 
    1522    95147576 : }
    1523             : 
    1524             : STATIC void
    1525    95147884 : _xfs_buf_ioapply(
    1526             :         struct xfs_buf  *bp)
    1527             : {
    1528    95147884 :         struct blk_plug plug;
    1529    95147884 :         blk_opf_t       op;
    1530    95147884 :         int             offset;
    1531    95147884 :         int             size;
    1532    95147884 :         int             i;
    1533             : 
    1534             :         /*
    1535             :          * Make sure we capture only current IO errors rather than stale errors
    1536             :          * left over from previous use of the buffer (e.g. failed readahead).
    1537             :          */
    1538    95147884 :         bp->b_error = 0;
    1539             : 
    1540    95147884 :         if (bp->b_flags & XBF_WRITE) {
    1541    69258300 :                 op = REQ_OP_WRITE;
    1542             : 
    1543             :                 /*
    1544             :                  * Run the write verifier callback function if it exists. If
    1545             :                  * this function fails it will mark the buffer with an error and
    1546             :                  * the IO should not be dispatched.
    1547             :                  */
    1548    69258300 :                 if (bp->b_ops) {
    1549    69258300 :                         bp->b_ops->verify_write(bp);
    1550    69258299 :                         if (bp->b_error) {
    1551           4 :                                 xfs_force_shutdown(bp->b_mount,
    1552             :                                                    SHUTDOWN_CORRUPT_INCORE);
    1553           4 :                                 return;
    1554             :                         }
    1555           0 :                 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
    1556           0 :                         struct xfs_mount *mp = bp->b_mount;
    1557             : 
    1558             :                         /*
    1559             :                          * non-crc filesystems don't attach verifiers during
    1560             :                          * log recovery, so don't warn for such filesystems.
    1561             :                          */
    1562           0 :                         if (xfs_has_crc(mp)) {
    1563           0 :                                 xfs_warn(mp,
    1564             :                                         "%s: no buf ops on daddr 0x%llx len %d",
    1565             :                                         __func__, xfs_buf_daddr(bp),
    1566             :                                         bp->b_length);
    1567           0 :                                 xfs_hex_dump(bp->b_addr,
    1568             :                                                 XFS_CORRUPTION_DUMP_LEN);
    1569           0 :                                 dump_stack();
    1570             :                         }
    1571             :                 }
    1572             :         } else {
    1573    25889584 :                 op = REQ_OP_READ;
    1574    25889584 :                 if (bp->b_flags & XBF_READ_AHEAD)
    1575    16552264 :                         op |= REQ_RAHEAD;
    1576             :         }
    1577             : 
    1578             :         /* we only use the buffer cache for meta-data */
    1579    95147879 :         op |= REQ_META;
    1580             : 
    1581             :         /*
    1582             :          * Walk all the vectors issuing IO on them. Set up the initial offset
    1583             :          * into the buffer and the desired IO size before we start -
    1584             :          * _xfs_buf_ioapply_vec() will modify them appropriately for each
     1585             :          * xfs_buf_ioapply_map() will modify them appropriately for each
    1586             :          */
    1587    95147879 :         offset = bp->b_offset;
    1588    95147879 :         size = BBTOB(bp->b_length);
    1589    95147879 :         blk_start_plug(&plug);
    1590   190295798 :         for (i = 0; i < bp->b_map_count; i++) {
    1591    95147919 :                 xfs_buf_ioapply_map(bp, i, &offset, &size, op);
    1592    95147597 :                 if (bp->b_error)
    1593             :                         break;
    1594    95147597 :                 if (size <= 0)
    1595             :                         break;  /* all done */
    1596             :         }
    1597    95147521 :         blk_finish_plug(&plug);
    1598             : }
    1599             : 
    1600             : /*
    1601             :  * Wait for I/O completion of a sync buffer and return the I/O error code.
    1602             :  */
    1603             : static int
    1604    27630119 : xfs_buf_iowait(
    1605             :         struct xfs_buf  *bp)
    1606             : {
    1607    27630119 :         ASSERT(!(bp->b_flags & XBF_ASYNC));
    1608             : 
    1609    27630119 :         trace_xfs_buf_iowait(bp, _RET_IP_);
    1610    27630145 :         wait_for_completion(&bp->b_iowait);
    1611    27629972 :         trace_xfs_buf_iowait_done(bp, _RET_IP_);
    1612             : 
    1613    27629932 :         return bp->b_error;
    1614             : }
    1615             : 
    1616             : /*
    1617             :  * Buffer I/O submission path, read or write. Asynchronous submission transfers
    1618             :  * the buffer lock ownership and the current reference to the IO. It is not
    1619             :  * safe to reference the buffer after a call to this function unless the caller
    1620             :  * holds an additional reference itself.
    1621             :  */
    1622             : static int
    1623    96465191 : __xfs_buf_submit(
    1624             :         struct xfs_buf  *bp,
    1625             :         bool            wait)
    1626             : {
    1627    96465191 :         int             error = 0;
    1628             : 
    1629    96465191 :         trace_xfs_buf_submit(bp, _RET_IP_);
    1630             : 
    1631    96465195 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1632             : 
    1633             :         /*
    1634             :          * On log shutdown we stale and complete the buffer immediately. We can
    1635             :          * be called to read the superblock before the log has been set up, so
    1636             :          * be careful checking the log state.
    1637             :          *
    1638             :          * Checking the mount shutdown state here can result in the log tail
    1639             :          * moving inappropriately on disk as the log may not yet be shut down.
    1640             :          * i.e. failing this buffer on mount shutdown can remove it from the AIL
    1641             :          * and move the tail of the log forwards without having written this
    1642             :          * buffer to disk. This corrupts the log tail state in memory, and
    1643             :          * because the log may not be shut down yet, it can then be propagated
    1644             :          * to disk before the log is shutdown. Hence we check log shutdown
    1645             :          * state here rather than mount state to avoid corrupting the log tail
    1646             :          * on shutdown.
    1647             :          */
    1648   192862667 :         if (bp->b_mount->m_log &&
    1649             :             xlog_is_shutdown(bp->b_mount->m_log)) {
    1650     1317473 :                 xfs_buf_ioend_fail(bp);
    1651     1317473 :                 return -EIO;
    1652             :         }
    1653             : 
    1654             :         /*
    1655             :          * Grab a reference so the buffer does not go away underneath us. For
     1656             :          * async buffers, I/O completion drops the caller's reference, which
    1657             :          * could occur before submission returns.
    1658             :          */
    1659    95147722 :         xfs_buf_hold(bp);
    1660             : 
    1661    95147763 :         if (bp->b_flags & XBF_WRITE)
    1662    69258296 :                 xfs_buf_wait_unpin(bp);
    1663             : 
    1664             :         /* clear the internal error state to avoid spurious errors */
    1665    95147768 :         bp->b_io_error = 0;
    1666             : 
    1667             :         /*
     1668             :          * Set the count to 1 initially; this will stop an I/O completion
     1669             :          * callout that happens before we have started all the I/O from calling
    1670             :          * xfs_buf_ioend too early.
    1671             :          */
    1672    95147768 :         atomic_set(&bp->b_io_remaining, 1);
    1673    95147768 :         if (bp->b_flags & XBF_ASYNC)
    1674    67517872 :                 xfs_buf_ioacct_inc(bp);
    1675    95147769 :         _xfs_buf_ioapply(bp);
    1676             : 
    1677             :         /*
    1678             :          * If _xfs_buf_ioapply failed, we can get back here with only the IO
    1679             :          * reference we took above. If we drop it to zero, run completion so
    1680             :          * that we don't return to the caller with completion still pending.
    1681             :          */
    1682   190295723 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
    1683       74911 :                 if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
    1684       48571 :                         xfs_buf_ioend(bp);
    1685             :                 else
    1686       26340 :                         xfs_buf_ioend_async(bp);
    1687             :         }
    1688             : 
    1689    95147912 :         if (wait)
    1690     9337979 :                 error = xfs_buf_iowait(bp);
    1691             : 
    1692             :         /*
    1693             :          * Release the hold that keeps the buffer referenced for the entire
    1694             :          * I/O. Note that if the buffer is async, it is not safe to reference
    1695             :          * after this release.
    1696             :          */
    1697    95147655 :         xfs_buf_rele(bp);
    1698    95147655 :         return error;
    1699             : }
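                      : /*
                      :  * Editor's note: a hedged sketch, with hypothetical names, of the
                      :  * completion counting used above.  The submitter biases the in-flight
                      :  * count by one so per-chunk completions cannot finish the I/O before
                      :  * every chunk has been issued; the submitter's own decrement then
                      :  * completes the I/O only if it turns out to be the last.
                      :  */
                      : #if 0   /* illustrative sketch only */
                      :         atomic_set(&io->remaining, 1);          /* submitter's bias */
                      :         for (i = 0; i < io->nr_chunks; i++) {
                      :                 atomic_inc(&io->remaining);
                      :                 submit_chunk(io, i);            /* completion decrements */
                      :         }
                      :         if (atomic_dec_and_test(&io->remaining))
                      :                 io_complete(io);                /* no chunk beat us to it */
                      : #endif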
    1700             : 
    1701             : void *
    1702 12501905636 : xfs_buf_offset(
    1703             :         struct xfs_buf          *bp,
    1704             :         size_t                  offset)
    1705             : {
    1706 12501905636 :         struct page             *page;
    1707             : 
    1708 12501905636 :         if (bp->b_addr)
    1709 12501905636 :                 return bp->b_addr + offset;
    1710             : 
    1711           0 :         page = bp->b_pages[offset >> PAGE_SHIFT];
    1712           0 :         return page_address(page) + (offset & (PAGE_SIZE-1));
    1713             : }
    1714             : 
    1715             : void
    1716      836874 : xfs_buf_zero(
    1717             :         struct xfs_buf          *bp,
    1718             :         size_t                  boff,
    1719             :         size_t                  bsize)
    1720             : {
    1721      836874 :         size_t                  bend;
    1722             : 
    1723      836874 :         bend = boff + bsize;
    1724     1673736 :         while (boff < bend) {
    1725      836874 :                 struct page     *page;
    1726      836874 :                 int             page_index, page_offset, csize;
    1727             : 
    1728      836874 :                 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
    1729      836874 :                 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
    1730      836874 :                 page = bp->b_pages[page_index];
    1731      836874 :                 csize = min_t(size_t, PAGE_SIZE - page_offset,
    1732             :                                       BBTOB(bp->b_length) - boff);
    1733             : 
    1734      836874 :                 ASSERT((csize + page_offset) <= PAGE_SIZE);
    1735             : 
    1736      836862 :                 memset(page_address(page) + page_offset, 0, csize);
    1737             : 
    1738      836862 :                 boff += csize;
    1739             :         }
    1740      836862 : }
    1741             : 
    1742             : /*
    1743             :  * Log a message about and stale a buffer that a caller has decided is corrupt.
    1744             :  *
    1745             :  * This function should be called for the kinds of metadata corruption that
     1746             :  * cannot be detected by a verifier, such as incorrect inter-block relationship
    1747             :  * data.  Do /not/ call this function from a verifier function.
    1748             :  *
    1749             :  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
    1750             :  * be marked stale, but b_error will not be set.  The caller is responsible for
    1751             :  * releasing the buffer or fixing it.
    1752             :  */
    1753             : void
    1754           0 : __xfs_buf_mark_corrupt(
    1755             :         struct xfs_buf          *bp,
    1756             :         xfs_failaddr_t          fa)
    1757             : {
    1758           0 :         ASSERT(bp->b_flags & XBF_DONE);
    1759             : 
    1760           0 :         xfs_buf_corruption_error(bp, fa);
    1761           0 :         xfs_buf_stale(bp);
    1762           0 : }
    1763             : 
    1764             : /*
    1765             :  *      Handling of buffer targets (buftargs).
    1766             :  */
    1767             : 
    1768             : /*
    1769             :  * Wait for any bufs with callbacks that have been submitted but have not yet
    1770             :  * returned. These buffers will have an elevated hold count, so wait on those
    1771             :  * while freeing all the buffers only held by the LRU.
    1772             :  */
    1773             : static enum lru_status
    1774    18842747 : xfs_buftarg_drain_rele(
    1775             :         struct list_head        *item,
    1776             :         struct list_lru_one     *lru,
    1777             :         spinlock_t              *lru_lock,
    1778             :         void                    *arg)
    1779             : 
    1780             : {
    1781    18842747 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1782    18842747 :         struct list_head        *dispose = arg;
    1783             : 
    1784    18842747 :         if (atomic_read(&bp->b_hold) > 1) {
    1785             :                 /* need to wait, so skip it this pass */
    1786           0 :                 trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
    1787           0 :                 return LRU_SKIP;
    1788             :         }
    1789    18842747 :         if (!spin_trylock(&bp->b_lock))
    1790             :                 return LRU_SKIP;
    1791             : 
    1792             :         /*
    1793             :          * clear the LRU reference count so the buffer doesn't get
    1794             :          * ignored in xfs_buf_rele().
    1795             :          */
    1796    18842747 :         atomic_set(&bp->b_lru_ref, 0);
    1797    18842747 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1798    18842747 :         list_lru_isolate_move(lru, item, dispose);
    1799    18842747 :         spin_unlock(&bp->b_lock);
    1800    18842747 :         return LRU_REMOVED;
    1801             : }
    1802             : 
    1803             : /*
    1804             :  * Wait for outstanding I/O on the buftarg to complete.
    1805             :  */
    1806             : void
    1807      115997 : xfs_buftarg_wait(
    1808             :         struct xfs_buftarg      *btp)
    1809             : {
    1810             :         /*
    1811             :          * First wait on the buftarg I/O count for all in-flight buffers to be
    1812             :          * released. This is critical as new buffers do not make the LRU until
    1813             :          * they are released.
    1814             :          *
    1815             :          * Next, flush the buffer workqueue to ensure all completion processing
    1816             :          * has finished. Just waiting on buffer locks is not sufficient for
    1817             :          * async IO as the reference count held over IO is not released until
    1818             :          * after the buffer lock is dropped. Hence we need to ensure here that
    1819             :          * all reference counts have been dropped before we start walking the
    1820             :          * LRU list.
    1821             :          */
    1822      119941 :         while (percpu_counter_sum(&btp->bt_io_count))
    1823        3944 :                 delay(100);
    1824      115997 :         flush_workqueue(btp->bt_mount->m_buf_workqueue);
    1825      115997 : }
    1826             : 
    1827             : void
    1828       44993 : xfs_buftarg_drain(
    1829             :         struct xfs_buftarg      *btp)
    1830             : {
    1831       44993 :         LIST_HEAD(dispose);
    1832       44993 :         int                     loop = 0;
    1833       44993 :         bool                    write_fail = false;
    1834             : 
    1835       44993 :         xfs_buftarg_wait(btp);
    1836             : 
    1837             :         /* loop until there is nothing left on the lru list. */
    1838       87787 :         while (list_lru_count(&btp->bt_lru)) {
    1839       42794 :                 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
    1840             :                               &dispose, LONG_MAX);
    1841             : 
    1842    18885541 :                 while (!list_empty(&dispose)) {
    1843    18842747 :                         struct xfs_buf *bp;
    1844    18842747 :                         bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1845    18842747 :                         list_del_init(&bp->b_lru);
    1846    18842747 :                         if (bp->b_flags & XBF_WRITE_FAIL) {
    1847           0 :                                 write_fail = true;
    1848           0 :                                 xfs_buf_alert_ratelimited(bp,
    1849             :                                         "XFS: Corruption Alert",
    1850             : "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
    1851             :                                         (long long)xfs_buf_daddr(bp));
    1852             :                         }
    1853    18842747 :                         xfs_buf_rele(bp);
    1854             :                 }
    1855       42794 :                 if (loop++ != 0)
    1856           0 :                         delay(100);
    1857             :         }
    1858             : 
    1859             :         /*
    1860             :          * If one or more failed buffers were freed, that means dirty metadata
    1861             :          * was thrown away. This should only ever happen after I/O completion
     1862             :  * handling has elevated I/O error(s) to permanent failures and shut
    1863             :          * down the journal.
    1864             :          */
    1865       44993 :         if (write_fail) {
    1866           0 :                 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
    1867           0 :                 xfs_alert(btp->bt_mount,
    1868             :               "Please run xfs_repair to determine the extent of the problem.");
    1869             :         }
    1870       44993 : }
    1871             : 
    1872             : static enum lru_status
    1873    25402059 : xfs_buftarg_isolate(
    1874             :         struct list_head        *item,
    1875             :         struct list_lru_one     *lru,
    1876             :         spinlock_t              *lru_lock,
    1877             :         void                    *arg)
    1878             : {
    1879    25402059 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1880    25402059 :         struct list_head        *dispose = arg;
    1881             : 
    1882             :         /*
    1883             :          * we are inverting the lru lock/bp->b_lock here, so use a trylock.
    1884             :          * If we fail to get the lock, just skip it.
    1885             :          */
    1886    25402059 :         if (!spin_trylock(&bp->b_lock))
    1887             :                 return LRU_SKIP;
    1888             :         /*
    1889             :          * Decrement the b_lru_ref count unless the value is already
    1890             :          * zero. If the value is already zero, we need to reclaim the
    1891             :          * buffer, otherwise it gets another trip through the LRU.
    1892             :          */
    1893    50804028 :         if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
    1894    17196066 :                 spin_unlock(&bp->b_lock);
    1895    17196066 :                 return LRU_ROTATE;
    1896             :         }
    1897             : 
    1898     8205948 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1899     8205948 :         list_lru_isolate_move(lru, item, dispose);
    1900     8205948 :         spin_unlock(&bp->b_lock);
    1901     8205948 :         return LRU_REMOVED;
    1902             : }
    1903             : 
    1904             : static unsigned long
    1905      200923 : xfs_buftarg_shrink_scan(
    1906             :         struct shrinker         *shrink,
    1907             :         struct shrink_control   *sc)
    1908             : {
    1909      200923 :         struct xfs_buftarg      *btp = container_of(shrink,
    1910             :                                         struct xfs_buftarg, bt_shrinker);
    1911      200923 :         LIST_HEAD(dispose);
    1912      200923 :         unsigned long           freed;
    1913             : 
    1914      200923 :         freed = list_lru_shrink_walk(&btp->bt_lru, sc,
    1915             :                                      xfs_buftarg_isolate, &dispose);
    1916             : 
    1917     8406871 :         while (!list_empty(&dispose)) {
    1918     8205948 :                 struct xfs_buf *bp;
    1919     8205948 :                 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1920     8205948 :                 list_del_init(&bp->b_lru);
    1921     8205948 :                 xfs_buf_rele(bp);
    1922             :         }
    1923             : 
    1924      200923 :         return freed;
    1925             : }
    1926             : 
    1927             : static unsigned long
    1928        5405 : xfs_buftarg_shrink_count(
    1929             :         struct shrinker         *shrink,
    1930             :         struct shrink_control   *sc)
    1931             : {
    1932        5405 :         struct xfs_buftarg      *btp = container_of(shrink,
    1933             :                                         struct xfs_buftarg, bt_shrinker);
    1934        5405 :         return list_lru_shrink_count(&btp->bt_lru, sc);
    1935             : }
    1936             : 
    1937             : void
    1938       22707 : xfs_free_buftarg(
    1939             :         struct xfs_buftarg      *btp)
    1940             : {
    1941       22707 :         unregister_shrinker(&btp->bt_shrinker);
    1942       22707 :         ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
    1943       22707 :         percpu_counter_destroy(&btp->bt_io_count);
    1944       22707 :         list_lru_destroy(&btp->bt_lru);
    1945             : 
    1946       22707 :         blkdev_issue_flush(btp->bt_bdev);
    1947       22707 :         invalidate_bdev(btp->bt_bdev);
    1948       22707 :         fs_put_dax(btp->bt_daxdev, btp->bt_mount);
    1949             : 
    1950       22707 :         kmem_free(btp);
    1951       22707 : }
    1952             : 
    1953             : int
    1954       45376 : xfs_setsize_buftarg(
    1955             :         xfs_buftarg_t           *btp,
    1956             :         unsigned int            sectorsize)
    1957             : {
    1958             :         /* Set up metadata sector size info */
    1959       45376 :         btp->bt_meta_sectorsize = sectorsize;
    1960       45376 :         btp->bt_meta_sectormask = sectorsize - 1;
    1961             : 
    1962       45376 :         if (set_blocksize(btp->bt_bdev, sectorsize)) {
    1963           0 :                 xfs_warn(btp->bt_mount,
    1964             :                         "Cannot set_blocksize to %u on device %pg",
    1965             :                         sectorsize, btp->bt_bdev);
    1966           0 :                 return -EINVAL;
    1967             :         }
    1968             : 
    1969             :         /* Set up device logical sector size mask */
    1970       45376 :         btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
    1971       45376 :         btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
    1972             : 
    1973       45376 :         return 0;
    1974             : }
    1975             : 
    1976             : /*
    1977             :  * When allocating the initial buffer target we have not yet
    1978             :  * read in the superblock, so don't know what sized sectors
    1979             :  * are being used at this early stage.  Play safe.
    1980             :  */
    1981             : STATIC int
    1982       22704 : xfs_setsize_buftarg_early(
    1983             :         xfs_buftarg_t           *btp,
    1984             :         struct block_device     *bdev)
    1985             : {
    1986       45408 :         return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
    1987             : }
    1988             : 
    1989             : struct xfs_buftarg *
    1990       22704 : xfs_alloc_buftarg(
    1991             :         struct xfs_mount        *mp,
    1992             :         struct block_device     *bdev)
    1993             : {
    1994       22704 :         xfs_buftarg_t           *btp;
    1995       22704 :         const struct dax_holder_operations *ops = NULL;
    1996             : 
    1997             : #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
    1998             :         ops = &xfs_dax_holder_operations;
    1999             : #endif
    2000       22704 :         btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
    2001             : 
    2002       22704 :         btp->bt_mount = mp;
    2003       22704 :         btp->bt_dev =  bdev->bd_dev;
    2004       22704 :         btp->bt_bdev = bdev;
    2005       22704 :         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
    2006             :                                             mp, ops);
    2007             : 
    2008             :         /*
    2009             :          * Buffer IO error rate limiting. Limit it to no more than 10 messages
    2010             :          * per 30 seconds so as to not spam logs too much on repeated errors.
    2011             :          */
    2012       22704 :         ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
    2013             :                              DEFAULT_RATELIMIT_BURST);
    2014             : 
    2015       22704 :         if (xfs_setsize_buftarg_early(btp, bdev))
    2016           0 :                 goto error_free;
    2017             : 
    2018       22704 :         if (list_lru_init(&btp->bt_lru))
    2019           0 :                 goto error_free;
    2020             : 
    2021       22704 :         if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
    2022           0 :                 goto error_lru;
    2023             : 
    2024       22704 :         btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
    2025       22704 :         btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
    2026       22704 :         btp->bt_shrinker.seeks = DEFAULT_SEEKS;
    2027       22704 :         btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
    2028       22704 :         if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
    2029       22704 :                               mp->m_super->s_id))
    2030           0 :                 goto error_pcpu;
    2031             :         return btp;
    2032             : 
    2033             : error_pcpu:
    2034           0 :         percpu_counter_destroy(&btp->bt_io_count);
    2035           0 : error_lru:
    2036           0 :         list_lru_destroy(&btp->bt_lru);
    2037           0 : error_free:
    2038           0 :         kmem_free(btp);
    2039           0 :         return NULL;
    2040             : }
    2041             : 
    2042             : /*
    2043             :  * Cancel a delayed write list.
    2044             :  *
    2045             :  * Remove each buffer from the list, clear the delwri queue flag and drop the
    2046             :  * associated buffer reference.
    2047             :  */
    2048             : void
    2049       25391 : xfs_buf_delwri_cancel(
    2050             :         struct list_head        *list)
    2051             : {
    2052       25391 :         struct xfs_buf          *bp;
    2053             : 
    2054       25391 :         while (!list_empty(list)) {
    2055           0 :                 bp = list_first_entry(list, struct xfs_buf, b_list);
    2056             : 
    2057           0 :                 xfs_buf_lock(bp);
    2058           0 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2059           0 :                 list_del_init(&bp->b_list);
    2060           0 :                 xfs_buf_relse(bp);
    2061             :         }
    2062       25391 : }
    2063             : 
    2064             : /*
    2065             :  * Add a buffer to the delayed write list.
    2066             :  *
     2067             :  * This queues a buffer for writeout if it hasn't already been queued.  Note that
    2068             :  * neither this routine nor the buffer list submission functions perform
    2069             :  * any internal synchronization.  It is expected that the lists are thread-local
    2070             :  * to the callers.
    2071             :  *
    2072             :  * Returns true if we queued up the buffer, or false if it already had
    2073             :  * been on the buffer list.
    2074             :  */
    2075             : bool
    2076    78514914 : xfs_buf_delwri_queue(
    2077             :         struct xfs_buf          *bp,
    2078             :         struct list_head        *list)
    2079             : {
    2080    78514914 :         ASSERT(xfs_buf_islocked(bp));
    2081    78514914 :         ASSERT(!(bp->b_flags & XBF_READ));
    2082             : 
    2083             :         /*
    2084             :          * If the buffer is already marked delwri it is already queued up
    2085             :          * by someone else for immediate writeout.  Just ignore it in that
    2086             :          * case.
    2087             :          */
    2088    78514914 :         if (bp->b_flags & _XBF_DELWRI_Q) {
    2089     7928215 :                 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
    2090     7928215 :                 return false;
    2091             :         }
    2092             : 
    2093    70586699 :         trace_xfs_buf_delwri_queue(bp, _RET_IP_);
    2094             : 
    2095             :         /*
    2096             :          * If a buffer gets written out synchronously or marked stale while it
    2097             :          * is on a delwri list we lazily remove it. To do this, the other party
    2098             :          * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
    2099             :          * It remains referenced and on the list.  In a rare corner case it
    2100             :          * might get re-added to a delwri list after the synchronous writeout,
    2101             :          * in which case we just need to re-add the flag here.
    2102             :          */
    2103    70586693 :         bp->b_flags |= _XBF_DELWRI_Q;
    2104    70586693 :         if (list_empty(&bp->b_list)) {
    2105    70586684 :                 atomic_inc(&bp->b_hold);
    2106    70586652 :                 list_add_tail(&bp->b_list, list);
    2107             :         }
    2108             : 
    2109             :         return true;
    2110             : }
    2111             : 
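/*
 * Illustrative sketch, not part of xfs_buf.c: the typical caller pattern for
 * the delwri queue API above.  The helper name and the way buffers arrive are
 * hypothetical; only the list handling follows the documented contract -- the
 * caller owns a thread-local list head, queues locked buffers onto it, and
 * later submits or cancels the whole list.
 */
static int example_write_batch(struct xfs_buf **bufs, int nbufs)
{
        LIST_HEAD               (buffer_list); /* thread-local delwri queue */
        int                     i;

        for (i = 0; i < nbufs; i++) {
                xfs_buf_lock(bufs[i]);
                /* takes its own hold; false only means "already queued" */
                xfs_buf_delwri_queue(bufs[i], &buffer_list);
                xfs_buf_unlock(bufs[i]);
        }

        /*
         * Write the whole batch out and wait for completion; the list is
         * consumed.  A caller that changes its mind would instead call
         * xfs_buf_delwri_cancel(&buffer_list) to drop the queue references.
         */
        return xfs_buf_delwri_submit(&buffer_list);
}
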
    2112             : /*
    2113             :  * The compare function is more complex than it needs to be because
    2114             :  * the return value is only 32 bits wide and we are doing comparisons
    2115             :  * on 64-bit values.
    2116             :  */
    2117             : static int
    2118   580201283 : xfs_buf_cmp(
    2119             :         void                    *priv,
    2120             :         const struct list_head  *a,
    2121             :         const struct list_head  *b)
    2122             : {
    2123   580201283 :         struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
    2124   580201283 :         struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
    2125   580201283 :         xfs_daddr_t             diff;
    2126             : 
    2127   580201283 :         diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
    2128   580201283 :         if (diff < 0)
    2129             :                 return -1;
    2130   287613590 :         if (diff > 0)
    2131   287613095 :                 return 1;
    2132             :         return 0;
    2133             : }
    2134             : 
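/*
 * Illustrative counterexample, not part of xfs_buf.c: why xfs_buf_cmp() uses
 * explicit sign checks instead of returning the difference directly.  The
 * block numbers being compared are 64 bits wide, while a list_sort()
 * comparator returns an int, so a naive comparator like the hypothetical one
 * below mis-orders buffers whose block numbers differ by a multiple of 2^32.
 */
static int example_broken_cmp(const struct xfs_buf *ap, const struct xfs_buf *bp)
{
        /* e.g. 0x100000000 - 0 truncates to 0 and falsely reports "equal" */
        return (int)(ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn);
}
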
    2135             : /*
    2136             :  * Submit buffers for write. If wait_list is specified, the buffers are
    2137             :  * submitted using sync I/O and placed on the wait list such that the caller can
    2138             :  * iowait each buffer. Otherwise async I/O is used and the buffers are released
    2139             :  * at I/O completion time. In either case, each buffer remains locked until I/O
    2140             :  * completes and it is released from the queue.
    2141             :  */
    2142             : static int
    2143     2737539 : xfs_buf_delwri_submit_buffers(
    2144             :         struct list_head        *buffer_list,
    2145             :         struct list_head        *wait_list)
    2146             : {
    2147     2737539 :         struct xfs_buf          *bp, *n;
    2148     2737539 :         int                     pinned = 0;
    2149     2737539 :         struct blk_plug         plug;
    2150             : 
    2151     2737539 :         list_sort(NULL, buffer_list, xfs_buf_cmp);
    2152             : 
    2153     2737532 :         blk_start_plug(&plug);
    2154    74095724 :         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
    2155    71358199 :                 if (!wait_list) {
    2156    53066097 :                         if (!xfs_buf_trylock(bp))
    2157       41793 :                                 continue;
    2158    53024305 :                         if (xfs_buf_ispinned(bp)) {
    2159      729653 :                                 xfs_buf_unlock(bp);
    2160      729653 :                                 pinned++;
    2161      729653 :                                 continue;
    2162             :                         }
    2163             :                 } else {
    2164    18292102 :                         xfs_buf_lock(bp);
    2165             :                 }
    2166             : 
    2167             :                 /*
    2168             :                  * Someone else might have written the buffer synchronously or
    2169             :                  * marked it stale in the meantime.  In that case only the
    2170             :                  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
    2171             :                  * reference and remove it from the list here.
    2172             :                  */
    2173    70586750 :                 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
    2174       22552 :                         list_del_init(&bp->b_list);
    2175       22552 :                         xfs_buf_relse(bp);
    2176       22552 :                         continue;
    2177             :                 }
    2178             : 
    2179    70564198 :                 trace_xfs_buf_delwri_split(bp, _RET_IP_);
    2180             : 
    2181             :                 /*
    2182             :                  * If we have a wait list, each buffer (and associated delwri
    2183             :                  * queue reference) transfers to it and is submitted
    2184             :                  * synchronously. Otherwise, drop the buffer from the delwri
    2185             :                  * queue and submit async.
    2186             :                  */
    2187    70564202 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2188    70564202 :                 bp->b_flags |= XBF_WRITE;
    2189    70564202 :                 if (wait_list) {
    2190    18292102 :                         bp->b_flags &= ~XBF_ASYNC;
    2191    18292102 :                         list_move_tail(&bp->b_list, wait_list);
    2192             :                 } else {
    2193    52272100 :                         bp->b_flags |= XBF_ASYNC;
    2194    52272100 :                         list_del_init(&bp->b_list);
    2195             :                 }
    2196    70564202 :                 __xfs_buf_submit(bp, false);
    2197             :         }
    2198     2737525 :         blk_finish_plug(&plug);
    2199             : 
    2200     2737528 :         return pinned;
    2201             : }
    2202             : 
    2203             : /*
    2204             :  * Write out a buffer list asynchronously.
    2205             :  *
    2206             :  * This will take the @buffer_list, write all non-locked and non-pinned buffers
    2207             :  * out and not wait for I/O completion on any of the buffers.  This interface
    2208             :  * is only safely usable for callers that can track I/O completion by higher
    2209             :  * level means, e.g. AIL pushing, as the @buffer_list is consumed in this
    2210             :  * function.
    2211             :  *
    2212             :  * Note: this function will skip buffers it would block on, and in doing so
    2213             :  * leaves them on @buffer_list so they can be retried on a later pass. As such,
    2214             :  * it is up to the caller to ensure that the buffer list is fully submitted or
    2215             :  * cancelled appropriately when they are finished with the list. Failure to
    2216             :  * cancel or resubmit the list until it is empty will result in leaked buffers
    2217             :  * at unmount time.
    2218             :  */
    2219             : int
    2220     1936395 : xfs_buf_delwri_submit_nowait(
    2221             :         struct list_head        *buffer_list)
    2222             : {
    2223     1936395 :         return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
    2224             : }
    2225             : 
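/*
 * Illustrative sketch, not part of xfs_buf.c: because skipped buffers stay on
 * @buffer_list, a caller of the nowait interface has to keep resubmitting the
 * list (or cancel it) until it is empty, as the comment above warns.  The
 * loop below is a simplified, hypothetical shape of that pattern; real users
 * such as the AIL push worker add backoff and shutdown handling.
 */
static void example_push_until_empty(struct xfs_mount *mp,
                                     struct list_head *buffer_list)
{
        while (!list_empty(buffer_list)) {
                /* writes what it can; skipped buffers remain on the list */
                int     pinned = xfs_buf_delwri_submit_nowait(buffer_list);

                /* force the log so pinned buffers can unpin before retrying */
                if (pinned)
                        xfs_log_force(mp, XFS_LOG_SYNC);
        }
}
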
    2226             : /*
    2227             :  * Write out a buffer list synchronously.
    2228             :  *
    2229             :  * This will take the @buffer_list, write all buffers out and wait for I/O
    2230             :  * completion on all of the buffers. @buffer_list is consumed by the function,
    2231             :  * so callers must have some other way of tracking buffers if they require such
    2232             :  * functionality.
    2233             :  */
    2234             : int
    2235      801139 : xfs_buf_delwri_submit(
    2236             :         struct list_head        *buffer_list)
    2237             : {
    2238      801139 :         LIST_HEAD               (wait_list);
    2239      801139 :         int                     error = 0, error2;
    2240      801139 :         struct xfs_buf          *bp;
    2241             : 
    2242      801139 :         xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
    2243             : 
    2244             :         /* Wait for IO to complete. */
    2245    19093241 :         while (!list_empty(&wait_list)) {
    2246    18292102 :                 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
    2247             : 
    2248    18292102 :                 list_del_init(&bp->b_list);
    2249             : 
    2250             :                 /*
    2251             :                  * Wait on the locked buffer, check for errors and unlock and
    2252             :                  * release the delwri queue reference.
    2253             :                  */
    2254    18292102 :                 error2 = xfs_buf_iowait(bp);
    2255    18292102 :                 xfs_buf_relse(bp);
    2256    18292102 :                 if (!error)
    2257    18292102 :                         error = error2;
    2258             :         }
    2259             : 
    2260      801139 :         return error;
    2261             : }
    2262             : 
    2263             : /*
    2264             :  * Push a single buffer on a delwri queue.
    2265             :  *
    2266             :  * The purpose of this function is to submit a single buffer from a delwri queue
    2267             :  * and return with the buffer still on the original queue. The waiting delwri
    2268             :  * buffer submission infrastructure guarantees transfer of the delwri queue
    2269             :  * buffer reference to a temporary wait list. We reuse this infrastructure to
    2270             :  * transfer the buffer back to the original queue.
    2271             :  *
    2272             :  * Note the buffer transitions from the queued state to the submitted and wait
    2273             :  * listed state and back to the queued state during this call. The buffer
    2274             :  * locking and queue management logic between _delwri_pushbuf() and
    2275             :  * _delwri_queue() guarantees that the buffer cannot be queued to another list
    2276             :  * before returning.
    2277             :  */
    2278             : int
    2279           0 : xfs_buf_delwri_pushbuf(
    2280             :         struct xfs_buf          *bp,
    2281             :         struct list_head        *buffer_list)
    2282             : {
    2283           0 :         LIST_HEAD               (submit_list);
    2284           0 :         int                     error;
    2285             : 
    2286           0 :         ASSERT(bp->b_flags & _XBF_DELWRI_Q);
    2287             : 
    2288           0 :         trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
    2289             : 
    2290             :         /*
    2291             :          * Isolate the buffer to a new local list so we can submit it for I/O
    2292             :          * independently from the rest of the original list.
    2293             :          */
    2294           0 :         xfs_buf_lock(bp);
    2295           0 :         list_move(&bp->b_list, &submit_list);
    2296           0 :         xfs_buf_unlock(bp);
    2297             : 
    2298             :         /*
    2299             :          * Delwri submission clears the DELWRI_Q buffer flag and returns with
    2300             :          * the buffer on the wait list with the original reference. Rather than
    2301             :          * bounce the buffer from a local wait list back to the original list
    2302             :          * after I/O completion, reuse the original list as the wait list.
    2303             :          */
    2304           0 :         xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
    2305             : 
    2306             :         /*
    2307             :          * The buffer is now locked, under I/O and wait listed on the original
    2308             :          * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
    2309             :          * return with the buffer unlocked and on the original queue.
    2310             :          */
    2311           0 :         error = xfs_buf_iowait(bp);
    2312           0 :         bp->b_flags |= _XBF_DELWRI_Q;
    2313           0 :         xfs_buf_unlock(bp);
    2314             : 
    2315           0 :         return error;
    2316             : }
    2317             : 
    2318 11456457708 : void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
    2319             : {
    2320             :         /*
    2321             :          * Set the lru reference count to 0 based on the error injection tag.
    2322             :          * This allows userspace to disrupt buffer caching for debug/testing
    2323             :          * purposes.
    2324             :          */
    2325 11456457708 :         if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
    2326          10 :                 lru_ref = 0;
    2327             : 
    2328 11459640519 :         atomic_set(&bp->b_lru_ref, lru_ref);
    2329 11459640519 : }
    2330             : 
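/*
 * Illustrative sketch, not part of xfs_buf.c: callers use xfs_buf_set_ref()
 * to make hot metadata buffers stickier on the buffer LRU.  The constant
 * below is one of the XFS_*_REF reclaim hints defined in the btree headers;
 * treat the specific choice here as an example only.
 */
static void example_mark_buffer_hot(struct xfs_buf *bp)
{
        /* let inode btree blocks survive more LRU passes than the default */
        xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
}
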
    2331             : /*
    2332             :  * Verify an on-disk magic value against the magic value specified in the
    2333             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2334             :  * expected to pass the value directly from disk.
    2335             :  */
    2336             : bool
    2337   171373407 : xfs_verify_magic(
    2338             :         struct xfs_buf          *bp,
    2339             :         __be32                  dmagic)
    2340             : {
    2341   171373407 :         struct xfs_mount        *mp = bp->b_mount;
    2342   171373407 :         int                     idx;
    2343             : 
    2344   171373407 :         idx = xfs_has_crc(mp);
    2345   171373407 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
    2346             :                 return false;
    2347   171376532 :         return dmagic == bp->b_ops->magic[idx];
    2348             : }
    2349             : /*
    2350             :  * Verify an on-disk 16-bit magic value against the magic value specified in
    2351             :  * the verifier structure. The verifier magic is in disk byte order so the
    2352             :  * caller is expected to pass the value directly from disk.
    2353             :  */
    2354             : bool
    2355  1090723068 : xfs_verify_magic16(
    2356             :         struct xfs_buf          *bp,
    2357             :         __be16                  dmagic)
    2358             : {
    2359  1090723068 :         struct xfs_mount        *mp = bp->b_mount;
    2360  1090723068 :         int                     idx;
    2361             : 
    2362  1090723068 :         idx = xfs_has_crc(mp);
    2363  1090723068 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
    2364             :                 return false;
    2365  1090725272 :         return dmagic == bp->b_ops->magic16[idx];
    2366             : }
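/*
 * Illustrative sketch, not part of xfs_buf.c: how a buffer verifier typically
 * consumes xfs_verify_magic().  The on-disk structure and function names are
 * hypothetical; the point is that the magic field is passed straight from the
 * buffer, still in disk byte order, and the helper selects the crc or non-crc
 * magic value for the current filesystem.
 */
struct example_ondisk_hdr {
        __be32                  magic;
        /* ... rest of the on-disk header ... */
};

static xfs_failaddr_t
example_buf_verify(
        struct xfs_buf          *bp)
{
        struct example_ondisk_hdr *hdr = bp->b_addr;

        /* no byte swapping: b_ops->magic[] is stored in disk byte order */
        if (!xfs_verify_magic(bp, hdr->magic))
                return __this_address;
        return NULL;
}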

Generated by: LCOV version 1.14