LCOV - code coverage report
Current view: top level - fs/xfs - xfs_buf.c (source / functions)
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023
Date: 2023-07-31 20:08:34
Coverage: Lines: 919 of 1011 (90.9 %), Functions: 78 of 80 (97.5 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include <linux/backing-dev.h>
       8             : #include <linux/dax.h>
       9             : 
      10             : #include "xfs_shared.h"
      11             : #include "xfs_format.h"
      12             : #include "xfs_log_format.h"
      13             : #include "xfs_trans_resv.h"
      14             : #include "xfs_mount.h"
      15             : #include "xfs_trace.h"
      16             : #include "xfs_log.h"
      17             : #include "xfs_log_recover.h"
      18             : #include "xfs_log_priv.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_buf_item.h"
      21             : #include "xfs_errortag.h"
      22             : #include "xfs_error.h"
      23             : #include "xfs_ag.h"
      24             : #include "xfs_buf_xfile.h"
      25             : 
      26             : struct kmem_cache *xfs_buf_cache;
      27             : 
      28             : /*
      29             :  * Locking orders
      30             :  *
      31             :  * xfs_buf_ioacct_inc:
      32             :  * xfs_buf_ioacct_dec:
      33             :  *      b_sema (caller holds)
      34             :  *        b_lock
      35             :  *
      36             :  * xfs_buf_stale:
      37             :  *      b_sema (caller holds)
      38             :  *        b_lock
      39             :  *          lru_lock
      40             :  *
      41             :  * xfs_buf_rele:
      42             :  *      b_lock
      43             :  *        pag_buf_lock
      44             :  *          lru_lock
      45             :  *
      46             :  * xfs_buftarg_drain_rele
      47             :  *      lru_lock
      48             :  *        b_lock (trylock due to inversion)
      49             :  *
      50             :  * xfs_buftarg_isolate
      51             :  *      lru_lock
      52             :  *        b_lock (trylock due to inversion)
      53             :  */
      54             : 
      55             : static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
      56             : 
      57             : static inline int
      58    31901376 : xfs_buf_submit(
      59             :         struct xfs_buf          *bp)
      60             : {
      61    31901376 :         return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
      62             : }
      63             : 
      64             : static inline int
      65             : xfs_buf_is_vmapped(
      66             :         struct xfs_buf  *bp)
      67             : {
      68             :         /*
      69             :          * Return true if the buffer is vmapped.
      70             :          *
      71             :          * b_addr is null if the buffer is not mapped, but the code is clever
      72             :          * enough to know it doesn't have to map a single page, so the check has
      73             :          * to be both for b_addr and bp->b_page_count > 1.
      74             :          */
      75    87183563 :         return bp->b_addr && bp->b_page_count > 1;
      76             : }
      77             : 
      78             : static inline int
      79             : xfs_buf_vmap_len(
      80             :         struct xfs_buf  *bp)
      81             : {
      82             :         return (bp->b_page_count * PAGE_SIZE);
      83             : }
      84             : 
      85             : /*
      86             :  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
      87             :  * this buffer. The count is incremented once per buffer (per hold cycle)
      88             :  * because the corresponding decrement is deferred to buffer release. Buffers
      89             :  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
       90             :  * tracking adds unnecessary overhead. This is used for synchronization purposes
      91             :  * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
      92             :  * in-flight buffers.
      93             :  *
      94             :  * Buffers that are never released (e.g., superblock, iclog buffers) must set
      95             :  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
      96             :  * never reaches zero and unmount hangs indefinitely.
      97             :  */
      98             : static inline void
      99   195843158 : xfs_buf_ioacct_inc(
     100             :         struct xfs_buf  *bp)
     101             : {
     102   195843158 :         if (bp->b_flags & XBF_NO_IOACCT)
     103             :                 return;
     104             : 
     105   195331471 :         ASSERT(bp->b_flags & XBF_ASYNC);
     106   195331471 :         spin_lock(&bp->b_lock);
     107   195331621 :         if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
     108   184851778 :                 bp->b_state |= XFS_BSTATE_IN_FLIGHT;
     109   184851778 :                 percpu_counter_inc(&bp->b_target->bt_io_count);
     110             :         }
     111   195331492 :         spin_unlock(&bp->b_lock);
     112             : }
     113             : 
     114             : /*
     115             :  * Clear the in-flight state on a buffer about to be released to the LRU or
     116             :  * freed and unaccount from the buftarg.
     117             :  */
     118             : static inline void
     119 18642925230 : __xfs_buf_ioacct_dec(
     120             :         struct xfs_buf  *bp)
     121             : {
     122 18642925230 :         lockdep_assert_held(&bp->b_lock);
     123             : 
     124 18642925230 :         if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
     125   184851346 :                 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
     126   184851346 :                 percpu_counter_dec(&bp->b_target->bt_io_count);
     127             :         }
     128 18642925036 : }
     129             : 
     130             : static inline void
     131      699342 : xfs_buf_ioacct_dec(
     132             :         struct xfs_buf  *bp)
     133             : {
     134      699342 :         spin_lock(&bp->b_lock);
     135      699357 :         __xfs_buf_ioacct_dec(bp);
     136      699352 :         spin_unlock(&bp->b_lock);
     137      699377 : }
     138             : 
     139             : /*
     140             :  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
     141             :  * b_lru_ref count so that the buffer is freed immediately when the buffer
     142             :  * reference count falls to zero. If the buffer is already on the LRU, we need
     143             :  * to remove the reference that LRU holds on the buffer.
     144             :  *
     145             :  * This prevents build-up of stale buffers on the LRU.
     146             :  */
     147             : void
     148    45462033 : xfs_buf_stale(
     149             :         struct xfs_buf  *bp)
     150             : {
     151    45462033 :         ASSERT(xfs_buf_islocked(bp));
     152             : 
     153    45462033 :         bp->b_flags |= XBF_STALE;
     154             : 
     155             :         /*
     156             :          * Clear the delwri status so that a delwri queue walker will not
     157             :          * flush this buffer to disk now that it is stale. The delwri queue has
     158             :          * a reference to the buffer, so this is safe to do.
     159             :          */
     160    45462033 :         bp->b_flags &= ~_XBF_DELWRI_Q;
     161             : 
     162             :         /*
     163             :          * Once the buffer is marked stale and unlocked, a subsequent lookup
     164             :          * could reset b_flags. There is no guarantee that the buffer is
     165             :          * unaccounted (released to LRU) before that occurs. Drop in-flight
     166             :          * status now to preserve accounting consistency.
     167             :          */
     168    45462033 :         spin_lock(&bp->b_lock);
     169    45532841 :         __xfs_buf_ioacct_dec(bp);
     170             : 
     171    45491991 :         atomic_set(&bp->b_lru_ref, 0);
     172    91046813 :         if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
     173    45490459 :             (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
     174    12969451 :                 atomic_dec(&bp->b_hold);
     175             : 
     176    45556422 :         ASSERT(atomic_read(&bp->b_hold) >= 1);
     177    45556422 :         spin_unlock(&bp->b_lock);
     178    45554243 : }
     179             : 
     180             : static int
     181    88112467 : xfs_buf_get_maps(
     182             :         struct xfs_buf          *bp,
     183             :         int                     map_count)
     184             : {
     185    88112467 :         ASSERT(bp->b_maps == NULL);
     186    88112467 :         bp->b_map_count = map_count;
     187             : 
     188    88112467 :         if (map_count == 1) {
     189    88112298 :                 bp->b_maps = &bp->__b_map;
     190    88112298 :                 return 0;
     191             :         }
     192             : 
     193         169 :         bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
     194             :                                 KM_NOFS);
     195         169 :         if (!bp->b_maps)
     196           0 :                 return -ENOMEM;
     197             :         return 0;
     198             : }
     199             : 
     200             : /*
      201             :  *      Frees b_maps if it was allocated.
     202             :  */
     203             : static void
     204             : xfs_buf_free_maps(
     205             :         struct xfs_buf  *bp)
     206             : {
     207    88002250 :         if (bp->b_maps != &bp->__b_map) {
     208         169 :                 kmem_free(bp->b_maps);
     209         169 :                 bp->b_maps = NULL;
     210             :         }
     211             : }
     212             : 
     213             : static int
     214    88078640 : _xfs_buf_alloc(
     215             :         struct xfs_buftarg      *target,
     216             :         struct xfs_buf_map      *map,
     217             :         int                     nmaps,
     218             :         xfs_buf_flags_t         flags,
     219             :         struct xfs_buf          **bpp)
     220             : {
     221    88078640 :         struct xfs_buf          *bp;
     222    88078640 :         int                     error;
     223    88078640 :         int                     i;
     224             : 
     225    88078640 :         *bpp = NULL;
     226    88078640 :         bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
     227             : 
     228             :         /*
     229             :          * We don't want certain flags to appear in b_flags unless they are
     230             :          * specifically set by later operations on the buffer.
     231             :          */
     232    88128471 :         flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
     233             : 
     234    88128471 :         atomic_set(&bp->b_hold, 1);
     235    88128471 :         atomic_set(&bp->b_lru_ref, 1);
     236    88128471 :         init_completion(&bp->b_iowait);
     237    88113369 :         INIT_LIST_HEAD(&bp->b_lru);
     238    88113369 :         INIT_LIST_HEAD(&bp->b_list);
     239    88113369 :         INIT_LIST_HEAD(&bp->b_li_list);
     240    88113369 :         sema_init(&bp->b_sema, 0); /* held, no waiters */
     241    88113369 :         spin_lock_init(&bp->b_lock);
     242    88108856 :         bp->b_target = target;
     243    88108856 :         bp->b_mount = target->bt_mount;
     244    88108856 :         bp->b_flags = flags;
     245             : 
     246             :         /*
     247             :          * Set length and io_length to the same value initially.
     248             :          * I/O routines should use io_length, which will be the same in
     249             :          * most cases but may be reset (e.g. XFS recovery).
     250             :          */
     251    88108856 :         error = xfs_buf_get_maps(bp, nmaps);
     252    88098273 :         if (error)  {
     253           0 :                 kmem_cache_free(xfs_buf_cache, bp);
     254           0 :                 return error;
     255             :         }
     256             : 
     257    88098273 :         bp->b_rhash_key = map[0].bm_bn;
     258    88098273 :         bp->b_length = 0;
     259   176188818 :         for (i = 0; i < nmaps; i++) {
     260    88090545 :                 bp->b_maps[i].bm_bn = map[i].bm_bn;
     261    88090545 :                 bp->b_maps[i].bm_len = map[i].bm_len;
     262    88090545 :                 bp->b_length += map[i].bm_len;
     263             :         }
     264             : 
     265    88098273 :         atomic_set(&bp->b_pin_count, 0);
     266    88098273 :         init_waitqueue_head(&bp->b_waiters);
     267             : 
     268    88100625 :         XFS_STATS_INC(bp->b_mount, xb_create);
     269    88100995 :         trace_xfs_buf_init(bp, _RET_IP_);
     270             : 
     271    88091781 :         *bpp = bp;
     272    88091781 :         return 0;
     273             : }
     274             : 
     275             : static void
     276    86567213 : xfs_buf_free_pages(
     277             :         struct xfs_buf  *bp)
     278             : {
     279    86567213 :         uint            i;
     280             : 
     281    86567213 :         ASSERT(bp->b_flags & _XBF_PAGES);
     282             : 
     283   214874747 :         for (i = 0; i < bp->b_page_count; i++) {
     284   128307733 :                 if (bp->b_pages[i])
     285   128307733 :                         __free_page(bp->b_pages[i]);
     286             :         }
     287    86567014 :         mm_account_reclaimed_pages(bp->b_page_count);
     288             : 
     289    86567113 :         xfs_buf_free_page_array(bp);
     290    86566931 : }
     291             : 
     292             : void
     293    87835579 : xfs_buf_free_page_array(
     294             :         struct xfs_buf  *bp)
     295             : {
     296    87835579 :         ASSERT(bp->b_flags & _XBF_PAGES);
     297             : 
     298    87835579 :         if (bp->b_pages != bp->b_page_array)
     299    13871857 :                 kmem_free(bp->b_pages);
     300    87835579 :         bp->b_pages = NULL;
     301    87835579 :         bp->b_flags &= ~_XBF_PAGES;
     302    87835579 :         bp->b_page_count = 0;
     303    87835579 : }
     304             : 
     305             : static void
     306    88002250 : xfs_buf_free_callback(
     307             :         struct callback_head    *cb)
     308             : {
     309    88002250 :         struct xfs_buf          *bp = container_of(cb, struct xfs_buf, b_rcu);
     310             : 
     311    88002250 :         xfs_buf_free_maps(bp);
     312    88002250 :         kmem_cache_free(xfs_buf_cache, bp);
     313    88050482 : }
     314             : 
     315             : static void
     316    88145181 : xfs_buf_free(
     317             :         struct xfs_buf          *bp)
     318             : {
     319    88145181 :         trace_xfs_buf_free(bp, _RET_IP_);
     320             : 
     321    88144820 :         ASSERT(list_empty(&bp->b_lru));
     322             : 
     323    88144820 :         if (xfs_buf_is_vmapped(bp))
     324    12990832 :                 vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
     325             : 
     326    88144786 :         if (bp->b_flags & _XBF_DIRECT_MAP)
     327     1268607 :                 xfile_buf_unmap_pages(bp);
     328    86876179 :         else if (bp->b_flags & _XBF_PAGES)
     329    86567250 :                 xfs_buf_free_pages(bp);
     330      308929 :         else if (bp->b_flags & _XBF_KMEM)
     331      308927 :                 kmem_free(bp->b_addr);
     332             : 
     333    88144595 :         call_rcu(&bp->b_rcu, xfs_buf_free_callback);
     334    88143808 : }
     335             : 
     336             : static int
     337      308880 : xfs_buf_alloc_kmem(
     338             :         struct xfs_buf  *bp,
     339             :         xfs_buf_flags_t flags)
     340             : {
     341      308880 :         xfs_km_flags_t  kmflag_mask = KM_NOFS;
     342      308880 :         size_t          size = BBTOB(bp->b_length);
     343             : 
     344             :         /* Assure zeroed buffer for non-read cases. */
     345      308880 :         if (!(flags & XBF_READ))
     346       26556 :                 kmflag_mask |= KM_ZERO;
     347             : 
     348      308880 :         bp->b_addr = kmem_alloc(size, kmflag_mask);
     349      308898 :         if (!bp->b_addr)
     350             :                 return -ENOMEM;
     351             : 
     352      308898 :         if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
     353             :             ((unsigned long)bp->b_addr & PAGE_MASK)) {
     354             :                 /* b_addr spans two pages - use alloc_page instead */
     355           0 :                 kmem_free(bp->b_addr);
     356           0 :                 bp->b_addr = NULL;
     357           0 :                 return -ENOMEM;
     358             :         }
     359      308898 :         bp->b_offset = offset_in_page(bp->b_addr);
     360      308898 :         bp->b_pages = bp->b_page_array;
     361      308898 :         bp->b_pages[0] = kmem_to_page(bp->b_addr);
     362      308865 :         bp->b_page_count = 1;
     363      308865 :         bp->b_flags |= _XBF_KMEM;
     364      308865 :         return 0;
     365             : }
     366             : 
     367             : /* Make sure that we have a page list */
     368             : int
     369    87750837 : xfs_buf_alloc_page_array(
     370             :         struct xfs_buf  *bp,
     371             :         gfp_t           gfp_mask)
     372             : {
     373    87750837 :         ASSERT(!(bp->b_flags & _XBF_PAGES));
     374             : 
     375    87750837 :         bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
     376    87750837 :         if (bp->b_page_count <= XB_PAGES) {
     377    73880539 :                 bp->b_pages = bp->b_page_array;
     378             :         } else {
     379    13870298 :                 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
     380             :                                         gfp_mask);
     381    13870593 :                 if (!bp->b_pages)
     382             :                         return -ENOMEM;
     383             :         }
     384             : 
     385    87751132 :         bp->b_flags |= _XBF_PAGES;
     386    87751132 :         return 0;
     387             : }
     388             : 
     389             : static int
     390    86493911 : xfs_buf_alloc_pages(
     391             :         struct xfs_buf  *bp,
     392             :         xfs_buf_flags_t flags)
     393             : {
     394    86493911 :         gfp_t           gfp_mask = __GFP_NOWARN;
     395    86493911 :         long            filled = 0;
     396    86493911 :         int             error;
     397             : 
     398    86493911 :         if (flags & XBF_READ_AHEAD)
     399             :                 gfp_mask |= __GFP_NORETRY;
     400             :         else
     401    67460776 :                 gfp_mask |= GFP_NOFS;
     402             : 
     403    86493911 :         error = xfs_buf_alloc_page_array(bp, gfp_mask);
     404    86456518 :         if (error)
     405             :                 return error;
     406             : 
     407             :         /* Assure zeroed buffer for non-read cases. */
     408    86456518 :         if (!(flags & XBF_READ))
     409    55642989 :                 gfp_mask |= __GFP_ZERO;
     410             : 
     411             :         /*
     412             :          * Bulk filling of pages can take multiple calls. Not filling the entire
     413             :          * array is not an allocation failure, so don't back off if we get at
     414             :          * least one extra page.
     415             :          */
     416    86456902 :         for (;;) {
     417    86456902 :                 long    last = filled;
     418             : 
     419    86456902 :                 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
     420             :                                                 bp->b_pages);
     421    86509438 :                 if (filled == bp->b_page_count) {
     422    86509054 :                         XFS_STATS_INC(bp->b_mount, xb_page_found);
     423    86494995 :                         break;
     424             :                 }
     425             : 
     426         384 :                 if (filled != last)
     427         384 :                         continue;
     428             : 
     429           0 :                 if (flags & XBF_READ_AHEAD) {
     430           0 :                         xfs_buf_free_pages(bp);
     431           0 :                         return -ENOMEM;
     432             :                 }
     433             : 
     434           0 :                 XFS_STATS_INC(bp->b_mount, xb_page_retries);
     435           0 :                 memalloc_retry_wait(gfp_mask);
     436             :         }
     437             : 
     438    86494995 :         bp->b_offset = 0;
     439    86494995 :         return 0;
     440             : }
     441             : 
     442             : /*
     443             :  *      Map buffer into kernel address-space if necessary.
     444             :  */
     445             : STATIC int
     446   456901406 : _xfs_buf_map_pages(
     447             :         struct xfs_buf          *bp,
     448             :         xfs_buf_flags_t         flags)
     449             : {
     450   456901406 :         ASSERT(bp->b_flags & (_XBF_PAGES | _XBF_DIRECT_MAP));
     451             : 
     452   456901406 :         if (bp->b_page_count == 1) {
     453             :                 /* A single page buffer is always mappable */
     454    73774227 :                 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
     455   383127179 :         } else if (flags & XBF_UNMAPPED) {
     456   370136516 :                 bp->b_addr = NULL;
     457             :         } else {
     458    12990663 :                 int retried = 0;
     459    12990663 :                 unsigned nofs_flag;
     460             : 
     461             :                 /*
     462             :                  * vm_map_ram() will allocate auxiliary structures (e.g.
     463             :                  * pagetables) with GFP_KERNEL, yet we are likely to be under
     464             :                  * GFP_NOFS context here. Hence we need to tell memory reclaim
     465             :                  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
     466             :                  * memory reclaim re-entering the filesystem here and
     467             :                  * potentially deadlocking.
     468             :                  */
     469    12990663 :                 nofs_flag = memalloc_nofs_save();
     470    12990672 :                 do {
     471    12990672 :                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
     472             :                                                 -1);
     473    12990643 :                         if (bp->b_addr)
     474             :                                 break;
     475           0 :                         vm_unmap_aliases();
     476           0 :                 } while (retried++ <= 1);
     477    12990634 :                 memalloc_nofs_restore(nofs_flag);
     478             : 
     479    12990634 :                 if (!bp->b_addr)
     480             :                         return -ENOMEM;
     481             : 
     482    12990634 :                 bp->b_addr += bp->b_offset;
     483             :         }
     484             : 
     485             :         return 0;
     486             : }
     487             : 
     488             : /*
     489             :  *      Finding and Reading Buffers
     490             :  */
     491             : static int
     492 58614691823 : _xfs_buf_obj_cmp(
     493             :         struct rhashtable_compare_arg   *arg,
     494             :         const void                      *obj)
     495             : {
     496 58614691823 :         const struct xfs_buf_map        *map = arg->key;
     497 58614691823 :         const struct xfs_buf            *bp = obj;
     498             : 
     499             :         /*
     500             :          * The key hashing in the lookup path depends on the key being the
     501             :          * first element of the compare_arg, make sure to assert this.
     502             :          */
     503 58614691823 :         BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
     504             : 
     505 58614691823 :         if (bp->b_rhash_key != map->bm_bn)
     506             :                 return 1;
     507             : 
     508 45957376247 :         if (unlikely(bp->b_length != map->bm_len)) {
     509             :                 /*
     510             :                  * found a block number match. If the range doesn't
     511             :                  * match, the only way this is allowed is if the buffer
     512             :                  * in the cache is stale and the transaction that made
     513             :                  * it stale has not yet committed. i.e. we are
     514             :                  * reallocating a busy extent. Skip this buffer and
     515             :                  * continue searching for an exact match.
     516             :                  *
     517             :                  * Note: If we're scanning for incore buffers to stale, don't
     518             :                  * complain if we find non-stale buffers.
     519             :                  */
     520    51870952 :                 if (!(map->bm_flags & XBM_LIVESCAN))
     521          27 :                         ASSERT(bp->b_flags & XBF_STALE);
     522    51870952 :                 return 1;
     523             :         }
     524             :         return 0;
     525             : }
     526             : 
     527             : static const struct rhashtable_params xfs_buf_hash_params = {
     528             :         .min_size               = 32,   /* empty AGs have minimal footprint */
     529             :         .nelem_hint             = 16,
     530             :         .key_len                = sizeof(xfs_daddr_t),
     531             :         .key_offset             = offsetof(struct xfs_buf, b_rhash_key),
     532             :         .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
     533             :         .automatic_shrinking    = true,
     534             :         .obj_cmpfn              = _xfs_buf_obj_cmp,
     535             : };
     536             : 
     537             : int
     538      696163 : xfs_buf_cache_init(
     539             :         struct xfs_buf_cache    *bch)
     540             : {
     541      696163 :         spin_lock_init(&bch->bc_lock);
     542      696074 :         return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
     543             : }
     544             : 
     545             : void
     546      696794 : xfs_buf_cache_destroy(
     547             :         struct xfs_buf_cache    *bch)
     548             : {
     549      696794 :         rhashtable_destroy(&bch->bc_hash);
     550      696409 : }
     551             : 
     552             : static int
     553 46049301489 : xfs_buf_map_verify(
     554             :         struct xfs_buftarg      *btp,
     555             :         struct xfs_buf_map      *map)
     556             : {
     557 46049301489 :         xfs_daddr_t             eofs;
     558             : 
     559             :         /* Check for IOs smaller than the sector size / not sector aligned */
     560 46049301489 :         ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
     561 46049301489 :         ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
     562             : 
     563             :         /*
     564             :          * Corrupted block numbers can get through to here, unfortunately, so we
     565             :          * have to check that the buffer falls within the filesystem bounds.
     566             :          */
     567 46049301489 :         eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
     568 46049301489 :         if (map->bm_bn < 0 || map->bm_bn >= eofs) {
     569           0 :                 xfs_alert(btp->bt_mount,
     570             :                           "%s: daddr 0x%llx out of range, EOFS 0x%llx",
     571             :                           __func__, map->bm_bn, eofs);
     572           0 :                 WARN_ON(1);
     573           0 :                 return -EFSCORRUPTED;
     574             :         }
     575             :         return 0;
     576             : }
     577             : 
     578             : static int
     579 45937433598 : xfs_buf_find_lock(
     580             :         struct xfs_buf          *bp,
     581             :         xfs_buf_flags_t         flags)
     582             : {
     583 45937433598 :         if (flags & XBF_TRYLOCK) {
     584  8872946035 :                 if (!xfs_buf_trylock(bp)) {
     585   185598945 :                         XFS_STATS_INC(bp->b_mount, xb_busy_locked);
     586   185600198 :                         return -EAGAIN;
     587             :                 }
     588             :         } else {
     589 37064487563 :                 xfs_buf_lock(bp);
     590 37036738045 :                 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
     591             :         }
     592             : 
     593             :         /*
     594             :          * if the buffer is stale, clear all the external state associated with
     595             :          * it. We need to keep flags such as how we allocated the buffer memory
     596             :          * intact here.
     597             :          */
     598 45734129726 :         if (bp->b_flags & XBF_STALE) {
     599       87762 :                 if (flags & XBF_LIVESCAN) {
     600          38 :                         xfs_buf_unlock(bp);
     601          38 :                         return -ENOENT;
     602             :                 }
     603       87724 :                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
     604       87724 :                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES | _XBF_DIRECT_MAP;
     605       87724 :                 bp->b_ops = NULL;
     606             :         }
     607             :         return 0;
     608             : }
     609             : 
     610             : static inline int
     611 46062862057 : xfs_buf_lookup(
     612             :         struct xfs_buf_cache    *bch,
     613             :         struct xfs_buf_map      *map,
     614             :         xfs_buf_flags_t         flags,
     615             :         struct xfs_buf          **bpp)
     616             : {
     617 46062862057 :         struct xfs_buf          *bp;
     618 46062862057 :         int                     error;
     619             : 
     620 46062862057 :         rcu_read_lock();
     621 46062032407 :         bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
     622 91982173305 :         if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
     623   141406374 :                 rcu_read_unlock();
     624   141406374 :                 return -ENOENT;
     625             :         }
     626 45944542540 :         rcu_read_unlock();
     627             : 
     628 45940960687 :         error = xfs_buf_find_lock(bp, flags);
     629 45910132946 :         if (error) {
     630   185600658 :                 xfs_buf_rele(bp);
     631   185600658 :                 return error;
     632             :         }
     633             : 
     634 45724532288 :         trace_xfs_buf_find(bp, flags, _RET_IP_);
     635 45721509793 :         *bpp = bp;
     636 45721509793 :         return 0;
     637             : }
     638             : 
     639             : /*
     640             :  * Insert the new_bp into the hash table. This consumes the perag reference
     641             :  * taken for the lookup regardless of the result of the insert.
     642             :  */
     643             : static int
     644    87381183 : xfs_buf_find_insert(
     645             :         struct xfs_buftarg      *btp,
     646             :         struct xfs_buf_cache    *bch,
     647             :         struct xfs_perag        *pag,
     648             :         struct xfs_buf_map      *cmap,
     649             :         struct xfs_buf_map      *map,
     650             :         int                     nmaps,
     651             :         xfs_buf_flags_t         flags,
     652             :         struct xfs_buf          **bpp)
     653             : {
     654    87381183 :         struct xfs_buf          *new_bp;
     655    87381183 :         struct xfs_buf          *bp;
     656    87381183 :         int                     error;
     657             : 
     658    87381183 :         error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
     659    87387221 :         if (error)
     660           0 :                 goto out_drop_pag;
     661             : 
     662             :         /*
     663             :          * If the caller is ok with direct maps to xfile pages, try that.
     664             :          * ENOTBLK is the magic code to fall back to allocating memory.
     665             :          */
     666    87387221 :         if (xfile_buftarg_can_direct_map(btp)) {
     667     1269048 :                 error = xfile_buf_map_pages(new_bp, flags);
     668     1268872 :                 if (error && error != -ENOTBLK)
     669           0 :                         goto out_free_buf;
     670     1268872 :                 if (!error)
     671     1268874 :                         goto insert;
     672             :         }
     673             : 
     674             :         /*
     675             :          * For buffers that fit entirely within a single page, first attempt to
     676             :          * allocate the memory from the heap to minimise memory usage.
     677             :          */
     678    86118171 :         if (BBTOB(new_bp->b_length) < PAGE_SIZE) {
     679      308878 :                 error = xfs_buf_alloc_kmem(new_bp, flags);
     680      308871 :                 if (!error)
     681      308869 :                         goto insert;
     682             :         }
     683             : 
     684             :         /*
     685             :          * For larger buffers or if we can't get heap memory for these small
     686             :          * buffers, fall back to using the page allocator.
     687             :          */
     688    85809295 :         error = xfs_buf_alloc_pages(new_bp, flags);
     689    85768585 :         if (error)
     690           0 :                 goto out_free_buf;
     691             : 
     692    85768585 : insert:
     693    87346328 :         spin_lock(&bch->bc_lock);
     694    87390933 :         bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
     695    87390933 :                         &new_bp->b_rhash_head, xfs_buf_hash_params);
     696    87365274 :         if (IS_ERR(bp)) {
     697           0 :                 error = PTR_ERR(bp);
     698           0 :                 spin_unlock(&bch->bc_lock);
     699           0 :                 goto out_free_buf;
     700             :         }
     701    87365274 :         if (bp) {
     702             :                 /* found an existing buffer */
     703        1915 :                 atomic_inc(&bp->b_hold);
     704        1915 :                 spin_unlock(&bch->bc_lock);
     705        1915 :                 error = xfs_buf_find_lock(bp, flags);
     706        1915 :                 if (error)
     707           3 :                         xfs_buf_rele(bp);
     708             :                 else
     709        1912 :                         *bpp = bp;
     710        1915 :                 goto out_free_buf;
     711             :         }
     712             : 
     713             :         /* The new buffer keeps the perag reference until it is freed. */
     714    87363359 :         new_bp->b_pag = pag;
     715    87363359 :         new_bp->b_cache = bch;
     716    87363359 :         spin_unlock(&bch->bc_lock);
     717    87427196 :         *bpp = new_bp;
     718    87427196 :         return 0;
     719             : 
     720        1915 : out_free_buf:
     721        1915 :         xfs_buf_free(new_bp);
     722        1915 : out_drop_pag:
     723        1915 :         if (pag)
     724        1915 :                 xfs_perag_put(pag);
     725             :         return error;
     726             : }
     727             : 
     728             : /* Find the buffer cache for a particular buftarg and map. */
     729             : static inline struct xfs_buf_cache *
     730 46040278171 : xfs_buftarg_get_cache(
     731             :         struct xfs_buftarg              *btp,
     732             :         const struct xfs_buf_map        *map,
     733             :         struct xfs_perag                **pagp)
     734             : {
     735 46040278171 :         struct xfs_mount                *mp = btp->bt_mount;
     736             : 
     737 46040278171 :         if (btp->bt_cache) {
     738  2423165006 :                 *pagp = NULL;
     739  2423165006 :                 return btp->bt_cache;
     740             :         }
     741             : 
     742 43617113165 :         *pagp = xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
     743 43652821391 :         ASSERT(*pagp != NULL);
     744 43652821391 :         return &(*pagp)->pag_bcache;
     745             : }
     746             : 
     747             : /*
     748             :  * Assembles a buffer covering the specified range. The code is optimised for
     749             :  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
     750             :  * more hits than misses.
     751             :  */
     752             : int
     753 46062303686 : xfs_buf_get_map(
     754             :         struct xfs_buftarg      *btp,
     755             :         struct xfs_buf_map      *map,
     756             :         int                     nmaps,
     757             :         xfs_buf_flags_t         flags,
     758             :         struct xfs_buf          **bpp)
     759             : {
     760 46062303686 :         struct xfs_buf_cache    *bch;
     761 46062303686 :         struct xfs_perag        *pag;
     762 46062303686 :         struct xfs_buf          *bp = NULL;
     763 46062303686 :         struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
     764 46062303686 :         int                     error;
     765 46062303686 :         int                     i;
     766             : 
     767 46062303686 :         if (flags & XBF_LIVESCAN)
     768    59290310 :                 cmap.bm_flags |= XBM_LIVESCAN;
     769 92115887569 :         for (i = 0; i < nmaps; i++)
     770 46053583883 :                 cmap.bm_len += map[i].bm_len;
     771             : 
     772 46062303686 :         error = xfs_buf_map_verify(btp, &cmap);
     773 46037244588 :         if (error)
     774             :                 return error;
     775             : 
     776 46039040133 :         bch = xfs_buftarg_get_cache(btp, &cmap, &pag);
     777             : 
     778 46072550200 :         error = xfs_buf_lookup(bch, &cmap, flags, &bp);
     779 46047623004 :         if (error && error != -ENOENT)
     780   185606124 :                 goto out_put_perag;
     781             : 
     782             :         /* cache hits always outnumber misses by at least 10:1 */
     783 45862016880 :         if (unlikely(!bp)) {
     784   141389509 :                 XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
     785             : 
     786   141407235 :                 if (flags & XBF_INCORE)
     787    53985246 :                         goto out_put_perag;
     788             : 
     789             :                 /* xfs_buf_find_insert() consumes the perag reference. */
     790    87421989 :                 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
     791             :                                 flags, &bp);
     792    87425289 :                 if (error)
     793             :                         return error;
     794             :         } else {
     795 45720627371 :                 XFS_STATS_INC(btp->bt_mount, xb_get_locked);
     796 45726060757 :                 if (pag)
     797 43304103407 :                         xfs_perag_put(pag);
     798             :         }
     799             : 
     800             :         /* We do not hold a perag reference anymore. */
     801 45835392781 :         if (!bp->b_addr) {
     802   456818997 :                 error = _xfs_buf_map_pages(bp, flags);
     803   456098755 :                 if (unlikely(error)) {
     804           0 :                         xfs_warn_ratelimited(btp->bt_mount,
     805             :                                 "%s: failed to map %u pages", __func__,
     806             :                                 bp->b_page_count);
     807           0 :                         xfs_buf_relse(bp);
     808           0 :                         return error;
     809             :                 }
     810             :         }
     811             : 
     812             :         /*
     813             :          * Clear b_error if this is a lookup from a caller that doesn't expect
     814             :          * valid data to be found in the buffer.
     815             :          */
     816 45834672539 :         if (!(flags & XBF_READ))
     817    88022972 :                 xfs_buf_ioerror(bp, 0);
     818             : 
     819 45834548339 :         XFS_STATS_INC(btp->bt_mount, xb_get);
     820 45828212392 :         trace_xfs_buf_get(bp, flags, _RET_IP_);
     821 45819582431 :         *bpp = bp;
     822 45819582431 :         return 0;
     823             : 
     824   239591370 : out_put_perag:
     825   239591370 :         if (pag)
     826   239590776 :                 xfs_perag_put(pag);
     827             :         return error;
     828             : }
     829             : 
     830             : int
     831    31134823 : _xfs_buf_read(
     832             :         struct xfs_buf          *bp,
     833             :         xfs_buf_flags_t         flags)
     834             : {
     835    31134823 :         ASSERT(!(flags & XBF_WRITE));
     836    31134823 :         ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
     837             : 
     838    31134823 :         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
     839    31134823 :         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
     840             : 
     841    31134823 :         return xfs_buf_submit(bp);
     842             : }
     843             : 
     844             : /*
     845             :  * Reverify a buffer found in cache without an attached ->b_ops.
     846             :  *
     847             :  * If the caller passed an ops structure and the buffer doesn't have ops
     848             :  * assigned, set the ops and use it to verify the contents. If verification
     849             :  * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
     850             :  * already in XBF_DONE state on entry.
     851             :  *
     852             :  * Under normal operations, every in-core buffer is verified on read I/O
     853             :  * completion. There are two scenarios that can lead to in-core buffers without
     854             :  * an assigned ->b_ops. The first is during log recovery of buffers on a V4
     855             :  * filesystem, though these buffers are purged at the end of recovery. The
     856             :  * other is online repair, which intentionally reads with a NULL buffer ops to
     857             :  * run several verifiers across an in-core buffer in order to establish buffer
     858             :  * type.  If repair can't establish that, the buffer will be left in memory
     859             :  * with NULL buffer ops.
     860             :  */
     861             : int
     862 47238281075 : xfs_buf_reverify(
     863             :         struct xfs_buf          *bp,
     864             :         const struct xfs_buf_ops *ops)
     865             : {
     866 47238281075 :         ASSERT(bp->b_flags & XBF_DONE);
     867 47238281075 :         ASSERT(bp->b_error == 0);
     868             : 
     869 47238281075 :         if (!ops || bp->b_ops)
     870             :                 return 0;
     871             : 
     872          25 :         bp->b_ops = ops;
     873          25 :         bp->b_ops->verify_read(bp);
     874          25 :         if (bp->b_error)
     875          25 :                 bp->b_flags &= ~XBF_DONE;
     876             :         return bp->b_error;
     877             : }
     878             : 
     879             : int
     880 45917934300 : xfs_buf_read_map(
     881             :         struct xfs_buftarg      *target,
     882             :         struct xfs_buf_map      *map,
     883             :         int                     nmaps,
     884             :         xfs_buf_flags_t         flags,
     885             :         struct xfs_buf          **bpp,
     886             :         const struct xfs_buf_ops *ops,
     887             :         xfs_failaddr_t          fa)
     888             : {
     889 45917934300 :         struct xfs_buf          *bp;
     890 45917934300 :         int                     error;
     891             : 
     892 45917934300 :         flags |= XBF_READ;
     893 45917934300 :         *bpp = NULL;
     894             : 
     895 45917934300 :         error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
     896 45918303838 :         if (error)
     897             :                 return error;
     898             : 
     899 45734785384 :         trace_xfs_buf_read(bp, flags, _RET_IP_);
     900             : 
     901 45716637742 :         if (!(bp->b_flags & XBF_DONE)) {
     902             :                 /* Initiate the buffer read and wait. */
     903    31121538 :                 XFS_STATS_INC(target->bt_mount, xb_get_read);
     904    31121858 :                 bp->b_ops = ops;
     905    31121858 :                 error = _xfs_buf_read(bp, flags);
     906             : 
     907             :                 /* Readahead iodone already dropped the buffer, so exit. */
     908    28568799 :                 if (flags & XBF_ASYNC)
     909             :                         return 0;
     910             :         } else {
     911             :                 /* Buffer already read; all we need to do is check it. */
     912 45685516204 :                 error = xfs_buf_reverify(bp, ops);
     913             : 
     914             :                 /* Readahead already finished; drop the buffer and exit. */
     915 45690818357 :                 if (flags & XBF_ASYNC) {
     916  8553720565 :                         xfs_buf_relse(bp);
     917  8553728511 :                         return 0;
     918             :                 }
     919             : 
     920             :                 /* We do not want read in the flags */
     921 37137097792 :                 bp->b_flags &= ~XBF_READ;
     922 37137097792 :                 ASSERT(bp->b_ops != NULL || ops == NULL);
     923             :         }
     924             : 
     925             :         /*
     926             :          * If we've had a read error, then the contents of the buffer are
     927             :          * invalid and should not be used. To ensure that a followup read tries
     928             :          * to pull the buffer from disk again, we clear the XBF_DONE flag and
     929             :          * mark the buffer stale. This ensures that anyone who has a current
      930             :          * reference to the buffer will interpret its contents correctly and
     931             :          * future cache lookups will also treat it as an empty, uninitialised
     932             :          * buffer.
     933             :          */
     934 37146076599 :         if (error) {
     935             :                 /*
     936             :                  * Check against log shutdown for error reporting because
     937             :                  * metadata writeback may require a read first and we need to
     938             :                  * report errors in metadata writeback until the log is shut
     939             :                  * down. High level transaction read functions already check
     940             :                  * against mount shutdown, anyway, so we only need to be
     941             :                  * concerned about low level IO interactions here.
     942             :                  */
     943       72698 :                 if (!xlog_is_shutdown(target->bt_mount->m_log))
     944       18846 :                         xfs_buf_ioerror_alert(bp, fa);
     945             : 
     946       36361 :                 bp->b_flags &= ~XBF_DONE;
     947       36361 :                 xfs_buf_stale(bp);
     948       36366 :                 xfs_buf_relse(bp);
     949             : 
     950             :                 /* bad CRC means corrupted metadata */
     951       36363 :                 if (error == -EFSBADCRC)
     952        2294 :                         error = -EFSCORRUPTED;
     953       36363 :                 return error;
     954             :         }
     955             : 
     956 37146040250 :         *bpp = bp;
     957 37146040250 :         return 0;
     958             : }
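
/*
 * A minimal usage sketch, not part of xfs_buf.c: how a caller might read a
 * single-extent cached metadata buffer through xfs_buf_read_map() above.
 * The mount "mp", block number "blkno", length "numblks" and verifier
 * "my_buf_ops" are assumptions standing in for the caller's context.
 */
static int
example_read_cached(
	struct xfs_mount	*mp,
	xfs_daddr_t		blkno,
	size_t			numblks,
	const struct xfs_buf_ops *my_buf_ops)
{
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_read_map(mp->m_ddev_targp, &map, 1, 0, &bp,
			my_buf_ops, __this_address);
	if (error)
		return error;

	/* The buffer comes back locked and verified; bp->b_addr is mapped. */
	xfs_buf_relse(bp);
	return 0;
}
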
     959             : 
     960             : /*
     961             :  *      If we are not low on memory then do the readahead in a deadlock
     962             :  *      safe manner.
     963             :  */
     964             : void
     965  8750790689 : xfs_buf_readahead_map(
     966             :         struct xfs_buftarg      *target,
     967             :         struct xfs_buf_map      *map,
     968             :         int                     nmaps,
     969             :         const struct xfs_buf_ops *ops)
     970             : {
     971  8750790689 :         struct xfs_buf          *bp;
     972             : 
     973 17501609699 :         xfs_buf_read_map(target, map, nmaps,
     974             :                      XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
     975  8750790689 :                      __this_address);
     976  8752205812 : }
     977             : 
     978             : /*
     979             :  * Read an uncached buffer from disk. Allocates and returns a locked
     980             :  * buffer containing the disk contents or nothing. Uncached buffers always have
     981             :  * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
     982             :  * is cached or uncached during fault diagnosis.
     983             :  */
     984             : int
     985      318624 : xfs_buf_read_uncached(
     986             :         struct xfs_buftarg      *target,
     987             :         xfs_daddr_t             daddr,
     988             :         size_t                  numblks,
     989             :         xfs_buf_flags_t         flags,
     990             :         struct xfs_buf          **bpp,
     991             :         const struct xfs_buf_ops *ops)
     992             : {
     993      318624 :         struct xfs_buf          *bp;
     994      318624 :         int                     error;
     995             : 
     996      318624 :         *bpp = NULL;
     997             : 
     998      318624 :         error = xfs_buf_get_uncached(target, numblks, flags, &bp);
     999      318638 :         if (error)
    1000             :                 return error;
    1001             : 
    1002             :         /* set up the buffer for a read IO */
    1003      318638 :         ASSERT(bp->b_map_count == 1);
    1004      318638 :         bp->b_rhash_key = XFS_BUF_DADDR_NULL;
    1005      318638 :         bp->b_maps[0].bm_bn = daddr;
    1006      318638 :         bp->b_flags |= XBF_READ;
    1007      318638 :         bp->b_ops = ops;
    1008             : 
    1009      318638 :         xfs_buf_submit(bp);
    1010      318688 :         if (bp->b_error) {
    1011         585 :                 error = bp->b_error;
    1012         585 :                 xfs_buf_relse(bp);
    1013         585 :                 return error;
    1014             :         }
    1015             : 
    1016      318103 :         *bpp = bp;
    1017      318103 :         return 0;
    1018             : }
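
/*
 * A minimal usage sketch, not part of xfs_buf.c: reading a range that
 * bypasses the buffer cache via xfs_buf_read_uncached() above. The caller
 * owns the only reference and must release the buffer explicitly; the names
 * "btp", "daddr", "numblks" and "ops" stand in for the caller's context.
 */
static int
example_read_uncached(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	size_t			numblks,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_read_uncached(btp, daddr, numblks, 0, &bp, ops);
	if (error)
		return error;

	/* bp is locked and holds the disk contents; drop it when done. */
	xfs_buf_relse(bp);
	return 0;
}
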
    1019             : 
    1020             : int
    1021      699242 : xfs_buf_get_uncached(
    1022             :         struct xfs_buftarg      *target,
    1023             :         size_t                  numblks,
    1024             :         xfs_buf_flags_t         flags,
    1025             :         struct xfs_buf          **bpp)
    1026             : {
    1027      699242 :         int                     error;
    1028      699242 :         struct xfs_buf          *bp;
    1029      699242 :         DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
    1030             : 
    1031      699242 :         *bpp = NULL;
    1032             : 
    1033             :         /* flags might contain irrelevant bits, pass only what we care about */
    1034      699242 :         error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
    1035      699333 :         if (error)
    1036             :                 return error;
    1037             : 
    1038      699334 :         error = xfs_buf_alloc_pages(bp, flags);
    1039      699306 :         if (error)
    1040           0 :                 goto fail_free_buf;
    1041             : 
    1042      699306 :         error = _xfs_buf_map_pages(bp, 0);
    1043      699241 :         if (unlikely(error)) {
    1044           0 :                 xfs_warn(target->bt_mount,
    1045             :                         "%s: failed to map pages", __func__);
    1046           0 :                 goto fail_free_buf;
    1047             :         }
    1048             : 
    1049      699241 :         trace_xfs_buf_get_uncached(bp, _RET_IP_);
    1050      699246 :         *bpp = bp;
    1051      699246 :         return 0;
    1052             : 
    1053           0 : fail_free_buf:
    1054           0 :         xfs_buf_free(bp);
    1055           0 :         return error;
    1056             : }
    1057             : 
    1058             : /*
    1059             :  *      Increment reference count on buffer, to hold the buffer concurrently
    1060             :  *      with another thread which may release (free) the buffer asynchronously.
    1061             :  *      Must hold the buffer already to call this function.
    1062             :  */
    1063             : void
    1064 15809541069 : xfs_buf_hold(
    1065             :         struct xfs_buf          *bp)
    1066             : {
    1067 15809541069 :         trace_xfs_buf_hold(bp, _RET_IP_);
    1068 15804214655 :         atomic_inc(&bp->b_hold);
    1069 15830594888 : }
    1070             : 
    1071             : /*
    1072             :  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
    1073             :  * placed on LRU or freed (depending on b_lru_ref).
    1074             :  */
    1075             : void
    1076 62036122983 : xfs_buf_rele(
    1077             :         struct xfs_buf          *bp)
    1078             : {
    1079 62036122983 :         struct xfs_perag        *pag = bp->b_pag;
    1080 62036122983 :         struct xfs_buf_cache    *bch = bp->b_cache;
    1081 62036122983 :         bool                    release;
    1082 62036122983 :         bool                    freebuf = false;
    1083             : 
    1084 62036122983 :         trace_xfs_buf_rele(bp, _RET_IP_);
    1085             : 
    1086 62026770626 :         if (!bch) {
    1087    24972114 :                 ASSERT(list_empty(&bp->b_lru));
    1088    24972114 :                 if (atomic_dec_and_test(&bp->b_hold)) {
    1089      699371 :                         xfs_buf_ioacct_dec(bp);
    1090      699376 :                         xfs_buf_free(bp);
    1091             :                 }
    1092    24972090 :                 return;
    1093             :         }
    1094             : 
    1095 62001798512 :         ASSERT(atomic_read(&bp->b_hold) > 0);
    1096             : 
    1097             :         /*
    1098             :          * We grab the b_lock here first to serialise racing xfs_buf_rele()
    1099             :          * calls. The pag_buf_lock being taken on the last reference only
    1100             :          * serialises against racing lookups in xfs_buf_find(). IOWs, the second
    1101             :          * to last reference we drop here is not serialised against the last
    1102             :          * reference until we take bp->b_lock. Hence if we don't grab b_lock
    1103             :          * first, the last "release" reference can win the race to the lock and
    1104             :          * free the buffer before the second-to-last reference is processed,
    1105             :          * leading to a use-after-free scenario.
    1106             :          */
    1107 62001798512 :         spin_lock(&bp->b_lock);
    1108 62085164784 :         release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
    1109 62067195591 :         if (!release) {
    1110             :                 /*
    1111             :                  * Drop the in-flight state if the buffer is already on the LRU
    1112             :                  * and it holds the only reference. This is racy because we
    1113             :                  * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
    1114             :                  * ensures the decrement occurs only once per-buf.
    1115             :                  */
    1116 61922395184 :                 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
    1117 18458208114 :                         __xfs_buf_ioacct_dec(bp);
    1118 61908575961 :                 goto out_unlock;
    1119             :         }
    1120             : 
    1121             :         /* the last reference has been dropped ... */
    1122   144800407 :         __xfs_buf_ioacct_dec(bp);
    1123   144800261 :         if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
    1124             :                 /*
    1125             :                  * If the buffer is added to the LRU take a new reference to the
    1126             :                  * buffer for the LRU and clear the (now stale) dispose list
    1127             :                  * state flag
    1128             :                  */
    1129    57356332 :                 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
    1130    57360090 :                         bp->b_state &= ~XFS_BSTATE_DISPOSE;
    1131    57360090 :                         atomic_inc(&bp->b_hold);
    1132             :                 }
    1133    57361182 :                 spin_unlock(&bch->bc_lock);
    1134             :         } else {
    1135             :                 /*
    1136             :                  * most of the time buffers will already be removed from the
    1137             :                  * LRU, so optimise that case by checking for the
    1138             :                  * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
    1139             :                  * was on was the disposal list
    1140             :                  */
    1141    87443929 :                 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
    1142    43057028 :                         list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
    1143             :                 } else {
    1144    44386901 :                         ASSERT(list_empty(&bp->b_lru));
    1145             :                 }
    1146             : 
    1147    87444567 :                 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1148    87444567 :                 rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
    1149             :                                 xfs_buf_hash_params);
    1150    87444623 :                 spin_unlock(&bch->bc_lock);
    1151    87444695 :                 if (pag)
    1152    86175676 :                         xfs_perag_put(pag);
    1153    87444502 :                 bp->b_cache = NULL;
    1154    87444502 :                 bp->b_pag = NULL;
    1155    87444502 :                 freebuf = true;
    1156             :         }
    1157             : 
    1158 62053380975 : out_unlock:
    1159 62053380975 :         spin_unlock(&bp->b_lock);
    1160             : 
    1161 62083722709 :         if (freebuf)
    1162    87444226 :                 xfs_buf_free(bp);
    1163             : }
    1164             : 
    1165             : 
    1166             : /*
    1167             :  *      Lock a buffer object, if it is not already locked.
    1168             :  *
    1169             :  *      If we come across a stale, pinned, locked buffer, we know that we are
    1170             :  *      being asked to lock a buffer that has been reallocated. Because it is
    1171             :  *      pinned, we know that the log has not been pushed to disk and hence it
    1172             :  *      will still be locked.  Rather than continuing to have trylock attempts
    1173             :  *      fail until someone else pushes the log, push it ourselves before
    1174             :  *      returning.  This means that the xfsaild will not get stuck trying
    1175             :  *      to push on stale inode buffers.
    1176             :  */
    1177             : int
    1178  9211589001 : xfs_buf_trylock(
    1179             :         struct xfs_buf          *bp)
    1180             : {
    1181  9211589001 :         int                     locked;
    1182             : 
    1183  9211589001 :         locked = down_trylock(&bp->b_sema) == 0;
    1184  9212699937 :         if (locked)
    1185  9026040293 :                 trace_xfs_buf_trylock(bp, _RET_IP_);
    1186             :         else
    1187   186659644 :                 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
    1188  9210004989 :         return locked;
    1189             : }
    1190             : 
    1191             : /*
    1192             :  *      Lock a buffer object.
    1193             :  *
    1194             :  *      If we come across a stale, pinned, locked buffer, we know that we
    1195             :  *      are being asked to lock a buffer that has been reallocated. Because
    1196             :  *      it is pinned, we know that the log has not been pushed to disk and
    1197             :  *      hence it will still be locked. Rather than sleeping until someone
    1198             :  *      else pushes the log, push it ourselves before trying to get the lock.
    1199             :  */
    1200             : void
    1201 37101231076 : xfs_buf_lock(
    1202             :         struct xfs_buf          *bp)
    1203             : {
    1204 37101231076 :         trace_xfs_buf_lock(bp, _RET_IP_);
    1205             : 
    1206 37090566171 :         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
    1207       48647 :                 xfs_log_force(bp->b_mount, 0);
    1208 37090566171 :         down(&bp->b_sema);
    1209             : 
    1210 37115922919 :         trace_xfs_buf_lock_done(bp, _RET_IP_);
    1211 37086345874 : }
    1212             : 
    1213             : void
    1214 46216774919 : xfs_buf_unlock(
    1215             :         struct xfs_buf          *bp)
    1216             : {
    1217 46216774919 :         ASSERT(xfs_buf_islocked(bp));
    1218             : 
    1219 46216774919 :         up(&bp->b_sema);
    1220 46224984009 :         trace_xfs_buf_unlock(bp, _RET_IP_);
    1221 46192631159 : }
    1222             : 
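                      : /*
                      :  * Wait for the buffer's pin count to drop to zero before a write is
                      :  * submitted against it.  This is the classic open-coded wait-queue pattern:
                      :  * add ourselves to b_waiters, re-check the pin count after setting
                      :  * TASK_UNINTERRUPTIBLE so a wakeup cannot be missed, and sleep in
                      :  * io_schedule() until the unpin side wakes the queue.
                      :  */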
    1223             : STATIC void
    1224   208111222 : xfs_buf_wait_unpin(
    1225             :         struct xfs_buf          *bp)
    1226             : {
    1227   208111222 :         DECLARE_WAITQUEUE       (wait, current);
    1228             : 
    1229   208111222 :         if (atomic_read(&bp->b_pin_count) == 0)
    1230   208094662 :                 return;
    1231             : 
    1232       16560 :         add_wait_queue(&bp->b_waiters, &wait);
    1233       49680 :         for (;;) {
    1234       33120 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1235       33120 :                 if (atomic_read(&bp->b_pin_count) == 0)
    1236             :                         break;
    1237       16560 :                 io_schedule();
    1238             :         }
    1239       16560 :         remove_wait_queue(&bp->b_waiters, &wait);
    1240       16560 :         set_current_state(TASK_RUNNING);
    1241             : }
    1242             : 
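                      : /*
                      :  * Emit at most one metadata I/O error alert per target every five seconds
                      :  * so that a stream of failures does not flood the log.
                      :  */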
    1243             : static void
    1244       12460 : xfs_buf_ioerror_alert_ratelimited(
    1245             :         struct xfs_buf          *bp)
    1246             : {
    1247       12460 :         static unsigned long    lasttime;
    1248       12460 :         static struct xfs_buftarg *lasttarg;
    1249             : 
    1250       12460 :         if (bp->b_target != lasttarg ||
    1251       12163 :             time_after(jiffies, (lasttime + 5*HZ))) {
    1252         311 :                 lasttime = jiffies;
    1253         311 :                 xfs_buf_ioerror_alert(bp, __this_address);
    1254             :         }
    1255       12460 :         lasttarg = bp->b_target;
    1256       12460 : }
    1257             : 
    1258             : /*
    1259             :  * Account for this latest trip around the retry handler, and decide if
    1260             :  * we've failed enough times to constitute a permanent failure.
    1261             :  */
    1262             : static bool
    1263       11631 : xfs_buf_ioerror_permanent(
    1264             :         struct xfs_buf          *bp,
    1265             :         struct xfs_error_cfg    *cfg)
    1266             : {
    1267       11631 :         struct xfs_mount        *mp = bp->b_mount;
    1268             : 
    1269       11631 :         if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
    1270          74 :             ++bp->b_retries > cfg->max_retries)
    1271             :                 return true;
    1272       11557 :         if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1273           0 :             time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
    1274             :                 return true;
    1275             : 
    1276             :         /* At unmount we may treat errors differently */
    1277       23114 :         if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
    1278           6 :                 return true;
    1279             : 
    1280             :         return false;
    1281             : }
    1282             : 
    1283             : /*
    1284             :  * On a sync write or shutdown we just want to stale the buffer and let the
    1285             :  * caller handle the error in bp->b_error appropriately.
    1286             :  *
    1287             :  * If the write was asynchronous then no one will be looking for the error.  If
    1288             :  * this is the first failure of this type, clear the error state and write the
    1289             :  * buffer out again. This means we always retry an async write failure at least
    1290             :  * once, but we also need to set the buffer up to behave correctly now for
    1291             :  * repeated failures.
    1292             :  *
    1293             :  * If we get repeated async write failures, then we take action according to the
    1294             :  * error configuration we have been set up to use.
    1295             :  *
    1296             :  * Returns true if this function took care of error handling and the caller must
     1297             :  * not touch the buffer again.  Returns false if the caller should proceed with
    1298             :  * normal I/O completion handling.
    1299             :  */
    1300             : static bool
    1301     2779934 : xfs_buf_ioend_handle_error(
    1302             :         struct xfs_buf          *bp)
    1303             : {
    1304     2779934 :         struct xfs_mount        *mp = bp->b_mount;
    1305     2779934 :         struct xfs_error_cfg    *cfg;
    1306             : 
    1307             :         /*
    1308             :          * If we've already shutdown the journal because of I/O errors, there's
    1309             :          * no point in giving this a retry.
    1310             :          */
    1311     5559868 :         if (xlog_is_shutdown(mp->m_log))
    1312     2767474 :                 goto out_stale;
    1313             : 
    1314       12460 :         xfs_buf_ioerror_alert_ratelimited(bp);
    1315             : 
    1316             :         /*
    1317             :          * We're not going to bother about retrying this during recovery.
    1318             :          * One strike!
    1319             :          */
    1320       12460 :         if (bp->b_flags & _XBF_LOGRECOVERY) {
    1321           0 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1322           0 :                 return false;
    1323             :         }
    1324             : 
    1325             :         /*
    1326             :          * Synchronous writes will have callers process the error.
    1327             :          */
    1328       12460 :         if (!(bp->b_flags & XBF_ASYNC))
    1329         142 :                 goto out_stale;
    1330             : 
    1331       12318 :         trace_xfs_buf_iodone_async(bp, _RET_IP_);
    1332             : 
    1333       12318 :         cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
    1334       12318 :         if (bp->b_last_error != bp->b_error ||
    1335       11631 :             !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
    1336         687 :                 bp->b_last_error = bp->b_error;
    1337         687 :                 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
    1338         617 :                     !bp->b_first_retry_time)
    1339         617 :                         bp->b_first_retry_time = jiffies;
    1340         687 :                 goto resubmit;
    1341             :         }
    1342             : 
    1343             :         /*
    1344             :          * Permanent error - we need to trigger a shutdown if we haven't already
    1345             :          * to indicate that inconsistency will result from this action.
    1346             :          */
    1347       11631 :         if (xfs_buf_ioerror_permanent(bp, cfg)) {
    1348          80 :                 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
    1349          80 :                 goto out_stale;
    1350             :         }
    1351             : 
    1352             :         /* Still considered a transient error. Caller will schedule retries. */
    1353       11551 :         if (bp->b_flags & _XBF_INODES)
    1354         179 :                 xfs_buf_inode_io_fail(bp);
    1355       11372 :         else if (bp->b_flags & _XBF_DQUOTS)
    1356         131 :                 xfs_buf_dquot_io_fail(bp);
    1357             :         else
    1358       11241 :                 ASSERT(list_empty(&bp->b_li_list));
    1359       11551 :         xfs_buf_ioerror(bp, 0);
    1360       11551 :         xfs_buf_relse(bp);
    1361       11551 :         return true;
    1362             : 
    1363             : resubmit:
    1364         687 :         xfs_buf_ioerror(bp, 0);
    1365         687 :         bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
    1366         687 :         xfs_buf_submit(bp);
    1367         687 :         return true;
    1368     2767696 : out_stale:
    1369     2767696 :         xfs_buf_stale(bp);
    1370     2767696 :         bp->b_flags |= XBF_DONE;
    1371     2767696 :         bp->b_flags &= ~XBF_WRITE;
    1372     2767696 :         trace_xfs_buf_error_relse(bp, _RET_IP_);
    1373     2767696 :         return false;
    1374             : }
    1375             : 
    1376             : static void
    1377   242341251 : xfs_buf_ioend(
    1378             :         struct xfs_buf  *bp)
    1379             : {
    1380   242341251 :         trace_xfs_buf_iodone(bp, _RET_IP_);
    1381             : 
    1382             :         /*
    1383             :          * Pull in IO completion errors now. We are guaranteed to be running
    1384             :          * single threaded, so we don't need the lock to read b_io_error.
    1385             :          */
    1386   242341163 :         if (!bp->b_error && bp->b_io_error)
    1387       56061 :                 xfs_buf_ioerror(bp, bp->b_io_error);
    1388             : 
    1389   242341163 :         if (bp->b_flags & XBF_READ) {
    1390    31460378 :                 if (!bp->b_error && bp->b_ops)
    1391    28817423 :                         bp->b_ops->verify_read(bp);
    1392    31460378 :                 if (!bp->b_error)
    1393    31392594 :                         bp->b_flags |= XBF_DONE;
    1394             :         } else {
    1395   210880785 :                 if (!bp->b_error) {
    1396   208100855 :                         bp->b_flags &= ~XBF_WRITE_FAIL;
    1397   208100855 :                         bp->b_flags |= XBF_DONE;
    1398             :                 }
    1399             : 
    1400   210880785 :                 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
    1401             :                         return;
    1402             : 
    1403             :                 /* clear the retry state */
    1404   210868548 :                 bp->b_last_error = 0;
    1405   210868548 :                 bp->b_retries = 0;
    1406   210868548 :                 bp->b_first_retry_time = 0;
    1407             : 
    1408             :                 /*
    1409             :                  * Note that for things like remote attribute buffers, there may
    1410             :                  * not be a buffer log item here, so processing the buffer log
    1411             :                  * item must remain optional.
    1412             :                  */
    1413   210868548 :                 if (bp->b_log_item)
    1414   118880450 :                         xfs_buf_item_done(bp);
    1415             : 
    1416   210868487 :                 if (bp->b_flags & _XBF_INODES)
    1417    55187190 :                         xfs_buf_inode_iodone(bp);
    1418   155681297 :                 else if (bp->b_flags & _XBF_DQUOTS)
    1419    29861491 :                         xfs_buf_dquot_iodone(bp);
    1420             : 
    1421             :         }
    1422             : 
    1423   242328865 :         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
    1424             :                          _XBF_LOGRECOVERY);
    1425             : 
    1426   242328865 :         if (bp->b_flags & XBF_ASYNC)
    1427   198598369 :                 xfs_buf_relse(bp);
    1428             :         else
    1429    43730496 :                 complete(&bp->b_iowait);
    1430             : }
    1431             : 
    1432             : static void
    1433   239217823 : xfs_buf_ioend_work(
    1434             :         struct work_struct      *work)
    1435             : {
    1436   239217823 :         struct xfs_buf          *bp =
    1437   239217823 :                 container_of(work, struct xfs_buf, b_ioend_work);
    1438             : 
    1439   239217823 :         xfs_buf_ioend(bp);
    1440   239217823 : }
    1441             : 
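                      : /*
                      :  * Punt I/O completion processing to the per-mount buffer workqueue so that
                      :  * it runs in process context rather than in the bio completion path.
                      :  */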
    1442             : static void
    1443   239217824 : xfs_buf_ioend_async(
    1444             :         struct xfs_buf  *bp)
    1445             : {
    1446   239217824 :         INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
    1447   239217824 :         queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
    1448   239217824 : }
    1449             : 
    1450             : void
    1451    90888109 : __xfs_buf_ioerror(
    1452             :         struct xfs_buf          *bp,
    1453             :         int                     error,
    1454             :         xfs_failaddr_t          failaddr)
    1455             : {
    1456    90888109 :         ASSERT(error <= 0 && error >= -1000);
    1457    90888109 :         bp->b_error = error;
    1458    90888109 :         trace_xfs_buf_ioerror(bp, error, failaddr);
    1459    90759305 : }
    1460             : 
    1461             : void
    1462       19154 : xfs_buf_ioerror_alert(
    1463             :         struct xfs_buf          *bp,
    1464             :         xfs_failaddr_t          func)
    1465             : {
    1466       19154 :         xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
    1467             :                 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
    1468             :                                   func, (uint64_t)xfs_buf_daddr(bp),
    1469       19154 :                                   bp->b_length, -bp->b_error);
    1470       19169 : }
    1471             : 
    1472             : /*
    1473             :  * To simulate an I/O failure, the buffer must be locked and held with at least
    1474             :  * three references. The LRU reference is dropped by the stale call. The buf
    1475             :  * item reference is dropped via ioend processing. The third reference is owned
    1476             :  * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
    1477             :  */
    1478             : void
    1479     2783425 : xfs_buf_ioend_fail(
    1480             :         struct xfs_buf  *bp)
    1481             : {
    1482     2783425 :         bp->b_flags &= ~XBF_DONE;
    1483     2783425 :         xfs_buf_stale(bp);
    1484     2783425 :         xfs_buf_ioerror(bp, -EIO);
    1485     2783425 :         xfs_buf_ioend(bp);
    1486     2783425 : }
    1487             : 
    1488             : int
    1489      447762 : xfs_bwrite(
    1490             :         struct xfs_buf          *bp)
    1491             : {
    1492      447762 :         int                     error;
    1493             : 
    1494      447762 :         ASSERT(xfs_buf_islocked(bp));
    1495             : 
    1496      447762 :         bp->b_flags |= XBF_WRITE;
    1497      447762 :         bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
    1498             :                          XBF_DONE);
    1499             : 
    1500      447762 :         error = xfs_buf_submit(bp);
    1501      447827 :         if (error)
    1502         142 :                 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
    1503      447827 :         return error;
    1504             : }
    1505             : 
    1506             : static void
    1507   239230501 : xfs_buf_bio_end_io(
    1508             :         struct bio              *bio)
    1509             : {
    1510   239230501 :         struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
    1511             : 
    1512   239230501 :         if (!bio->bi_status &&
    1513   415964299 :             (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
    1514   176789857 :             XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
    1515           0 :                 bio->bi_status = BLK_STS_IOERR;
    1516             : 
    1517             :         /*
    1518             :          * don't overwrite existing errors - otherwise we can lose errors on
    1519             :          * buffers that require multiple bios to complete.
    1520             :          */
    1521   239230501 :         if (bio->bi_status) {
    1522       56059 :                 int error = blk_status_to_errno(bio->bi_status);
    1523             : 
    1524       56060 :                 cmpxchg(&bp->b_io_error, 0, error);
    1525             :         }
    1526             : 
    1527   239230503 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1528             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1529             : 
    1530   239230503 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1531   239166680 :                 xfs_buf_ioend_async(bp);
    1532   239230503 :         bio_put(bio);
    1533   239230502 : }
    1534             : 
    1535             : static void
    1536   239216260 : xfs_buf_ioapply_map(
    1537             :         struct xfs_buf  *bp,
    1538             :         int             map,
    1539             :         int             *buf_offset,
    1540             :         int             *count,
    1541             :         blk_opf_t       op)
    1542             : {
    1543   239216260 :         int             page_index;
    1544   239216260 :         unsigned int    total_nr_pages = bp->b_page_count;
    1545   239216260 :         int             nr_pages;
    1546   239216260 :         struct bio      *bio;
    1547   239216260 :         sector_t        sector =  bp->b_maps[map].bm_bn;
    1548   239216260 :         int             size;
    1549   239216260 :         int             offset;
    1550             : 
    1551             :         /* skip the pages in the buffer before the start offset */
    1552   239216260 :         page_index = 0;
    1553   239216260 :         offset = *buf_offset;
    1554   239216926 :         while (offset >= PAGE_SIZE) {
    1555         666 :                 page_index++;
    1556         666 :                 offset -= PAGE_SIZE;
    1557             :         }
    1558             : 
    1559             :         /*
    1560             :          * Limit the IO size to the length of the current vector, and update the
    1561             :          * remaining IO count for the next time around.
    1562             :          */
    1563   239216260 :         size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
    1564   239216260 :         *count -= size;
    1565   239216260 :         *buf_offset += size;
    1566             : 
    1567   239216260 : next_chunk:
    1568   239216260 :         atomic_inc(&bp->b_io_remaining);
    1569   239224012 :         nr_pages = bio_max_segs(total_nr_pages);
    1570             : 
    1571   239224012 :         bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
    1572   239221660 :         bio->bi_iter.bi_sector = sector;
    1573   239221660 :         bio->bi_end_io = xfs_buf_bio_end_io;
    1574   239221660 :         bio->bi_private = bp;
    1575             : 
    1576   690895793 :         for (; size && nr_pages; nr_pages--, page_index++) {
    1577   451685910 :                 int     rbytes, nbytes = PAGE_SIZE - offset;
    1578             : 
    1579   451685910 :                 if (nbytes > size)
    1580             :                         nbytes = size;
    1581             : 
    1582   451685910 :                 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
    1583             :                                       offset);
    1584   451674133 :                 if (rbytes < nbytes)
    1585             :                         break;
    1586             : 
    1587   451674133 :                 offset = 0;
    1588   451674133 :                 sector += BTOBB(nbytes);
    1589   451674133 :                 size -= nbytes;
    1590   451674133 :                 total_nr_pages--;
    1591             :         }
    1592             : 
    1593   239209883 :         if (likely(bio->bi_iter.bi_size)) {
    1594   239209883 :                 if (xfs_buf_is_vmapped(bp)) {
    1595             :                         flush_kernel_vmap_range(bp->b_addr,
    1596             :                                                 xfs_buf_vmap_len(bp));
    1597             :                 }
    1598   239209883 :                 submit_bio(bio);
    1599   239209341 :                 if (size)
    1600           0 :                         goto next_chunk;
    1601             :         } else {
    1602             :                 /*
    1603             :                  * This is guaranteed not to be the last io reference count
    1604             :                  * because the caller (xfs_buf_submit) holds a count itself.
    1605             :                  */
    1606           0 :                 atomic_dec(&bp->b_io_remaining);
    1607           0 :                 xfs_buf_ioerror(bp, -EIO);
    1608           0 :                 bio_put(bio);
    1609             :         }
    1610             : 
    1611   239209341 : }
    1612             : 
    1613             : /* Start a synchronous process-context buffer IO. */
    1614             : static inline void
    1615             : xfs_buf_start_sync_io(
    1616             :         struct xfs_buf  *bp)
    1617             : {
    1618      327044 :         atomic_inc(&bp->b_io_remaining);
    1619             : }
    1620             : 
     1621             : /* Finish a synchronous process-context buffer IO. */
    1622             : static void
    1623      327425 : xfs_buf_end_sync_io(
    1624             :         struct xfs_buf  *bp,
    1625             :         int             error)
    1626             : {
    1627      327425 :         if (error)
    1628           0 :                 cmpxchg(&bp->b_io_error, 0, error);
    1629             : 
    1630      327425 :         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
    1631             :                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
    1632             : 
    1633      327425 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
    1634           0 :                 xfs_buf_ioend(bp);
    1635      327652 : }
    1636             : 
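                      : /*
                      :  * Report whether any page backing this buffer has been marked hardware
                      :  * poisoned (e.g. after an uncorrectable memory error) so that callers can
                      :  * avoid trusting its contents.
                      :  */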
    1637             : bool
    1638   900219423 : xfs_buf_check_poisoned(
    1639             :         struct xfs_buf          *bp)
    1640             : {
    1641   900219423 :         unsigned int            i;
    1642             : 
    1643  1800441980 :         for (i = 0; i < bp->b_page_count; i++) {
    1644   900222557 :                 if (PageHWPoison(bp->b_pages[i]))
    1645             :                         return true;
    1646             :         }
    1647             : 
    1648             :         return false;
    1649             : }
    1650             : 
    1651             : STATIC void
    1652   239549807 : _xfs_buf_ioapply(
    1653             :         struct xfs_buf  *bp)
    1654             : {
    1655   239549807 :         struct blk_plug plug;
    1656   239549807 :         blk_opf_t       op;
    1657   239549807 :         int             offset;
    1658   239549807 :         int             size;
    1659   239549807 :         int             i;
    1660             : 
    1661             :         /*
    1662             :          * Make sure we capture only current IO errors rather than stale errors
    1663             :          * left over from previous use of the buffer (e.g. failed readahead).
    1664             :          */
    1665   239549807 :         bp->b_error = 0;
    1666             : 
    1667   239549807 :         if (bp->b_flags & XBF_WRITE) {
    1668   208111749 :                 op = REQ_OP_WRITE;
    1669             : 
    1670             :                 /*
    1671             :                  * Run the write verifier callback function if it exists. If
    1672             :                  * this function fails it will mark the buffer with an error and
    1673             :                  * the IO should not be dispatched.
    1674             :                  */
    1675   208111749 :                 if (bp->b_ops) {
    1676   208111749 :                         bp->b_ops->verify_write(bp);
    1677   208110038 :                         if (bp->b_error) {
    1678          22 :                                 xfs_force_shutdown(bp->b_mount,
    1679             :                                                    SHUTDOWN_CORRUPT_INCORE);
    1680      327676 :                                 return;
    1681             :                         }
    1682           0 :                 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
    1683           0 :                         struct xfs_mount *mp = bp->b_mount;
    1684             : 
    1685             :                         /*
    1686             :                          * non-crc filesystems don't attach verifiers during
    1687             :                          * log recovery, so don't warn for such filesystems.
    1688             :                          */
    1689           0 :                         if (xfs_has_crc(mp)) {
    1690           0 :                                 xfs_warn(mp,
    1691             :                                         "%s: no buf ops on daddr 0x%llx len %d",
    1692             :                                         __func__, xfs_buf_daddr(bp),
    1693             :                                         bp->b_length);
    1694           0 :                                 xfs_hex_dump(bp->b_addr,
    1695             :                                                 XFS_CORRUPTION_DUMP_LEN);
    1696           0 :                                 dump_stack();
    1697             :                         }
    1698             :                 }
    1699             :         } else {
    1700    31438058 :                 op = REQ_OP_READ;
    1701    31438058 :                 if (bp->b_flags & XBF_READ_AHEAD)
    1702    19040647 :                         op |= REQ_RAHEAD;
    1703             :         }
    1704             : 
    1705             :         /* we only use the buffer cache for meta-data */
    1706   239548074 :         op |= REQ_META;
    1707             : 
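                      :         /*
                      :          * In-memory (xfile backed) buffer targets have no block device to
                      :          * build bios against, so hand the buffer to xfile_buf_ioapply() and
                      :          * account for it as a single synchronous I/O.
                      :          */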
    1708   239548074 :         if (bp->b_target->bt_flags & XFS_BUFTARG_XFILE) {
    1709      327044 :                 int     error;
    1710             : 
    1711      327044 :                 xfs_buf_start_sync_io(bp);
    1712      327752 :                 error = xfile_buf_ioapply(bp);
    1713      327516 :                 xfs_buf_end_sync_io(bp, error);
    1714      327516 :                 return;
    1715             :         }
    1716             : 
    1717             :         /*
    1718             :          * Walk all the vectors issuing IO on them. Set up the initial offset
    1719             :          * into the buffer and the desired IO size before we start -
     1720             :  * xfs_buf_ioapply_map() will modify them appropriately for each
    1721             :          * subsequent call.
    1722             :          */
    1723   239221030 :         offset = bp->b_offset;
    1724   239221030 :         size = BBTOB(bp->b_length);
    1725   239221030 :         blk_start_plug(&plug);
    1726   478439077 :         for (i = 0; i < bp->b_map_count; i++) {
    1727   239218047 :                 xfs_buf_ioapply_map(bp, i, &offset, &size, op);
    1728   239196878 :                 if (bp->b_error)
    1729             :                         break;
    1730   239196878 :                 if (size <= 0)
    1731             :                         break;  /* all done */
    1732             :         }
    1733   239196515 :         blk_finish_plug(&plug);
    1734             : }
    1735             : 
    1736             : /*
    1737             :  * Wait for I/O completion of a sync buffer and return the I/O error code.
    1738             :  */
    1739             : static int
    1740    43712522 : xfs_buf_iowait(
    1741             :         struct xfs_buf  *bp)
    1742             : {
    1743    43712522 :         ASSERT(!(bp->b_flags & XBF_ASYNC));
    1744             : 
    1745    43712522 :         trace_xfs_buf_iowait(bp, _RET_IP_);
    1746    43710858 :         wait_for_completion(&bp->b_iowait);
    1747    43710419 :         trace_xfs_buf_iowait_done(bp, _RET_IP_);
    1748             : 
    1749    43707392 :         return bp->b_error;
    1750             : }
    1751             : 
    1752             : /*
    1753             :  * Buffer I/O submission path, read or write. Asynchronous submission transfers
    1754             :  * the buffer lock ownership and the current reference to the IO. It is not
    1755             :  * safe to reference the buffer after a call to this function unless the caller
    1756             :  * holds an additional reference itself.
    1757             :  */
    1758             : static int
    1759   241268504 : __xfs_buf_submit(
    1760             :         struct xfs_buf  *bp,
    1761             :         bool            wait)
    1762             : {
    1763   241268504 :         int             error = 0;
    1764             : 
    1765   241268504 :         trace_xfs_buf_submit(bp, _RET_IP_);
    1766             : 
    1767   241266814 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    1768             : 
    1769             :         /*
    1770             :          * On log shutdown we stale and complete the buffer immediately. We can
    1771             :          * be called to read the superblock before the log has been set up, so
    1772             :          * be careful checking the log state.
    1773             :          *
    1774             :          * Checking the mount shutdown state here can result in the log tail
    1775             :          * moving inappropriately on disk as the log may not yet be shut down.
    1776             :          * i.e. failing this buffer on mount shutdown can remove it from the AIL
    1777             :          * and move the tail of the log forwards without having written this
    1778             :          * buffer to disk. This corrupts the log tail state in memory, and
    1779             :          * because the log may not be shut down yet, it can then be propagated
    1780             :          * to disk before the log is shutdown. Hence we check log shutdown
    1781             :          * state here rather than mount state to avoid corrupting the log tail
    1782             :          * on shutdown.
    1783             :          */
    1784   482265256 :         if (bp->b_mount->m_log &&
    1785             :             xlog_is_shutdown(bp->b_mount->m_log)) {
    1786     1722013 :                 xfs_buf_ioend_fail(bp);
    1787     1722013 :                 return -EIO;
    1788             :         }
    1789             : 
    1790             :         /*
    1791             :          * Grab a reference so the buffer does not go away underneath us. For
    1792             :          * async buffers, I/O completion drops the callers reference, which
    1793             :          * could occur before submission returns.
    1794             :          */
    1795   239544801 :         xfs_buf_hold(bp);
    1796             : 
    1797   239553115 :         if (bp->b_flags & XBF_WRITE)
    1798   208112686 :                 xfs_buf_wait_unpin(bp);
    1799             : 
    1800             :         /* clear the internal error state to avoid spurious errors */
    1801   239551438 :         bp->b_io_error = 0;
    1802             : 
    1803             :         /*
    1804             :          * Set the count to 1 initially, this will stop an I/O completion
    1805             :          * callout which happens before we have started all the I/O from calling
    1806             :          * xfs_buf_ioend too early.
    1807             :          */
    1808   239551438 :         atomic_set(&bp->b_io_remaining, 1);
    1809   239551438 :         if (bp->b_flags & XBF_ASYNC)
    1810   195843191 :                 xfs_buf_ioacct_inc(bp);
    1811   239551509 :         _xfs_buf_ioapply(bp);
    1812             : 
    1813             :         /*
    1814             :          * If _xfs_buf_ioapply failed, we can get back here with only the IO
    1815             :          * reference we took above. If we drop it to zero, run completion so
    1816             :          * that we don't return to the caller with completion still pending.
    1817             :          */
    1818   239551997 :         if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
    1819      391272 :                 if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
    1820      340128 :                         xfs_buf_ioend(bp);
    1821             :                 else
    1822       51144 :                         xfs_buf_ioend_async(bp);
    1823             :         }
    1824             : 
    1825   239556377 :         if (wait)
    1826    12849770 :                 error = xfs_buf_iowait(bp);
    1827             : 
    1828             :         /*
    1829             :          * Release the hold that keeps the buffer referenced for the entire
    1830             :          * I/O. Note that if the buffer is async, it is not safe to reference
    1831             :          * after this release.
    1832             :          */
    1833   239551980 :         xfs_buf_rele(bp);
    1834   239551980 :         return error;
    1835             : }
    1836             : 
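                      : /*
                      :  * Return a kernel address for the given byte offset into the buffer.  Mapped
                      :  * buffers can use b_addr directly; otherwise look up the backing page and
                      :  * compute the offset within it.
                      :  */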
    1837             : void *
    1838 33695562942 : xfs_buf_offset(
    1839             :         struct xfs_buf          *bp,
    1840             :         size_t                  offset)
    1841             : {
    1842 33695562942 :         struct page             *page;
    1843             : 
    1844 33695562942 :         if (bp->b_addr)
    1845 27907969450 :                 return bp->b_addr + offset;
    1846             : 
    1847  5787593492 :         page = bp->b_pages[offset >> PAGE_SHIFT];
    1848  5787593492 :         return page_address(page) + (offset & (PAGE_SIZE-1));
    1849             : }
    1850             : 
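                      : /*
                      :  * Zero a byte range of the buffer.  The buffer may not be mapped
                      :  * contiguously, so walk it one backing page at a time, clamping each memset
                      :  * to both the page boundary and the end of the buffer.
                      :  */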
    1851             : void
    1852     2591047 : xfs_buf_zero(
    1853             :         struct xfs_buf          *bp,
    1854             :         size_t                  boff,
    1855             :         size_t                  bsize)
    1856             : {
    1857     2591047 :         size_t                  bend;
    1858             : 
    1859     2591047 :         bend = boff + bsize;
    1860    11930622 :         while (boff < bend) {
    1861     9339168 :                 struct page     *page;
    1862     9339168 :                 int             page_index, page_offset, csize;
    1863             : 
    1864     9339168 :                 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
    1865     9339168 :                 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
    1866     9339168 :                 page = bp->b_pages[page_index];
    1867     9339168 :                 csize = min_t(size_t, PAGE_SIZE - page_offset,
    1868             :                                       BBTOB(bp->b_length) - boff);
    1869             : 
    1870     9339168 :                 ASSERT((csize + page_offset) <= PAGE_SIZE);
    1871             : 
    1872     9339575 :                 memset(page_address(page) + page_offset, 0, csize);
    1873             : 
    1874     9339575 :                 boff += csize;
    1875             :         }
    1876     2591454 : }
    1877             : 
    1878             : /*
    1879             :  * Log a message about and stale a buffer that a caller has decided is corrupt.
    1880             :  *
    1881             :  * This function should be called for the kinds of metadata corruption that
     1882             :  * cannot be detected by a verifier, such as incorrect inter-block relationship
    1883             :  * data.  Do /not/ call this function from a verifier function.
    1884             :  *
    1885             :  * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
    1886             :  * be marked stale, but b_error will not be set.  The caller is responsible for
    1887             :  * releasing the buffer or fixing it.
    1888             :  */
    1889             : void
    1890           0 : __xfs_buf_mark_corrupt(
    1891             :         struct xfs_buf          *bp,
    1892             :         xfs_failaddr_t          fa)
    1893             : {
    1894           0 :         ASSERT(bp->b_flags & XBF_DONE);
    1895             : 
    1896           0 :         xfs_buf_corruption_error(bp, fa);
    1897           0 :         xfs_buf_stale(bp);
    1898           0 : }
    1899             : 
    1900             : /*
    1901             :  *      Handling of buffer targets (buftargs).
    1902             :  */
    1903             : 
    1904             : /*
    1905             :  * Wait for any bufs with callbacks that have been submitted but have not yet
    1906             :  * returned. These buffers will have an elevated hold count, so wait on those
    1907             :  * while freeing all the buffers only held by the LRU.
    1908             :  */
    1909             : static enum lru_status
    1910    43109327 : xfs_buftarg_drain_rele(
    1911             :         struct list_head        *item,
    1912             :         struct list_lru_one     *lru,
    1913             :         spinlock_t              *lru_lock,
    1914             :         void                    *arg)
    1915             : 
    1916             : {
    1917    43109327 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    1918    43109327 :         struct list_head        *dispose = arg;
    1919             : 
    1920    43109327 :         if (atomic_read(&bp->b_hold) > 1) {
    1921             :                 /* need to wait, so skip it this pass */
    1922           0 :                 trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
    1923           0 :                 return LRU_SKIP;
    1924             :         }
    1925    43109327 :         if (!spin_trylock(&bp->b_lock))
    1926             :                 return LRU_SKIP;
    1927             : 
    1928             :         /*
    1929             :          * clear the LRU reference count so the buffer doesn't get
    1930             :          * ignored in xfs_buf_rele().
    1931             :          */
    1932    43109369 :         atomic_set(&bp->b_lru_ref, 0);
    1933    43109369 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    1934    43109369 :         list_lru_isolate_move(lru, item, dispose);
    1935    43109362 :         spin_unlock(&bp->b_lock);
    1936    43109362 :         return LRU_REMOVED;
    1937             : }
    1938             : 
    1939             : /*
    1940             :  * Wait for outstanding I/O on the buftarg to complete.
    1941             :  */
    1942             : void
    1943      436294 : xfs_buftarg_wait(
    1944             :         struct xfs_buftarg      *btp)
    1945             : {
    1946             :         /*
    1947             :          * First wait on the buftarg I/O count for all in-flight buffers to be
    1948             :          * released. This is critical as new buffers do not make the LRU until
    1949             :          * they are released.
    1950             :          *
    1951             :          * Next, flush the buffer workqueue to ensure all completion processing
    1952             :          * has finished. Just waiting on buffer locks is not sufficient for
    1953             :          * async IO as the reference count held over IO is not released until
    1954             :          * after the buffer lock is dropped. Hence we need to ensure here that
    1955             :          * all reference counts have been dropped before we start walking the
    1956             :          * LRU list.
    1957             :          */
    1958      438378 :         while (percpu_counter_sum(&btp->bt_io_count))
    1959        2084 :                 delay(100);
    1960      436420 :         flush_workqueue(btp->bt_mount->m_buf_workqueue);
    1961      436759 : }
    1962             : 
    1963             : void
    1964      297480 : xfs_buftarg_drain(
    1965             :         struct xfs_buftarg      *btp)
    1966             : {
    1967      297480 :         LIST_HEAD(dispose);
    1968      297480 :         int                     loop = 0;
    1969      297480 :         bool                    write_fail = false;
    1970             : 
    1971      297480 :         xfs_buftarg_wait(btp);
    1972             : 
    1973             :         /* loop until there is nothing left on the lru list. */
    1974      592419 :         while (list_lru_count(&btp->bt_lru)) {
    1975      295052 :                 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
    1976             :                               &dispose, LONG_MAX);
    1977             : 
    1978    43403494 :                 while (!list_empty(&dispose)) {
    1979    43108468 :                         struct xfs_buf *bp;
    1980    43108468 :                         bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    1981    43108468 :                         list_del_init(&bp->b_lru);
    1982    43108472 :                         if (bp->b_flags & XBF_WRITE_FAIL) {
    1983           0 :                                 write_fail = true;
    1984           0 :                                 xfs_buf_alert_ratelimited(bp,
    1985             :                                         "XFS: Corruption Alert",
    1986             : "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
    1987             :                                         (long long)xfs_buf_daddr(bp));
    1988             :                         }
    1989    43108472 :                         xfs_buf_rele(bp);
    1990             :                 }
    1991      294939 :                 if (loop++ != 0)
    1992           0 :                         delay(100);
    1993             :         }
    1994             : 
    1995             :         /*
    1996             :          * If one or more failed buffers were freed, that means dirty metadata
    1997             :          * was thrown away. This should only ever happen after I/O completion
    1998             :          * handling has elevated I/O error(s) to permanent failures and shuts
    1999             :          * down the journal.
    2000             :          */
    2001      297612 :         if (write_fail) {
    2002           0 :                 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
    2003           0 :                 xfs_alert(btp->bt_mount,
    2004             :               "Please run xfs_repair to determine the extent of the problem.");
    2005             :         }
    2006      297612 : }
    2007             : 
    2008             : static enum lru_status
    2009     3715170 : xfs_buftarg_isolate(
    2010             :         struct list_head        *item,
    2011             :         struct list_lru_one     *lru,
    2012             :         spinlock_t              *lru_lock,
    2013             :         void                    *arg)
    2014             : {
    2015     3715170 :         struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
    2016     3715170 :         struct list_head        *dispose = arg;
    2017             : 
    2018             :         /*
    2019             :          * we are inverting the lru lock/bp->b_lock here, so use a trylock.
    2020             :          * If we fail to get the lock, just skip it.
    2021             :          */
    2022     3715170 :         if (!spin_trylock(&bp->b_lock))
    2023             :                 return LRU_SKIP;
    2024             :         /*
    2025             :          * Decrement the b_lru_ref count unless the value is already
    2026             :          * zero. If the value is already zero, we need to reclaim the
    2027             :          * buffer, otherwise it gets another trip through the LRU.
    2028             :          */
    2029     7430338 :         if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
    2030     2432682 :                 spin_unlock(&bp->b_lock);
    2031     2432682 :                 return LRU_ROTATE;
    2032             :         }
    2033             : 
    2034     1282487 :         bp->b_state |= XFS_BSTATE_DISPOSE;
    2035     1282487 :         list_lru_isolate_move(lru, item, dispose);
    2036     1282487 :         spin_unlock(&bp->b_lock);
    2037     1282487 :         return LRU_REMOVED;
    2038             : }
    2039             : 
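                      : /*
                      :  * Shrinker callbacks: the scan side isolates buffers from the LRU onto a
                      :  * private dispose list under the lru_lock, then drops the LRU's hold on
                      :  * each buffer outside that lock; the count side just reports the LRU
                      :  * population.
                      :  */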
    2040             : static unsigned long
    2041       31383 : xfs_buftarg_shrink_scan(
    2042             :         struct shrinker         *shrink,
    2043             :         struct shrink_control   *sc)
    2044             : {
    2045       31383 :         struct xfs_buftarg      *btp = container_of(shrink,
    2046             :                                         struct xfs_buftarg, bt_shrinker);
    2047       31383 :         LIST_HEAD(dispose);
    2048       31383 :         unsigned long           freed;
    2049             : 
    2050       31383 :         freed = list_lru_shrink_walk(&btp->bt_lru, sc,
    2051             :                                      xfs_buftarg_isolate, &dispose);
    2052             : 
    2053     1313870 :         while (!list_empty(&dispose)) {
    2054     1282487 :                 struct xfs_buf *bp;
    2055     1282487 :                 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
    2056     1282487 :                 list_del_init(&bp->b_lru);
    2057     1282487 :                 xfs_buf_rele(bp);
    2058             :         }
    2059             : 
    2060       31383 :         return freed;
    2061             : }
    2062             : 
    2063             : static unsigned long
    2064       14404 : xfs_buftarg_shrink_count(
    2065             :         struct shrinker         *shrink,
    2066             :         struct shrink_control   *sc)
    2067             : {
    2068       14404 :         struct xfs_buftarg      *btp = container_of(shrink,
    2069             :                                         struct xfs_buftarg, bt_shrinker);
    2070       14404 :         return list_lru_shrink_count(&btp->bt_lru, sc);
    2071             : }
    2072             : 
    2073             : void
    2074      282899 : xfs_free_buftarg(
    2075             :         struct xfs_buftarg      *btp)
    2076             : {
    2077      282899 :         unregister_shrinker(&btp->bt_shrinker);
    2078      283231 :         ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
    2079      283363 :         percpu_counter_destroy(&btp->bt_io_count);
    2080      283404 :         list_lru_destroy(&btp->bt_lru);
    2081             : 
    2082      283403 :         if (!(btp->bt_flags & XFS_BUFTARG_XFILE)) {
    2083      115226 :                 blkdev_issue_flush(btp->bt_bdev);
    2084      115226 :                 invalidate_bdev(btp->bt_bdev);
    2085      115226 :                 fs_put_dax(btp->bt_daxdev, btp->bt_mount);
    2086             :         }
    2087             : 
    2088      283403 :         kvfree(btp);
    2089      283373 : }
    2090             : 
    2091             : int
    2092      229678 : xfs_setsize_buftarg(
    2093             :         xfs_buftarg_t           *btp,
    2094             :         unsigned int            sectorsize)
    2095             : {
    2096             :         /* Set up metadata sector size info */
    2097      229678 :         btp->bt_meta_sectorsize = sectorsize;
    2098      229678 :         btp->bt_meta_sectormask = sectorsize - 1;
    2099             : 
    2100      229678 :         if (set_blocksize(btp->bt_bdev, sectorsize)) {
    2101           0 :                 xfs_warn(btp->bt_mount,
    2102             :                         "Cannot set_blocksize to %u on device %pg",
    2103             :                         sectorsize, btp->bt_bdev);
    2104           0 :                 return -EINVAL;
    2105             :         }
    2106             : 
    2107             :         /* Set up device logical sector size mask */
    2108      229678 :         btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
    2109      229678 :         btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
    2110             : 
    2111      229678 :         return 0;
    2112             : }
    2113             : 
    2114             : /*
    2115             :  * When allocating the initial buffer target we have not yet
    2116             :  * read in the superblock, so we don't know what size sectors
    2117             :  * are being used at this early stage.  Play safe.
    2118             :  */
    2119             : STATIC int
    2120      115207 : xfs_setsize_buftarg_early(
    2121             :         xfs_buftarg_t           *btp,
    2122             :         struct block_device     *bdev)
    2123             : {
    2124      230414 :         return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
    2125             : }
    2126             : 
    2127             : struct xfs_buftarg *
    2128      282951 : xfs_alloc_buftarg_common(
    2129             :         struct xfs_mount        *mp,
    2130             :         const char              *descr)
    2131             : {
    2132      282951 :         struct xfs_buftarg      *btp;
    2133             : 
    2134      282951 :         btp = kzalloc(sizeof(*btp), GFP_NOFS);
    2135      283146 :         if (!btp)
    2136             :                 return NULL;
    2137             : 
    2138      283146 :         btp->bt_mount = mp;
    2139             : 
    2140             :         /*
    2141             :          * Buffer IO error rate limiting. Limit it to no more than 10 messages
    2142             :          * per 30 seconds so that repeated errors do not spam the logs.
    2143             :          */
    2144      283146 :         ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
    2145             :                              DEFAULT_RATELIMIT_BURST);
    2146             : 
    2147      282998 :         if (list_lru_init(&btp->bt_lru))
    2148           0 :                 goto error_free;
    2149             : 
    2150      282923 :         if (percpu_counter_init(&btp->bt_io_count, 0, GFP_NOFS))
    2151           0 :                 goto error_lru;
    2152             : 
    2153      283384 :         btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
    2154      283384 :         btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
    2155      283384 :         btp->bt_shrinker.seeks = DEFAULT_SEEKS;
    2156      283384 :         btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
    2157      283385 :         if (register_shrinker(&btp->bt_shrinker, "xfs-%s:%s", descr,
    2158      283384 :                               mp->m_super->s_id))
    2159           0 :                 goto error_pcpu;
    2160             : 
    2161             :         return btp;
    2162             : 
    2163             : error_pcpu:
    2164           0 :         percpu_counter_destroy(&btp->bt_io_count);
    2165           0 : error_lru:
    2166           0 :         list_lru_destroy(&btp->bt_lru);
    2167           0 : error_free:
    2168           0 :         kvfree(btp);
    2169           0 :         return NULL;
    2170             : }
    2171             : 
    2172             : static inline void
    2173   209454419 : xfs_buf_list_del(
    2174             :         struct xfs_buf          *bp)
    2175             : {
    2176   209454419 :         list_del_init(&bp->b_list);
    2177   209454298 :         wake_up_var(&bp->b_list);
    2178   209454314 : }
    2179             : 
    2180             : /* Allocate a buffer cache target for a persistent block device. */
    2181             : struct xfs_buftarg *
    2182      115207 : xfs_alloc_buftarg(
    2183             :         struct xfs_mount        *mp,
    2184             :         struct block_device     *bdev)
    2185             : {
    2186      115207 :         struct xfs_buftarg      *btp;
    2187      115207 :         const struct dax_holder_operations *ops = NULL;
    2188             : 
    2189             : #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
    2190      115207 :         ops = &xfs_dax_holder_operations;
    2191             : #endif
    2192             : 
    2193      115207 :         btp = xfs_alloc_buftarg_common(mp, "buf");
    2194      115207 :         if (!btp)
    2195             :                 return NULL;
    2196             : 
    2197      115207 :         btp->bt_dev =  bdev->bd_dev;
    2198      115207 :         btp->bt_bdev = bdev;
    2199      115207 :         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
    2200             :                                             mp, ops);
    2201             : 
    2202      115207 :         if (xfs_setsize_buftarg_early(btp, bdev))
    2203           0 :                 goto error_free;
    2204             : 
    2205             :         return btp;
    2206             : 
    2207             : error_free:
    2208           0 :         xfs_free_buftarg(btp);
    2209           0 :         return NULL;
    2210             : }
    2211             : 
    2212             : /*
    2213             :  * Cancel a delayed write list.
    2214             :  *
    2215             :  * Remove each buffer from the list, clear the delwri queue flag and drop the
    2216             :  * associated buffer reference.
    2217             :  */
    2218             : void
    2219      917627 : xfs_buf_delwri_cancel(
    2220             :         struct list_head        *list)
    2221             : {
    2222      917627 :         struct xfs_buf          *bp;
    2223             : 
    2224      917591 :         while (!list_empty(list)) {
    2225           0 :                 bp = list_first_entry(list, struct xfs_buf, b_list);
    2226             : 
    2227           0 :                 xfs_buf_lock(bp);
    2228           0 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2229           0 :                 xfs_buf_list_del(bp);
    2230           0 :                 xfs_buf_relse(bp);
    2231             :         }
    2232      917591 : }
    2233             : 
    2234             : /*
    2235             :  * Add a buffer to the delayed write list.
    2236             :  *
    2237             :  * This queues a buffer for writeout if it hasn't already been queued.  Note that
    2238             :  * neither this routine nor the buffer list submission functions perform
    2239             :  * any internal synchronization.  It is expected that the lists are thread-local
    2240             :  * to the callers.
    2241             :  *
    2242             :  * Returns true if we queued up the buffer, or false if it was already
    2243             :  * on the buffer list.
    2244             :  */
    2245             : bool
    2246   228692662 : xfs_buf_delwri_queue(
    2247             :         struct xfs_buf          *bp,
    2248             :         struct list_head        *list)
    2249             : {
    2250   228692662 :         ASSERT(xfs_buf_islocked(bp));
    2251   228692662 :         ASSERT(!(bp->b_flags & XBF_READ));
    2252             : 
    2253             :         /*
    2254             :          * If the buffer is already marked delwri it has already been queued up
    2255             :          * by someone else for immediate writeout.  Just ignore it in that
    2256             :          * case.
    2257             :          */
    2258   228692662 :         if (bp->b_flags & _XBF_DELWRI_Q) {
    2259    19240646 :                 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
    2260    19240646 :                 return false;
    2261             :         }
    2262             : 
    2263   209452016 :         trace_xfs_buf_delwri_queue(bp, _RET_IP_);
    2264             : 
    2265             :         /*
    2266             :          * If a buffer gets written out synchronously or marked stale while it
    2267             :          * is on a delwri list we lazily remove it. To do this, the other party
    2268             :          * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
    2269             :          * It remains referenced and on the list.  In a rare corner case it
    2270             :          * might get re-added to a delwri list after the synchronous writeout, in
    2271             :          * which case we just need to re-add the flag here.
    2272             :          */
    2273   209451198 :         bp->b_flags |= _XBF_DELWRI_Q;
    2274   209451198 :         if (list_empty(&bp->b_list)) {
    2275   209451207 :                 atomic_inc(&bp->b_hold);
    2276   209453459 :                 list_add_tail(&bp->b_list, list);
    2277             :         }
    2278             : 
    2279             :         return true;
    2280             : }
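
Taken together with xfs_buf_delwri_submit() below, this implies a simple caller pattern: collect locked, modified buffers on a private list, drop the buffer locks, then write the whole list out in one pass. A rough sketch of that pattern, assuming bp is a buffer the caller looked up itself (so it holds its own lock and reference) and has just modified, with error handling trimmed:

    LIST_HEAD(buffer_list);
    int error;

    /* Queueing takes its own hold on bp, so the caller's lock and hold can go. */
    xfs_buf_delwri_queue(bp, &buffer_list);
    xfs_buf_relse(bp);

    /* ... queue further buffers the same way; all must be unlocked before submit ... */

    /* Sync submission: writes every queued buffer, iowaits each, empties the list. */
    error = xfs_buf_delwri_submit(&buffer_list);
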
    2281             : 
    2282             : /*
    2283             :  * Queue a buffer to this delwri list as part of a data integrity operation.
    2284             :  * If the buffer is on any other delwri list, we'll wait for that to clear
    2285             :  * so that the caller can submit the buffer for IO and wait for the result.
    2286             :  * Callers must ensure the buffer is not already on the list.
    2287             :  */
    2288             : void
    2289     5095630 : xfs_buf_delwri_queue_here(
    2290             :         struct xfs_buf          *bp,
    2291             :         struct list_head        *buffer_list)
    2292             : {
    2293             :         /*
    2294             :          * We need this buffer to end up on the /caller's/ delwri list, not any
    2295             :          * old list.  This can happen if the buffer is marked stale (which
    2296             :          * clears DELWRI_Q) after the AIL queues the buffer to its list but
    2297             :          * before the AIL has a chance to submit the list.
    2298             :          */
    2299     5094819 :         while (!list_empty(&bp->b_list)) {
    2300          10 :                 xfs_buf_unlock(bp);
    2301          20 :                 wait_var_event(&bp->b_list, list_empty(&bp->b_list));
    2302          10 :                 xfs_buf_lock(bp);
    2303             :         }
    2304             : 
    2305     5094809 :         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
    2306             : 
    2307             :         /* This buffer is uptodate; don't let it get reread. */
    2308     5094809 :         bp->b_flags |= XBF_DONE;
    2309             : 
    2310     5094809 :         xfs_buf_delwri_queue(bp, buffer_list);
    2311     5095887 : }
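
A hypothetical integrity-write caller (scrub/repair style; names invented) therefore looks much like the plain delwri pattern above, except that the buffer contents are already final and the caller insists the buffer lands on its own list so the later submit reports the I/O result back to it:

    LIST_HEAD(buffer_list);
    int error;

    /* bp is locked and its contents have been fully written by the caller. */
    xfs_buf_delwri_queue_here(bp, &buffer_list);
    xfs_buf_relse(bp);                      /* the list keeps its own reference */

    error = xfs_buf_delwri_submit(&buffer_list);
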
    2312             : 
    2313             : /*
    2314             :  * The compare function is more complex than it needs to be because
    2315             :  * the return value is only 32 bits and we are doing comparisons
    2316             :  * on 64-bit values.
    2317             :  */
    2318             : static int
    2319  1867824139 : xfs_buf_cmp(
    2320             :         void                    *priv,
    2321             :         const struct list_head  *a,
    2322             :         const struct list_head  *b)
    2323             : {
    2324  1867824139 :         struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
    2325  1867824139 :         struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
    2326  1867824139 :         xfs_daddr_t             diff;
    2327             : 
    2328  1867824139 :         diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
    2329  1867824139 :         if (diff < 0)
    2330             :                 return -1;
    2331   921388155 :         if (diff > 0)
    2332   921234617 :                 return 1;
    2333             :         return 0;
    2334             : }
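
The hazard the comment above guards against is returning the 64-bit difference truncated to the 32-bit return value, which can report two far-apart block numbers as equal. A stand-alone illustration in plain C (not XFS code):

    #include <stdint.h>
    #include <stdio.h>

    static int cmp_broken(int64_t a, int64_t b)
    {
            return (int)(a - b);            /* truncation loses the sign */
    }

    static int cmp_correct(int64_t a, int64_t b)
    {
            int64_t diff = a - b;

            if (diff < 0)
                    return -1;
            if (diff > 0)
                    return 1;
            return 0;
    }

    int main(void)
    {
            int64_t a = 0, b = (int64_t)1 << 32;    /* 2^32 sectors apart */

            /* prints "broken=0 correct=-1": the broken version calls them equal */
            printf("broken=%d correct=%d\n", cmp_broken(a, b), cmp_correct(a, b));
            return 0;
    }
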
    2335             : 
    2336             : /*
    2337             :  * Submit buffers for write. If wait_list is specified, the buffers are
    2338             :  * submitted using sync I/O and placed on the wait list such that the caller can
    2339             :  * iowait each buffer. Otherwise async I/O is used and the buffers are released
    2340             :  * at I/O completion time. In either case, buffers remain locked until I/O
    2341             :  * completes and the buffer is released from the queue.
    2342             :  */
    2343             : static int
    2344     8605733 : xfs_buf_delwri_submit_buffers(
    2345             :         struct list_head        *buffer_list,
    2346             :         struct list_head        *wait_list)
    2347             : {
    2348     8605733 :         struct xfs_buf          *bp, *n;
    2349     8605733 :         int                     pinned = 0;
    2350     8605733 :         struct blk_plug         plug;
    2351             : 
    2352     8605733 :         list_sort(NULL, buffer_list, xfs_buf_cmp);
    2353             : 
    2354     8604777 :         blk_start_plug(&plug);
    2355   220461733 :         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
    2356   211854645 :                 if (!wait_list) {
    2357   180994647 :                         if (!xfs_buf_trylock(bp))
    2358       91313 :                                 continue;
    2359   180903334 :                         if (xfs_buf_ispinned(bp)) {
    2360     2312120 :                                 xfs_buf_unlock(bp);
    2361     2312120 :                                 pinned++;
    2362     2312120 :                                 continue;
    2363             :                         }
    2364             :                 } else {
    2365    30859998 :                         xfs_buf_lock(bp);
    2366             :                 }
    2367             : 
    2368             :                 /*
    2369             :                  * Someone else might have written the buffer synchronously or
    2370             :                  * marked it stale in the meantime.  In that case only the
    2371             :                  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
    2372             :                  * reference and remove it from the list here.
    2373             :                  */
    2374   209451567 :                 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
    2375       84019 :                         xfs_buf_list_del(bp);
    2376       84019 :                         xfs_buf_relse(bp);
    2377       84019 :                         continue;
    2378             :                 }
    2379             : 
    2380   209367548 :                 trace_xfs_buf_delwri_split(bp, _RET_IP_);
    2381             : 
    2382             :                 /*
    2383             :                  * If we have a wait list, each buffer (and associated delwri
    2384             :                  * queue reference) transfers to it and is submitted
    2385             :                  * synchronously. Otherwise, drop the buffer from the delwri
    2386             :                  * queue and submit async.
    2387             :                  */
    2388   209367865 :                 bp->b_flags &= ~_XBF_DELWRI_Q;
    2389   209367865 :                 bp->b_flags |= XBF_WRITE;
    2390   209367865 :                 if (wait_list) {
    2391    30860670 :                         bp->b_flags &= ~XBF_ASYNC;
    2392    30860670 :                         list_move_tail(&bp->b_list, wait_list);
    2393             :                 } else {
    2394   178507195 :                         bp->b_flags |= XBF_ASYNC;
    2395   178507195 :                         xfs_buf_list_del(bp);
    2396             :                 }
    2397   209367862 :                 __xfs_buf_submit(bp, false);
    2398             :         }
    2399     8607088 :         blk_finish_plug(&plug);
    2400             : 
    2401     8607776 :         return pinned;
    2402             : }
    2403             : 
    2404             : /*
    2405             :  * Write out a buffer list asynchronously.
    2406             :  *
    2407             :  * This will take the @buffer_list, write all non-locked and non-pinned buffers
    2408             :  * out without waiting for I/O completion on any of the buffers.  This interface
    2409             :  * is only safely usable by callers that can track I/O completion by higher
    2410             :  * level means, e.g. AIL pushing, as the @buffer_list is consumed in this
    2411             :  * function.
    2412             :  *
    2413             :  * Note: this function will skip buffers it would block on, and in doing so
    2414             :  * leaves them on @buffer_list so they can be retried on a later pass. As such,
    2415             :  * it is up to the caller to ensure that the buffer list is fully submitted or
    2416             :  * cancelled appropriately when they are finished with the list. Failure to
    2417             :  * cancel or resubmit the list until it is empty will result in leaked buffers
    2418             :  * at unmount time.
    2419             :  */
    2420             : int
    2421     6642540 : xfs_buf_delwri_submit_nowait(
    2422             :         struct list_head        *buffer_list)
    2423             : {
    2424     6642540 :         return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
    2425             : }
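
From the owner's side, the resubmit-or-cancel contract described above looks roughly like the following (the loop structure and the more_work flag are illustrative only, not the actual AIL code):

    LIST_HEAD(buffer_list);

    while (more_work) {
            /* ... xfs_buf_delwri_queue() newly dirtied buffers ... */
            xfs_buf_delwri_submit_nowait(&buffer_list);
            /* locked or pinned buffers were skipped and stay queued for the next pass */
    }

    /* Tearing down: abandoning a non-empty list would leak buffer holds. */
    xfs_buf_delwri_cancel(&buffer_list);
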
    2426             : 
    2427             : /*
    2428             :  * Write out a buffer list synchronously.
    2429             :  *
    2430             :  * This will take the @buffer_list, write all buffers out and wait for I/O
    2431             :  * completion on all of the buffers. @buffer_list is consumed by the function,
    2432             :  * so callers must have some other way of tracking buffers if they require such
    2433             :  * functionality.
    2434             :  */
    2435             : int
    2436     1963018 : xfs_buf_delwri_submit(
    2437             :         struct list_head        *buffer_list)
    2438             : {
    2439     1963018 :         LIST_HEAD               (wait_list);
    2440     1963018 :         int                     error = 0, error2;
    2441     1963018 :         struct xfs_buf          *bp;
    2442             : 
    2443     1963018 :         xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
    2444             : 
    2445             :         /* Wait for IO to complete. */
    2446    32826460 :         while (!list_empty(&wait_list)) {
    2447    30863243 :                 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
    2448             : 
    2449    30863243 :                 xfs_buf_list_del(bp);
    2450             : 
    2451             :                 /*
    2452             :                  * Wait on the locked buffer, check for errors and unlock and
    2453             :                  * release the delwri queue reference.
    2454             :                  */
    2455    30863226 :                 error2 = xfs_buf_iowait(bp);
    2456    30863235 :                 xfs_buf_relse(bp);
    2457    30863442 :                 if (!error)
    2458    30863441 :                         error = error2;
    2459             :         }
    2460             : 
    2461     1965504 :         return error;
    2462             : }
    2463             : 
    2464             : /*
    2465             :  * Push a single buffer on a delwri queue.
    2466             :  *
    2467             :  * The purpose of this function is to submit a single buffer of a delwri queue
    2468             :  * and return with the buffer still on the original queue. The waiting delwri
    2469             :  * buffer submission infrastructure guarantees transfer of the delwri queue
    2470             :  * buffer reference to a temporary wait list. We reuse this infrastructure to
    2471             :  * transfer the buffer back to the original queue.
    2472             :  *
    2473             :  * Note the buffer transitions from the queued state, to the submitted and wait
    2474             :  * listed state and back to the queued state during this call. The buffer
    2475             :  * locking and queue management logic between _delwri_pushbuf() and
    2476             :  * _delwri_queue() guarantee that the buffer cannot be queued to another list
    2477             :  * before returning.
    2478             :  */
    2479             : int
    2480           0 : xfs_buf_delwri_pushbuf(
    2481             :         struct xfs_buf          *bp,
    2482             :         struct list_head        *buffer_list)
    2483             : {
    2484           0 :         LIST_HEAD               (submit_list);
    2485           0 :         int                     error;
    2486             : 
    2487           0 :         ASSERT(bp->b_flags & _XBF_DELWRI_Q);
    2488             : 
    2489           0 :         trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
    2490             : 
    2491             :         /*
    2492             :          * Isolate the buffer to a new local list so we can submit it for I/O
    2493             :          * independently from the rest of the original list.
    2494             :          */
    2495           0 :         xfs_buf_lock(bp);
    2496           0 :         list_move(&bp->b_list, &submit_list);
    2497           0 :         xfs_buf_unlock(bp);
    2498             : 
    2499             :         /*
    2500             :          * Delwri submission clears the DELWRI_Q buffer flag and returns with
    2501             :          * the buffer on the wait list with the original reference. Rather than
    2502             :          * bounce the buffer from a local wait list back to the original list
    2503             :          * after I/O completion, reuse the original list as the wait list.
    2504             :          */
    2505           0 :         xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
    2506             : 
    2507             :         /*
    2508             :          * The buffer is now locked, under I/O and wait listed on the original
    2509             :          * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
    2510             :          * return with the buffer unlocked and on the original queue.
    2511             :          */
    2512           0 :         error = xfs_buf_iowait(bp);
    2513           0 :         bp->b_flags |= _XBF_DELWRI_Q;
    2514           0 :         xfs_buf_unlock(bp);
    2515             : 
    2516           0 :         return error;
    2517             : }
    2518             : 
    2519 31415923611 : void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
    2520             : {
    2521             :         /*
    2522             :          * Set the lru reference count to 0 based on the error injection tag.
    2523             :          * This allows userspace to disrupt buffer caching for debug/testing
    2524             :          * purposes.
    2525             :          */
    2526 31415923611 :         if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
    2527         105 :                 lru_ref = 0;
    2528             : 
    2529 31409328201 :         atomic_set(&bp->b_lru_ref, lru_ref);
    2530 31409328201 : }
    2531             : 
    2532             : /*
    2533             :  * Verify an on-disk magic value against the magic value specified in the
    2534             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2535             :  * expected to pass the value directly from disk.
    2536             :  */
    2537             : bool
    2538  1111576466 : xfs_verify_magic(
    2539             :         struct xfs_buf          *bp,
    2540             :         __be32                  dmagic)
    2541             : {
    2542  1111576466 :         struct xfs_mount        *mp = bp->b_mount;
    2543  1111576466 :         int                     idx;
    2544             : 
    2545  1111576466 :         idx = xfs_has_crc(mp);
    2546  1111576466 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
    2547             :                 return false;
    2548  1111576035 :         return dmagic == bp->b_ops->magic[idx];
    2549             : }
    2550             : /*
    2551             :  * Verify an on-disk magic value against the magic value specified in the
    2552             :  * verifier structure. The verifier magic is in disk byte order so the caller is
    2553             :  * expected to pass the value directly from disk.
    2554             :  */
    2555             : bool
    2556  2319419565 : xfs_verify_magic16(
    2557             :         struct xfs_buf          *bp,
    2558             :         __be16                  dmagic)
    2559             : {
    2560  2319419565 :         struct xfs_mount        *mp = bp->b_mount;
    2561  2319419565 :         int                     idx;
    2562             : 
    2563  2319419565 :         idx = xfs_has_crc(mp);
    2564  2319419565 :         if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
    2565             :                 return false;
    2566  2319419558 :         return dmagic == bp->b_ops->magic16[idx];
    2567             : }
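
Both helpers are intended to be called from a buffer verifier with the raw on-disk magic field, while the per-version magics sit in the buf_ops array in disk byte order (index 0 for !hascrc, index 1 for hascrc). A hypothetical verifier fragment showing that wiring (every "foo" name is invented for illustration):

    struct xfs_foo_block {
            __be32                  foo_magic;      /* stored big-endian on disk */
            /* ... */
    };

    static void
    xfs_foo_read_verify(
            struct xfs_buf          *bp)
    {
            struct xfs_foo_block    *foo = bp->b_addr;

            /* pass the on-disk value straight through; no byte swapping here */
            if (!xfs_verify_magic(bp, foo->foo_magic))
                    xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
    }

    const struct xfs_buf_ops xfs_foo_buf_ops = {
            .name           = "xfs_foo",
            .magic          = { cpu_to_be32(XFS_FOO_MAGIC),         /* !hascrc */
                                cpu_to_be32(XFS_FOO_CRC_MAGIC) },   /* hascrc  */
            .verify_read    = xfs_foo_read_verify,
    };
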
    2568             : 
    2569             : /* Return the number of sectors for a buffer target. */
    2570             : xfs_daddr_t
    2571  7657454112 : xfs_buftarg_nr_sectors(
    2572             :         struct xfs_buftarg      *btp)
    2573             : {
    2574  7657454112 :         if (btp->bt_flags & XFS_BUFTARG_XFILE)
    2575  7657454112 :                 return xfile_buftarg_nr_sectors(btp);
    2576           0 :         return bdev_nr_sectors(btp->bt_bdev);
    2577             : }

Generated by: LCOV version 1.14