LCOV - code coverage report
Current view: top level - fs/ext4 - mballoc.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023 Lines: 2696 3328 81.0 %
Date: 2023-07-31 20:08:34 Functions: 109 120 90.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
       4             :  * Written by Alex Tomas <alex@clusterfs.com>
       5             :  */
       6             : 
       7             : 
       8             : /*
       9             :  * mballoc.c contains the multiblocks allocation routines
      10             :  */
      11             : 
      12             : #include "ext4_jbd2.h"
      13             : #include "mballoc.h"
      14             : #include <linux/log2.h>
      15             : #include <linux/module.h>
      16             : #include <linux/slab.h>
      17             : #include <linux/nospec.h>
      18             : #include <linux/backing-dev.h>
      19             : #include <trace/events/ext4.h>
      20             : 
      21             : /*
      22             :  * MUSTDO:
      23             :  *   - test ext4_ext_search_left() and ext4_ext_search_right()
      24             :  *   - search for metadata in few groups
      25             :  *
      26             :  * TODO v4:
      27             :  *   - normalization should take into account whether file is still open
      28             :  *   - discard preallocations if no free space left (policy?)
      29             :  *   - don't normalize tails
      30             :  *   - quota
      31             :  *   - reservation for superuser
      32             :  *
      33             :  * TODO v3:
      34             :  *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
      35             :  *   - track min/max extents in each group for better group selection
      36             :  *   - mb_mark_used() may allocate chunk right after splitting buddy
      37             :  *   - tree of groups sorted by number of free blocks
      38             :  *   - error handling
      39             :  */
      40             : 
      41             : /*
      42             :  * An allocation request asks for multiple blocks near the specified
      43             :  * goal (block) value.
      44             :  *
      45             :  * During initialization phase of the allocator we decide to use the
      46             :  * group preallocation or inode preallocation depending on the size of
      47             :  * the file. The size of the file could be the resulting file size we
      48             :  * would have after allocation, or the current file size, which ever
      49             :  * is larger. If the size is less than sbi->s_mb_stream_request we
      50             :  * select to use the group preallocation. The default value of
      51             :  * s_mb_stream_request is 16 blocks. This can also be tuned via
      52             :  * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
      53             :  * terms of number of blocks.
      54             :  *
      55             :  * The main motivation for having small file use group preallocation is to
      56             :  * ensure that we have small files closer together on the disk.
      57             :  *
      58             :  * First stage the allocator looks at the inode prealloc list,
      59             :  * ext4_inode_info->i_prealloc_list, which contains list of prealloc
      60             :  * spaces for this particular inode. The inode prealloc space is
      61             :  * represented as:
      62             :  *
      63             :  * pa_lstart -> the logical start block for this prealloc space
      64             :  * pa_pstart -> the physical start block for this prealloc space
      65             :  * pa_len    -> length for this prealloc space (in clusters)
      66             :  * pa_free   ->  free space available in this prealloc space (in clusters)
      67             :  *
      68             :  * The inode preallocation space is used looking at the _logical_ start
      69             :  * block. If only the logical file block falls within the range of prealloc
      70             :  * space we will consume the particular prealloc space. This makes sure that
      71             :  * we have contiguous physical blocks representing the file blocks
      72             :  *
      73             :  * The important thing to be noted in case of inode prealloc space is that
      74             :  * we don't modify the values associated to inode prealloc space except
      75             :  * pa_free.
      76             :  *
      77             :  * If we are not able to find blocks in the inode prealloc space and if we
      78             :  * have the group allocation flag set then we look at the locality group
      79             :  * prealloc space. These are per CPU prealloc list represented as
      80             :  *
      81             :  * ext4_sb_info.s_locality_groups[smp_processor_id()]
      82             :  *
      83             :  * The reason for having a per cpu locality group is to reduce the contention
      84             :  * between CPUs. It is possible to get scheduled at this point.
      85             :  *
      86             :  * The locality group prealloc space is used looking at whether we have
      87             :  * enough free space (pa_free) within the prealloc space.
      88             :  *
      89             :  * If we can't allocate blocks via inode prealloc or/and locality group
      90             :  * prealloc then we look at the buddy cache. The buddy cache is represented
      91             :  * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
      92             :  * mapped to the buddy and bitmap information regarding different
      93             :  * groups. The buddy information is attached to buddy cache inode so that
      94             :  * we can access them through the page cache. The information regarding
      95             :  * each group is loaded via ext4_mb_load_buddy.  The information consists
      96             :  * of the block bitmap and buddy information. It is stored in the
      97             :  * inode as:
      98             :  *
      99             :  *  {                        page                        }
     100             :  *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
     101             :  *
     102             :  *
     103             :  * one block each for bitmap and buddy information.  So for each group we
     104             :  * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
     105             :  * blocksize) blocks.  So it can have information regarding groups_per_page
     106             :  * which is blocks_per_page/2
     107             :  *
     108             :  * The buddy cache inode is not stored on disk. The inode is thrown
     109             :  * away when the filesystem is unmounted.
     110             :  *
     111             :  * We look for count number of blocks in the buddy cache. If we were able
     112             :  * to locate that many free blocks we return with additional information
     113             :  * regarding rest of the contiguous physical block available
     114             :  *
     115             :  * Before allocating blocks via buddy cache we normalize the request
     116             :  * blocks. This ensures we ask for more blocks than we needed. The extra
     117             :  * blocks that we get after allocation are added to the respective prealloc
     118             :  * list. In case of inode preallocation we follow a list of heuristics
     119             :  * based on file size. This can be found in ext4_mb_normalize_request. If
     120             :  * we are doing a group prealloc we try to normalize the request to
     121             :  * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
     122             :  * dependent on the cluster size; for non-bigalloc file systems, it is
     123             :  * 512 blocks. This can be tuned via
     124             :  * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
     125             :  * terms of number of blocks. If we have mounted the file system with the -o
     126             :  * stripe=<value> option, the group prealloc request is normalized to the
     127             :  * smallest multiple of the stripe value (sbi->s_stripe) which is
     128             :  * greater than the default mb_group_prealloc.
     129             :  *
     130             :  * If "mb_optimize_scan" mount option is set, we maintain in memory group info
     131             :  * structures in two data structures:
     132             :  *
     133             :  * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
     134             :  *
     135             :  *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
     136             :  *
     137             :  *    This is an array of lists where the index in the array represents the
     138             :  *    largest free order in the buddy bitmap of the participating group infos of
     139             :  *    that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
     140             :  *    number of buddy bitmap orders possible) number of lists. Group-infos are
     141             :  *    placed in appropriate lists.
     142             :  *
     143             :  * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
     144             :  *
     145             :  *    Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
     146             :  *
     147             :  *    This is an array of lists where in the i-th list there are groups with
     148             :  *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
     149             :  *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
     150             :  *    Note that we don't bother with a special list for completely empty groups
     151             :  *    so we only have MB_NUM_ORDERS(sb) lists.
     152             :  *
     153             :  * When "mb_optimize_scan" mount option is set, mballoc consults the above data
     154             :  * structures to decide the order in which groups are to be traversed for
     155             :  * fulfilling an allocation request.
     156             :  *
     157             :  * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
     158             :  * >= the order of the request. We directly look at the largest free order list
     159             :  * in the data structure (1) above where largest_free_order = order of the
     160             :  * request. If that list is empty, we look at remaining list in the increasing
     161             :  * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
     162             :  * lookup in O(1) time.
     163             :  *
     164             :  * At CR_GOAL_LEN_FAST, we only consider groups where
     165             :  * average fragment size > request size. So, we lookup a group which has average
     166             :  * fragment size just above or equal to request size using our average fragment
     167             :  * size group lists (data structure 2) in O(1) time.
     168             :  *
     169             :  * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
     170             :  * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
     171             :  * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
     172             :  * fragment size > goal length. So before falling to the slower
     173             :  * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
     174             :  * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
     175             :  * enough average fragment size. This increases the chances of finding a
     176             :  * suitable block group in O(1) time and results in faster allocation at the
     177             :  * cost of reduced size of allocation.
     178             :  *
     179             :  * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
     180             :  * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
     181             :  * CR_GOAL_LEN_FAST phase.
     182             :  *
     183             :  * The regular allocator (using the buddy cache) supports a few tunables.
     184             :  *
     185             :  * /sys/fs/ext4/<partition>/mb_min_to_scan
     186             :  * /sys/fs/ext4/<partition>/mb_max_to_scan
     187             :  * /sys/fs/ext4/<partition>/mb_order2_req
     188             :  * /sys/fs/ext4/<partition>/mb_linear_limit
     189             :  *
     190             :  * The regular allocator uses buddy scan only if the request len is power of
     191             :  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
     192             :  * value of s_mb_order2_reqs can be tuned via
     193             :  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
     194             :  * stripe size (sbi->s_stripe), we try to search for contiguous block in
     195             :  * stripe size. This should result in better allocation on RAID setups. If
     196             :  * not, we search in the specific group using bitmap for best extents. The
     197             :  * tunable min_to_scan and max_to_scan control the behaviour here.
     198             :  * min_to_scan indicates how long the mballoc __must__ look for a best
     199             :  * extent and max_to_scan indicates how long the mballoc __can__ look for a
     200             :  * best extent in the found extents. Searching for the blocks starts with
     201             :  * the group specified as the goal value in allocation context via
     202             :  * ac_g_ex. Each group is first checked based on the criteria whether it
     203             :  * can be used for allocation. ext4_mb_good_group explains how the groups are
     204             :  * checked.
     205             :  *
     206             :  * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
     207             :  * get traversed linearly. That may result in subsequent allocations being not
     208             :  * close to each other. And so, the underlying device may get filled up in a
     209             :  * non-linear fashion. While that may not matter on non-rotational devices, for
     210             :  * rotational devices that may result in higher seek times. "mb_linear_limit"
     211             :  * tells mballoc how many groups mballoc should search linearly before
     212             :  * consulting the above data structures for more efficient lookups. For
     213             :  * non rotational devices, this value defaults to 0 and for rotational devices
     214             :  * this is set to MB_DEFAULT_LINEAR_LIMIT.
     215             :  *
     216             :  * Both prealloc spaces are populated as described above. So for the first
     217             :  * request we will hit the buddy cache which will result in this prealloc
     218             :  * space getting filled. The prealloc space is then later used for the
     219             :  * subsequent request.
     220             :  */
     221             : 
     222             : /*
     223             :  * mballoc operates on the following data:
     224             :  *  - on-disk bitmap
     225             :  *  - in-core buddy (actually includes buddy and bitmap)
     226             :  *  - preallocation descriptors (PAs)
     227             :  *
     228             :  * there are two types of preallocations:
     229             :  *  - inode
     230             :  *    assigned to specific inode and can be used for this inode only.
     231             :  *    it describes part of inode's space preallocated to specific
     232             :  *    physical blocks. any block from that preallocation can be used
     233             :  *    independently. the descriptor just tracks number of blocks left
     234             :  *    unused. so, before taking some block from descriptor, one must
     235             :  *    make sure corresponded logical block isn't allocated yet. this
     236             :  *    also means that freeing any block within descriptor's range
     237             :  *    must discard all preallocated blocks.
     238             :  *  - locality group
     239             :  *    assigned to specific locality group which does not translate to
     240             :  *    permanent set of inodes: inode can join and leave group. space
     241             :  *    from this type of preallocation can be used for any inode. thus
     242             :  *    it's consumed from the beginning to the end.
     243             :  *
     244             :  * relation between them can be expressed as:
     245             :  *    in-core buddy = on-disk bitmap + preallocation descriptors
     246             :  *
     247             :  * this means the blocks mballoc considers used are:
     248             :  *  - allocated blocks (persistent)
     249             :  *  - preallocated blocks (non-persistent)
     250             :  *
     251             :  * consistency in mballoc world means that at any time a block is either
     252             :  * free or used in ALL structures. notice: "any time" should not be read
     253             :  * literally -- time is discrete and delimited by locks.
     254             :  *
     255             :  *  to keep it simple, we don't use block numbers, instead we count number of
     256             :  *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
     257             :  *
     258             :  * all operations can be expressed as:
     259             :  *  - init buddy:                       buddy = on-disk + PAs
     260             :  *  - new PA:                           buddy += N; PA = N
     261             :  *  - use inode PA:                     on-disk += N; PA -= N
     262             :  *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
     263             :  *  - use locality group PA             on-disk += N; PA -= N
     264             :  *  - discard locality group PA         buddy -= PA; PA = 0
     265             :  *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
     266             :  *        is used in real operation because we can't know actual used
     267             :  *        bits from PA, only from on-disk bitmap
     268             :  *
     269             :  * if we follow this strict logic, then all operations above should be atomic.
     270             :  * given some of them can block, we'd have to use something like semaphores
     271             :  * killing performance on high-end SMP hardware. let's try to relax it using
     272             :  * the following knowledge:
     273             :  *  1) if buddy is referenced, it's already initialized
     274             :  *  2) while block is used in buddy and the buddy is referenced,
     275             :  *     nobody can re-allocate that block
     276             :  *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
     277             :  *     bit set and PA claims same block, it's OK. IOW, one can set bit in
     278             :  *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
     279             :  *     block
     280             :  *
     281             :  * so, now we're building a concurrency table:
     282             :  *  - init buddy vs.
     283             :  *    - new PA
     284             :  *      blocks for PA are allocated in the buddy, buddy must be referenced
     285             :  *      until PA is linked to allocation group to avoid concurrent buddy init
     286             :  *    - use inode PA
     287             :  *      we need to make sure that either on-disk bitmap or PA has uptodate data
     288             :  *      given (3) we care that PA-=N operation doesn't interfere with init
     289             :  *    - discard inode PA
     290             :  *      the simplest way would be to have buddy initialized by the discard
     291             :  *    - use locality group PA
     292             :  *      again PA-=N must be serialized with init
     293             :  *    - discard locality group PA
     294             :  *      the simplest way would be to have buddy initialized by the discard
     295             :  *  - new PA vs.
     296             :  *    - use inode PA
     297             :  *      i_data_sem serializes them
     298             :  *    - discard inode PA
     299             :  *      discard process must wait until PA isn't used by another process
     300             :  *    - use locality group PA
     301             :  *      some mutex should serialize them
     302             :  *    - discard locality group PA
     303             :  *      discard process must wait until PA isn't used by another process
     304             :  *  - use inode PA
     305             :  *    - use inode PA
     306             :  *      i_data_sem or another mutex should serialize them
     307             :  *    - discard inode PA
     308             :  *      discard process must wait until PA isn't used by another process
     309             :  *    - use locality group PA
     310             :  *      nothing wrong here -- they're different PAs covering different blocks
     311             :  *    - discard locality group PA
     312             :  *      discard process must wait until PA isn't used by another process
     313             :  *
     314             :  * now we're ready to draw a few conclusions:
     315             :  *  - PA is referenced and while it is no discard is possible
     316             :  *  - PA is referenced until block isn't marked in on-disk bitmap
     317             :  *  - PA changes only after on-disk bitmap
     318             :  *  - discard must not compete with init. either init is done before
     319             :  *    any discard or they're serialized somehow
     320             :  *  - buddy init as sum of on-disk bitmap and PAs is done atomically
     321             :  *
     322             :  * a special case when we've used PA to emptiness. no need to modify buddy
     323             :  * in this case, but we should care about concurrent init
     324             :  *
     325             :  */
     326             : 
     327             :  /*
     328             :  * Logic in few words:
     329             :  *
     330             :  *  - allocation:
     331             :  *    load group
     332             :  *    find blocks
     333             :  *    mark bits in on-disk bitmap
     334             :  *    release group
     335             :  *
     336             :  *  - use preallocation:
     337             :  *    find proper PA (per-inode or group)
     338             :  *    load group
     339             :  *    mark bits in on-disk bitmap
     340             :  *    release group
     341             :  *    release PA
     342             :  *
     343             :  *  - free:
     344             :  *    load group
     345             :  *    mark bits in on-disk bitmap
     346             :  *    release group
     347             :  *
     348             :  *  - discard preallocations in group:
     349             :  *    mark PAs deleted
     350             :  *    move them onto local list
     351             :  *    load on-disk bitmap
     352             :  *    load group
     353             :  *    remove PA from object (inode or locality group)
     354             :  *    mark free blocks in-core
     355             :  *
     356             :  *  - discard inode's preallocations:
     357             :  */
     358             : 
     359             : /*
     360             :  * Locking rules
     361             :  *
     362             :  * Locks:
     363             :  *  - bitlock on a group        (group)
     364             :  *  - object (inode/locality)   (object)
     365             :  *  - per-pa lock               (pa)
     366             :  *  - cr_power2_aligned lists lock      (cr_power2_aligned)
     367             :  *  - cr_goal_len_fast lists lock       (cr_goal_len_fast)
     368             :  *
     369             :  * Paths:
     370             :  *  - new pa
     371             :  *    object
     372             :  *    group
     373             :  *
     374             :  *  - find and use pa:
     375             :  *    pa
     376             :  *
     377             :  *  - release consumed pa:
     378             :  *    pa
     379             :  *    group
     380             :  *    object
     381             :  *
     382             :  *  - generate in-core bitmap:
     383             :  *    group
     384             :  *        pa
     385             :  *
     386             :  *  - discard all for given object (inode, locality group):
     387             :  *    object
     388             :  *        pa
     389             :  *    group
     390             :  *
     391             :  *  - discard all for given group:
     392             :  *    group
     393             :  *        pa
     394             :  *    group
     395             :  *        object
     396             :  *
     397             :  *  - allocation path (ext4_mb_regular_allocator)
     398             :  *    group
     399             :  *    cr_power2_aligned/cr_goal_len_fast
     400             :  */
     401             : static struct kmem_cache *ext4_pspace_cachep;
     402             : static struct kmem_cache *ext4_ac_cachep;
     403             : static struct kmem_cache *ext4_free_data_cachep;
     404             : 
     405             : /* We create slab caches for groupinfo data structures based on the
     406             :  * superblock block size.  There will be one per mounted filesystem for
     407             :  * each unique s_blocksize_bits */
     408             : #define NR_GRPINFO_CACHES 8
     409             : static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
     410             : 
     411             : static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
     412             :         "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
     413             :         "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
     414             :         "ext4_groupinfo_64k", "ext4_groupinfo_128k"
     415             : };
     416             : 
     417             : static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
     418             :                                         ext4_group_t group);
     419             : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
     420             :                                                 ext4_group_t group);
     421             : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
     422             : 
     423             : static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
     424             :                                ext4_group_t group, enum criteria cr);
     425             : 
     426             : static int ext4_try_to_trim_range(struct super_block *sb,
     427             :                 struct ext4_buddy *e4b, ext4_grpblk_t start,
     428             :                 ext4_grpblk_t max, ext4_grpblk_t minblocks);
     429             : 
     430             : /*
     431             :  * The algorithm using this percpu seq counter goes below:
     432             :  * 1. We sample the percpu discard_pa_seq counter before trying for block
     433             :  *    allocation in ext4_mb_new_blocks().
     434             :  * 2. We increment this percpu discard_pa_seq counter when we either allocate
     435             :  *    or free these blocks i.e. while marking those blocks as used/free in
     436             :  *    mb_mark_used()/mb_free_blocks().
     437             :  * 3. We also increment this percpu seq counter when we successfully identify
     438             :  *    that the bb_prealloc_list is not empty and hence proceed for discarding
     439             :  *    of those PAs inside ext4_mb_discard_group_preallocations().
     440             :  *
     441             :  * Now to make sure that the regular fast path of block allocation is not
     442             :  * affected, as a small optimization we only sample the percpu seq counter
     443             :  * on that cpu. Only when the block allocation fails and when freed blocks
     444             :  * found were 0, that is when we sample percpu seq counter for all cpus using
     445             :  * below function ext4_get_discard_pa_seq_sum(). This happens after making
     446             :  * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
     447             :  */
     448             : static DEFINE_PER_CPU(u64, discard_pa_seq);
     449         446 : static inline u64 ext4_get_discard_pa_seq_sum(void)
     450             : {
                               /*
                                * Add up the per-CPU discard_pa_seq counter of every possible
                                * CPU and return the total.
                                */
     451         446 :         int __cpu;
     452         446 :         u64 __seq = 0;
     453             : 
     454        2228 :         for_each_possible_cpu(__cpu)
     455        1781 :                 __seq += per_cpu(discard_pa_seq, __cpu);
     456         445 :         return __seq;
     457             : }
     458             : 
     459             : static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
     460             : {
                               /*
                                * Round addr down to an 8-byte (BITS_PER_LONG == 64) or 4-byte
                                * (BITS_PER_LONG == 32) boundary and add the dropped low address
                                * bits, multiplied by 8 (<< 3), to *bit.
                                * Returns the rounded-down address; *bit is updated in place.
                                */
     461             : #if BITS_PER_LONG == 64
     462  3364458863 :         *bit += ((unsigned long) addr & 7UL) << 3;
     463  3364458863 :         addr = (void *) ((unsigned long) addr & ~7UL);
     464             : #elif BITS_PER_LONG == 32
     465             :         *bit += ((unsigned long) addr & 3UL) << 3;
     466             :         addr = (void *) ((unsigned long) addr & ~3UL);
     467             : #else
     468             : #error "how many bits you are?!"
     469             : #endif
     470  3364458863 :         return addr;
     471             : }
     472             : 
     473  3020835651 : static inline int mb_test_bit(int bit, void *addr)
     474             : {
     475             :         /*
     476             :          * ext4_test_bit on architectures like powerpc needs an unsigned
     477             :          * long aligned address, so realign addr and adjust bit to match first
     478             :          */
     479  3020835651 :         addr = mb_correct_addr_and_bit(&bit, addr);
     480  3020835651 :         return ext4_test_bit(bit, addr);
     481             : }
     482             : 
     483    73270782 : static inline void mb_set_bit(int bit, void *addr)
     484             : {
                               /*
                                * Realign addr and adjust bit with mb_correct_addr_and_bit(),
                                * then set that bit through ext4_set_bit().
                                */
     485    73270782 :         addr = mb_correct_addr_and_bit(&bit, addr);
     486    73270782 :         ext4_set_bit(bit, addr);
     487    73270021 : }
     488             : 
                       /*
                        * Clear bit @bit in the bitmap at @addr: align the address with
                        * mb_correct_addr_and_bit(), then call ext4_clear_bit().
                        */
     489    36914588 : static inline void mb_clear_bit(int bit, void *addr)
     490             : {
     491    36914588 :         addr = mb_correct_addr_and_bit(&bit, addr);
     492    36914588 :         ext4_clear_bit(bit, addr);
     493    36914612 : }
     494             : 
                       /*
                        * Clear bit @bit in the bitmap at @addr and return what
                        * ext4_test_and_clear_bit() returns for the aligned address.
                        */
     495    40227496 : static inline int mb_test_and_clear_bit(int bit, void *addr)
     496             : {
     497    40227496 :         addr = mb_correct_addr_and_bit(&bit, addr);
     498    40227496 :         return ext4_test_and_clear_bit(bit, addr);
     499             : }
     500             : 
                       /*
                        * Find the next zero bit in the bitmap at @addr, starting at bit @start.
                        *
                        * @addr is aligned first; @start and @max are shifted by the same number
                        * of bits for the ext4_find_next_zero_bit() call and the shift is removed
                        * from its result. Returns that result, capped at @max.
                        */
     501   169842610 : static inline int mb_find_next_zero_bit(void *addr, int max, int start)
     502             : {
     503   169842610 :         int fix = 0, ret, tmpmax;
     504   169842610 :         addr = mb_correct_addr_and_bit(&fix, addr);
     505   169842610 :         tmpmax = max + fix;
     506   169842610 :         start += fix;
     507             : 
     508   169842610 :         ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
     509   169885752 :         if (ret > max)
     510             :                 return max;
     511             :         return ret;
     512             : }
     513             : 
                       /*
                        * Find the next set bit in the bitmap at @addr, starting at bit @start.
                        *
                        * @addr is aligned first; @start and @max are shifted by the same number
                        * of bits for the ext4_find_next_bit() call and the shift is removed
                        * from its result. Returns that result, capped at @max.
                        */
     514    23367736 : static inline int mb_find_next_bit(void *addr, int max, int start)
     515             : {
     516    23367736 :         int fix = 0, ret, tmpmax;
     517    23367736 :         addr = mb_correct_addr_and_bit(&fix, addr);
     518    23367736 :         tmpmax = max + fix;
     519    23367736 :         start += fix;
     520             : 
     521    23367736 :         ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
     522    23367653 :         if (ret > max)
     523             :                 return max;
     524             :         return ret;
     525             : }
     526             : 
                       /*
                        * Return the bitmap that describes @e4b's group at buddy order @order
                        * and store the matching limit in *@max.
                        *
                        * Order 0 yields e4b->bd_bitmap with *@max = 1 << (bd_blkbits + 3).
                        * Higher orders yield e4b->bd_buddy + s_mb_offsets[order] with
                        * *@max = s_mb_maxs[order]. An order above bd_blkbits + 1 yields NULL
                        * with *@max = 0.
                        *
                        * BUGs if bd_bitmap and bd_buddy are the same pointer or @max is NULL.
                        */
     527  3065653436 : static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
     528             : {
     529  3065653436 :         char *bb;
     530             : 
     531  3065653436 :         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
     532  3065653436 :         BUG_ON(max == NULL);
     533             : 
     534  3065653436 :         if (order > e4b->bd_blkbits + 1) {
     535       20153 :                 *max = 0;
     536       20153 :                 return NULL;
     537             :         }
     538             : 
     539             :         /* at order 0 we see each particular block */
     540  3065633283 :         if (order == 0) {
     541   286459304 :                 *max = 1 << (e4b->bd_blkbits + 3);
     542   286459304 :                 return e4b->bd_bitmap;
     543             :         }
     544             : 
     545  2779173979 :         bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
     546  2779173979 :         *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
     547             : 
     548  2779173979 :         return bb;
     549             : }
     550             : 
     551             : #ifdef DOUBLE_CHECK
                       /*
                        * Clear bits [@first, @first + @count) in the group's bb_bitmap copy.
                        *
                        * A bit that is already clear is reported through
                        * ext4_grp_locked_error() and the group is flagged with
                        * EXT4_GROUP_INFO_BBITMAP_CORRUPT. Does nothing when bb_bitmap is NULL.
                        * The group lock must be held (assert_spin_locked()).
                        * @inode may be NULL, in which case inode number 0 is reported.
                        */
     552             : static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
     553             :                            int first, int count)
     554             : {
     555             :         int i;
     556             :         struct super_block *sb = e4b->bd_sb;
     557             : 
     558             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     559             :                 return;
     560             :         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
     561             :         for (i = 0; i < count; i++) {
     562             :                 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
     563             :                         ext4_fsblk_t blocknr;
     564             : 
     565             :                         blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
     566             :                         blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
     567             :                         ext4_grp_locked_error(sb, e4b->bd_group,
     568             :                                               inode ? inode->i_ino : 0,
     569             :                                               blocknr,
     570             :                                               "freeing block already freed "
     571             :                                               "(bit %u)",
     572             :                                               first + i);
     573             :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
     574             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
     575             :                 }
     576             :                 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
     577             :         }
     578             : }
     579             : 
                       /*
                        * Set bits [@first, @first + @count) in the group's bb_bitmap copy and
                        * BUG if any of them is already set. Does nothing when bb_bitmap is
                        * NULL. The group lock must be held (assert_spin_locked()).
                        */
     580             : static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
     581             : {
     582             :         int i;
     583             : 
     584             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     585             :                 return;
     586             :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
     587             :         for (i = 0; i < count; i++) {
     588             :                 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
     589             :                 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
     590             :         }
     591             : }
     592             : 
                       /*
                        * Compare the group's bb_bitmap copy with @bitmap over s_blocksize
                        * bytes. On the first differing byte, log both values and BUG().
                        * Does nothing when bb_bitmap is NULL.
                        */
     593             : static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
     594             : {
     595             :         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
     596             :                 return;
     597             :         if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
     598             :                 unsigned char *b1, *b2;
     599             :                 int i;
     600             :                 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
     601             :                 b2 = (unsigned char *) bitmap;
     602             :                 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
     603             :                         if (b1[i] != b2[i]) {
     604             :                                 ext4_msg(e4b->bd_sb, KERN_ERR,
     605             :                                          "corruption in group %u "
     606             :                                          "at byte %u(%u): %x in copy != %x "
     607             :                                          "on disk/prealloc",
     608             :                                          e4b->bd_group, i, i * 8, b1[i], b2[i]);
     609             :                                 BUG();
     610             :                         }
     611             :                 }
     612             :         }
     613             : }
     614             : 
                       /*
                        * Allocate grp->bb_bitmap (s_blocksize bytes, GFP_NOFS) and fill it with
                        * the data of the buffer returned by ext4_read_block_bitmap() for
                        * @group. If the allocation or the read fails, bb_bitmap ends up NULL.
                        */
     615             : static void mb_group_bb_bitmap_alloc(struct super_block *sb,
     616             :                         struct ext4_group_info *grp, ext4_group_t group)
     617             : {
     618             :         struct buffer_head *bh;
     619             : 
     620             :         grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
     621             :         if (!grp->bb_bitmap)
     622             :                 return;
     623             : 
     624             :         bh = ext4_read_block_bitmap(sb, group);
     625             :         if (IS_ERR_OR_NULL(bh)) {
     626             :                 kfree(grp->bb_bitmap);
     627             :                 grp->bb_bitmap = NULL;
     628             :                 return;
     629             :         }
     630             : 
     631             :         memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
     632             :         put_bh(bh);
     633             : }
     634             : 
                       /* Free grp->bb_bitmap with kfree(); the pointer itself is not reset. */
     635             : static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
     636             : {
     637             :         kfree(grp->bb_bitmap);
     638             : }
     639             : 
     640             : #else
                       /*
                        * DOUBLE_CHECK is not defined: the five helpers below are empty inline
                        * stubs with the same signatures as the checking versions above.
                        */
     641             : static inline void mb_free_blocks_double(struct inode *inode,
     642             :                                 struct ext4_buddy *e4b, int first, int count)
     643             : {
     644             :         return;
     645             : }
     646             : static inline void mb_mark_used_double(struct ext4_buddy *e4b,
     647             :                                                 int first, int count)
     648             : {
     649             :         return;
     650             : }
     651             : static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
     652             : {
     653             :         return;
     654             : }
     655             : 
     656             : static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
     657             :                         struct ext4_group_info *grp, ext4_group_t group)
     658             : {
     659             :         return;
     660             : }
     661             : 
     662             : static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
     663             : {
     664             :         return;
     665             : }
     666             : #endif
     667             : 
     668             : #ifdef AGGRESSIVE_CHECK
     669             : 
                       /*
                        * If @assert is false, print the failed expression together with the
                        * function, file and line variables of the enclosing scope, then BUG().
                        * The macro therefore only works where those three names are defined.
                        */
     670             : #define MB_CHECK_ASSERT(assert)                                         \
     671             : do {                                                                    \
     672             :         if (!(assert)) {                                                \
     673             :                 printk(KERN_EMERG                                       \
     674             :                         "Assertion failure in %s() at %s:%d: \"%s\"\n",     \
     675             :                         function, file, line, # assert);                \
     676             :                 BUG();                                                  \
     677             :         }                                                               \
     678             : } while (0)
     679             : 
                       /*
                        * Cross-check the buddy bitmaps of @e4b against its block bitmap and
                        * the counters in e4b->bd_info. Any mismatch trips MB_CHECK_ASSERT(),
                        * which prints @function, @file and @line and then BUG()s.
                        *
                        * The checks run only on calls where bb_check_counter++ % 10 is 0.
                        * Always returns 0, including when ext4_get_group_info() yields no
                        * group and the preallocation check at the end is skipped.
                        */
     680             : static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
     681             :                                 const char *function, int line)
     682             : {
     683             :         struct super_block *sb = e4b->bd_sb;
     684             :         int order = e4b->bd_blkbits + 1;
     685             :         int max;
     686             :         int max2;
     687             :         int i;
     688             :         int j;
     689             :         int k;
     690             :         int count;
     691             :         struct ext4_group_info *grp;
     692             :         int fragments = 0;
     693             :         int fstart;
     694             :         struct list_head *cur;
     695             :         void *buddy;
     696             :         void *buddy2;
     697             : 
     698             :         if (e4b->bd_info->bb_check_counter++ % 10)
     699             :                 return 0;
     700             : 
                               /* compare each order with the order directly below it */
     701             :         while (order > 1) {
     702             :                 buddy = mb_find_buddy(e4b, order, &max);
     703             :                 MB_CHECK_ASSERT(buddy);
     704             :                 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
     705             :                 MB_CHECK_ASSERT(buddy2);
     706             :                 MB_CHECK_ASSERT(buddy != buddy2);
     707             :                 MB_CHECK_ASSERT(max * 2 == max2);
     708             : 
     709             :                 count = 0;
     710             :                 for (i = 0; i < max; i++) {
     711             : 
     712             :                         if (mb_test_bit(i, buddy)) {
     713             :                                 /* only single bit in buddy2 may be 0 */
     714             :                                 if (!mb_test_bit(i << 1, buddy2)) {
     715             :                                         MB_CHECK_ASSERT(
     716             :                                                 mb_test_bit((i<<1)+1, buddy2));
     717             :                                 }
     718             :                                 continue;
     719             :                         }
     720             : 
     721             :                         /* both bits in buddy2 must be 1 */
     722             :                         MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
     723             :                         MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
     724             : 
     725             :                         for (j = 0; j < (1 << order); j++) {
     726             :                                 k = (i * (1 << order)) + j;
     727             :                                 MB_CHECK_ASSERT(
     728             :                                         !mb_test_bit(k, e4b->bd_bitmap));
     729             :                         }
     730             :                         count++;
     731             :                 }
     732             :                 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
     733             :                 order--;
     734             :         }
     735             : 
                               /* walk the order-0 bitmap and count runs of clear bits */
     736             :         fstart = -1;
     737             :         buddy = mb_find_buddy(e4b, 0, &max);
     738             :         for (i = 0; i < max; i++) {
     739             :                 if (!mb_test_bit(i, buddy)) {
     740             :                         MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
     741             :                         if (fstart == -1) {
     742             :                                 fragments++;
     743             :                                 fstart = i;
     744             :                         }
     745             :                         continue;
     746             :                 }
     747             :                 fstart = -1;
     748             :                 /* check used bits only */
     749             :                 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
     750             :                         buddy2 = mb_find_buddy(e4b, j, &max2);
     751             :                         k = i >> j;
     752             :                         MB_CHECK_ASSERT(k < max2);
     753             :                         MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
     754             :                 }
     755             :         }
     756             :         MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
     757             :         MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
     758             : 
     759             :         grp = ext4_get_group_info(sb, e4b->bd_group);
                               /* this function returns int: report "nothing more to check" as 0 */
     760             :         if (!grp)
     761             :                 return 0;
                               /* every block of every preallocation must be set in the bitmap */
     762             :         list_for_each(cur, &grp->bb_prealloc_list) {
     763             :                 ext4_group_t groupnr;
     764             :                 struct ext4_prealloc_space *pa;
     765             :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
     766             :                 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
     767             :                 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
     768             :                 for (i = 0; i < pa->pa_len; i++)
     769             :                         MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
     770             :         }
     771             :         return 0;
     772             : }
     773             : #undef MB_CHECK_ASSERT
                       /* Run __mb_check_buddy() on @e4b with the caller's file, function and line. */
     774             : #define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
     775             :                                         __FILE__, __func__, __LINE__)
     776             : #else
                       /* AGGRESSIVE_CHECK is not defined: mb_check_buddy() expands to nothing. */
     777             : #define mb_check_buddy(e4b)
     778             : #endif
     779             : 
     780             : /*
     781             :  * Divide the blocks starting at @first with length @len into
     782             :  * smaller chunks whose sizes are powers of 2.
     783             :  * For each chunk, increase bb_counters[] for its order and, for orders
     784             :  * above 0, clear the chunk's bit in that order's part of @buddy.
     785             :  */
     786      230397 : static void ext4_mb_mark_free_simple(struct super_block *sb,
     787             :                                 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
     788             :                                         struct ext4_group_info *grp)
     789             : {
     790      230397 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     791      230397 :         ext4_grpblk_t min;
     792      230397 :         ext4_grpblk_t max;
     793      230397 :         ext4_grpblk_t chunk;
     794      230397 :         unsigned int border;
     795             : 
     796      230397 :         BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
     797             : 
                               /*
                                * OR-ing border into first below keeps ffs() from seeing 0 and
                                * bounds max at s_blocksize_bits + 1.
                                */
     798      230397 :         border = 2 << sb->s_blocksize_bits;
     799             : 
     800     1356364 :         while (len > 0) {
     801             :                 /* find how many blocks can be covered since this position */
     802     1125968 :                 max = ffs(first | border) - 1;
     803             : 
     804             :                 /* find how many blocks of power 2 we need to mark */
     805     1125968 :                 min = fls(len) - 1;
     806             : 
                                       /* the chunk order is the smaller of the two limits */
     807     1125968 :                 if (max < min)
     808             :                         min = max;
     809     1125968 :                 chunk = 1 << min;
     810             : 
     811             :                 /* mark multiblock chunks only */
     812     1125968 :                 grp->bb_counters[min]++;
     813     1125968 :                 if (min > 0)
     814     1067225 :                         mb_clear_bit(first >> min,
     815     1067225 :                                      buddy + sbi->s_mb_offsets[min]);
     816             : 
     817     1125967 :                 len -= chunk;
     818     1125967 :                 first += chunk;
     819             :         }
     820      230396 : }
     821             : 
                       /*
                        * Map fragment length @len to an avg_fragment_size list index.
                        *
                        * The index is fls(len) - 2, with negative results mapped to 0 and a
                        * result equal to MB_NUM_ORDERS(sb) reduced by one.
                        */
     822             : static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
     823             : {
     824     6867833 :         int order;
     825             : 
     826             :         /*
     827             :          * We don't bother with special lists for groups with only 1 block free
     828             :          * extents and for completely empty groups.
     829             :          */
     830     6867833 :         order = fls(len) - 2;
     831     6867833 :         if (order < 0)
     832             :                 return 0;
     833     6833801 :         if (order == MB_NUM_ORDERS(sb))
     834      150767 :                 order--;
     835             :         return order;
     836             : }
     837             : 
     838             : /*
                        * Move group to the appropriate avg_fragment_size list.
                        *
                        * The target list index is mb_avg_fragment_size_order() of
                        * bb_free / bb_fragments. Nothing is done when MB_OPTIMIZE_SCAN is off,
                        * when the group has no free fragments, or when the index is unchanged.
                        *
                        * bb_fragments is tested rather than bb_free because it is the divisor:
                        * a group whose counters disagree (bb_free != 0, bb_fragments == 0)
                        * must never reach the division below.
                        *
                        * Each list is modified under its own write lock; a previous order of
                        * -1 means the group is not on any list yet.
                        */
     839             : static void
     840     5695943 : mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
     841             : {
     842     5695943 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
     843     5695943 :         int new_order;
     844             : 
     845     5695943 :         if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
     846             :                 return;
     847             : 
     848     5359699 :         new_order = mb_avg_fragment_size_order(sb,
     849     5359699 :                                         grp->bb_free / grp->bb_fragments);
     850     5359699 :         if (new_order == grp->bb_avg_fragment_size_order)
     851             :                 return;
     852             : 
     853      498177 :         if (grp->bb_avg_fragment_size_order != -1) {
     854      338227 :                 write_lock(&sbi->s_mb_avg_fragment_size_locks[
     855             :                                         grp->bb_avg_fragment_size_order]);
     856      338225 :                 list_del(&grp->bb_avg_fragment_size_node);
     857      338224 :                 write_unlock(&sbi->s_mb_avg_fragment_size_locks[
     858             :                                         grp->bb_avg_fragment_size_order]);
     859             :         }
     860      498178 :         grp->bb_avg_fragment_size_order = new_order;
     861      498178 :         write_lock(&sbi->s_mb_avg_fragment_size_locks[
     862             :                                         grp->bb_avg_fragment_size_order]);
     863      498192 :         list_add_tail(&grp->bb_avg_fragment_size_node,
     864      498192 :                 &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
     865      498191 :         write_unlock(&sbi->s_mb_avg_fragment_size_locks[
     866             :                                         grp->bb_avg_fragment_size_order]);
     867             : }
     868             : 
     869             : /*
     870             :  * Choose next group by traversing largest_free_order lists. Updates *new_cr if
     871             :  * cr level needs an update.
                        *
                        * Lists are scanned from order ac->ac_2order upward. The first group
                        * accepted by ext4_mb_good_group() is stored in *group and
                        * EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED is set in ac->ac_flags. If no group
                        * is found, *new_cr is set to CR_GOAL_LEN_FAST. Returns at once when
                        * ac->ac_status is AC_STATUS_FOUND. @ngroups is not used here.
     872             :  */
     873      104538 : static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
     874             :                         enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
     875             : {
     876      104538 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
     877      104538 :         struct ext4_group_info *iter, *grp;
     878      104538 :         int i;
     879             : 
     880      104538 :         if (ac->ac_status == AC_STATUS_FOUND)
     881             :                 return;
     882             : 
                               /* the flag is set below when this function suggests a group */
     883      104538 :         if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
     884           0 :                 atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
     885             : 
     886      104538 :         grp = NULL;
     887      428224 :         for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
                                       /* check without the lock first, then again with it held */
     888      325513 :                 if (list_empty(&sbi->s_mb_largest_free_orders[i]))
     889      300180 :                         continue;
     890       25333 :                 read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
     891       25342 :                 if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
     892           0 :                         read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
     893           0 :                         continue;
     894             :                 }
     895       25342 :                 grp = NULL;
     896      160857 :                 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
     897             :                                     bb_largest_free_order_node) {
     898      137372 :                         if (sbi->s_mb_stats)
     899           0 :                                 atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
     900      137372 :                         if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
     901             :                                 grp = iter;
     902             :                                 break;
     903             :                         }
     904             :                 }
     905       25339 :                 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
     906       25343 :                 if (grp)
     907             :                         break;
     908             :         }
     909             : 
     910      104548 :         if (!grp) {
     911             :                 /* Increment cr and search again */
     912      102711 :                 *new_cr = CR_GOAL_LEN_FAST;
     913             :         } else {
     914        1837 :                 *group = grp->bb_group;
     915        1837 :                 ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
     916             :         }
     917             : }
     918             : 
     919             : /*
     920             :  * Find a suitable group of given order from the average fragments list.
                        *
                        * Walks sbi->s_mb_avg_fragment_size[@order] under its read lock and
                        * returns the first group that ext4_mb_good_group() accepts for
                        * ac->ac_criteria, or NULL if the list is empty or no group qualifies.
                        * @order is used as an array index without a range check.
     921             :  */
     922             : static struct ext4_group_info *
     923     5325082 : ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
     924             : {
     925     5325082 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
     926     5325082 :         struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
     927     5325082 :         rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
     928     5325082 :         struct ext4_group_info *grp = NULL, *iter;
     929     5325082 :         enum criteria cr = ac->ac_criteria;
     930             : 
                               /* check without the lock first, then again with it held */
     931     5325082 :         if (list_empty(frag_list))
     932             :                 return NULL;
     933      675269 :         read_lock(frag_list_lock);
     934      675370 :         if (list_empty(frag_list)) {
     935           0 :                 read_unlock(frag_list_lock);
     936           0 :                 return NULL;
     937             :         }
     938    11476296 :         list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
     939    10968000 :                 if (sbi->s_mb_stats)
     940           0 :                         atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
     941    10968000 :                 if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
     942             :                         grp = iter;
     943             :                         break;
     944             :                 }
     945             :         }
     946      674484 :         read_unlock(frag_list_lock);
     947      674484 :         return grp;
     948             : }
     949             : 
     950             : /*
     951             :  * Choose next group by traversing average fragment size list of suitable
     952             :  * order. Updates *new_cr if cr level needs an update.
                        *
                        * The scan starts at mb_avg_fragment_size_order() of the goal length
                        * (ac->ac_g_ex.fe_len) and moves to higher orders. A found group is
                        * stored in *group and EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED is set in
                        * ac->ac_flags; otherwise *new_cr is set to CR_BEST_AVAIL_LEN.
                        * @ngroups is not used here.
     953             :  */
     954      796074 : static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
     955             :                 enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
     956             : {
     957      796074 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
     958      796074 :         struct ext4_group_info *grp = NULL;
     959      796074 :         int i;
     960             : 
                               /* the flag is set below when this function suggests a group */
     961      796074 :         if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
     962          37 :                 if (sbi->s_mb_stats)
     963           0 :                         atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
     964             :         }
     965             : 
     966      796074 :         for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
     967     5392846 :              i < MB_NUM_ORDERS(ac->ac_sb); i++) {
     968     4612904 :                 grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
     969     4613002 :                 if (grp)
     970             :                         break;
     971             :         }
     972             : 
     973      796172 :         if (grp) {
     974       16230 :                 *group = grp->bb_group;
     975       16230 :                 ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
     976             :         } else {
     977      779942 :                 *new_cr = CR_BEST_AVAIL_LEN;
     978             :         }
     979      796172 : }
     980             : 
     981             : /*
     982             :  * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
     983             :  * order we have and proactively trim the goal request length to that order to
     984             :  * find a suitable group faster.
     985             :  *
     986             :  * This optimizes allocation speed at the cost of slightly reduced
     987             :  * preallocations. However, we make sure that we don't trim the request too
     988             :  * much and fall to CR_GOAL_LEN_SLOW in that case.
     989             :  */
     990      918557 : static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
     991             :                 enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
     992             : {
     993      918557 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
     994      918557 :         struct ext4_group_info *grp = NULL;
     995      918557 :         int i, order, min_order;
     996      918557 :         unsigned long num_stripe_clusters = 0;
     997             : 
     998      918557 :         if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
     999      138730 :                 if (sbi->s_mb_stats)
    1000           0 :                         atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
    1001             :         }
    1002             : 
    1003             :         /*
    1004             :          * mb_avg_fragment_size_order() returns order in a way that makes
    1005             :          * retrieving back the length using (1 << order) inaccurate. Hence, use
    1006             :          * fls() instead since we need to know the actual length while modifying
    1007             :          * goal length.
    1008             :          */
    1009      918557 :         order = fls(ac->ac_g_ex.fe_len) - 1;
    1010      918557 :         min_order = order - sbi->s_mb_best_avail_max_trim_order;
    1011      918557 :         if (min_order < 0)
    1012             :                 min_order = 0;
    1013             : 
    1014      918557 :         if (sbi->s_stripe > 0) {
    1015             :                 /*
    1016             :                  * We are assuming that stripe size is always a multiple of
    1017             :                  * cluster ratio otherwise __ext4_fill_super exists early.
    1018             :                  */
    1019      918523 :                 num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
    1020      918523 :                 if (1 << min_order < num_stripe_clusters)
    1021             :                         /*
    1022             :                          * We consider 1 order less because later we round
    1023             :                          * up the goal len to num_stripe_clusters
    1024             :                          */
    1025      821167 :                         min_order = fls(num_stripe_clusters) - 1;
    1026             :         }
    1027             : 
    1028      918557 :         if (1 << min_order < ac->ac_o_ex.fe_len)
    1029      488014 :                 min_order = fls(ac->ac_o_ex.fe_len);
    1030             : 
    1031     1480038 :         for (i = order; i >= min_order; i--) {
    1032      712060 :                 int frag_order;
    1033             :                 /*
    1034             :                  * Scale down goal len to make sure we find something
    1035             :                  * in the free fragments list. Basically, reduce
    1036             :                  * preallocations.
    1037             :                  */
    1038      712060 :                 ac->ac_g_ex.fe_len = 1 << i;
    1039             : 
    1040      712060 :                 if (num_stripe_clusters > 0) {
    1041             :                         /*
    1042             :                          * Try to round up the adjusted goal length to
    1043             :                          * stripe size (in cluster units) multiple for
    1044             :                          * efficiency.
    1045             :                          */
    1046      712077 :                         ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
    1047             :                                                      num_stripe_clusters);
    1048             :                 }
    1049             : 
    1050      712060 :                 frag_order = mb_avg_fragment_size_order(ac->ac_sb,
    1051             :                                                         ac->ac_g_ex.fe_len);
    1052             : 
    1053      712060 :                 grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
    1054      712174 :                 if (grp)
    1055             :                         break;
    1056             :         }
    1057             : 
    1058      918671 :         if (grp) {
    1059      150690 :                 *group = grp->bb_group;
    1060      150690 :                 ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
    1061             :         } else {
    1062             :                 /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
    1063      767981 :                 ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
    1064      767981 :                 *new_cr = CR_GOAL_LEN_SLOW;
    1065             :         }
    1066      918671 : }
    1067             : 
    1068   331213080 : static inline int should_optimize_scan(struct ext4_allocation_context *ac)
    1069             : {
    1070   331213080 :         if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
    1071             :                 return 0;
    1072   328227370 :         if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
    1073             :                 return 0;
    1074    15365166 :         if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
    1075        2416 :                 return 0;
    1076             :         return 1;
    1077             : }
    1078             : 
    1079             : /*
    1080             :  * Return next linear group for allocation. If linear traversal should not be
    1081             :  * performed, this function just returns the same group
    1082             :  */
    1083             : static int
    1084   164582975 : next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
    1085             : {
    1086   164582975 :         if (!should_optimize_scan(ac))
    1087   157866898 :                 goto inc_and_return;
    1088             : 
    1089     6774602 :         if (ac->ac_groups_linear_remaining) {
    1090     6774663 :                 ac->ac_groups_linear_remaining--;
    1091     6774663 :                 goto inc_and_return;
    1092             :         }
    1093             : 
    1094             :         return group;
    1095   164641561 : inc_and_return:
    1096             :         /*
    1097             :          * Artificially restricted ngroups for non-extent
    1098             :          * files makes group > ngroups possible on first loop.
    1099             :          */
    1100   164641561 :         return group + 1 >= ngroups ? 0 : group + 1;
    1101             : }
    1102             : 
    1103             : /*
    1104             :  * ext4_mb_choose_next_group: choose next group for allocation.
    1105             :  *
    1106             :  * @ac        Allocation Context
    1107             :  * @new_cr    This is an output parameter. If there is no good group
    1108             :  *            available at current CR level, this field is updated to indicate
    1109             :  *            the new cr level that should be used.
    1110             :  * @group     This is an input / output parameter. As an input it indicates the
    1111             :  *            next group that the allocator intends to use for allocation. As
    1112             :  *            output, this field indicates the next group that should be used as
    1113             :  *            determined by the optimization functions.
    1114             :  * @ngroups   Total number of groups
    1115             :  */
    1116   166305182 : static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
    1117             :                 enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
    1118             : {
    1119   166305182 :         *new_cr = ac->ac_criteria;
    1120             : 
    1121   166305182 :         if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
    1122   164718585 :                 *group = next_linear_group(ac, *group, ngroups);
    1123   164167449 :                 return;
    1124             :         }
    1125             : 
    1126     1819298 :         if (*new_cr == CR_POWER2_ALIGNED) {
    1127      104619 :                 ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
    1128     1714679 :         } else if (*new_cr == CR_GOAL_LEN_FAST) {
    1129      796071 :                 ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
    1130      918608 :         } else if (*new_cr == CR_BEST_AVAIL_LEN) {
    1131      918608 :                 ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
    1132             :         } else {
    1133             :                 /*
    1134             :                  * TODO: For CR=2, we can arrange groups in an rb tree sorted by
    1135             :                  * bb_free. But until that happens, we should never come here.
    1136             :                  */
    1137           0 :                 WARN_ON(1);
    1138             :         }
    1139             : }
    1140             : 
    1141             : /*
    1142             :  * Cache the order of the largest free extent we have available in this block
    1143             :  * group.
    1144             :  */
    1145             : static void
    1146     5696025 : mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
    1147             : {
    1148     5696025 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1149     5696025 :         int i;
    1150             : 
    1151    21167276 :         for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
    1152    21120788 :                 if (grp->bb_counters[i] > 0)
    1153             :                         break;
    1154             :         /* No need to move between order lists? */
    1155     5696025 :         if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
    1156     5403936 :             i == grp->bb_largest_free_order) {
    1157     5283465 :                 grp->bb_largest_free_order = i;
    1158     5283465 :                 return;
    1159             :         }
    1160             : 
    1161      412560 :         if (grp->bb_largest_free_order >= 0) {
    1162      210251 :                 write_lock(&sbi->s_mb_largest_free_orders_locks[
    1163             :                                               grp->bb_largest_free_order]);
    1164      210254 :                 list_del_init(&grp->bb_largest_free_order_node);
    1165      210252 :                 write_unlock(&sbi->s_mb_largest_free_orders_locks[
    1166             :                                               grp->bb_largest_free_order]);
    1167             :         }
    1168      412562 :         grp->bb_largest_free_order = i;
    1169      412562 :         if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
    1170      368742 :                 write_lock(&sbi->s_mb_largest_free_orders_locks[
    1171             :                                               grp->bb_largest_free_order]);
    1172      368740 :                 list_add_tail(&grp->bb_largest_free_order_node,
    1173      368740 :                       &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
    1174      368738 :                 write_unlock(&sbi->s_mb_largest_free_orders_locks[
    1175             :                                               grp->bb_largest_free_order]);
    1176             :         }
    1177             : }
    1178             : 
    1179             : static noinline_for_stack
    1180      160468 : void ext4_mb_generate_buddy(struct super_block *sb,
    1181             :                             void *buddy, void *bitmap, ext4_group_t group,
    1182             :                             struct ext4_group_info *grp)
    1183             : {
    1184      160468 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1185      160468 :         ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
    1186      160468 :         ext4_grpblk_t i = 0;
    1187      160468 :         ext4_grpblk_t first;
    1188      160468 :         ext4_grpblk_t len;
    1189      160468 :         unsigned free = 0;
    1190      160468 :         unsigned fragments = 0;
    1191      160468 :         unsigned long long period = get_cycles();
    1192             : 
    1193             :         /* initialize buddy from bitmap which is aggregation
    1194             :          * of on-disk bitmap and preallocations */
    1195      160466 :         i = mb_find_next_zero_bit(bitmap, max, 0);
    1196      160467 :         grp->bb_first_free = i;
    1197      399642 :         while (i < max) {
    1198      239174 :                 fragments++;
    1199      239174 :                 first = i;
    1200      239174 :                 i = mb_find_next_bit(bitmap, max, i);
    1201      239175 :                 len = i - first;
    1202      239175 :                 free += len;
    1203      239175 :                 if (len > 1)
    1204      230397 :                         ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
    1205             :                 else
    1206        8778 :                         grp->bb_counters[0]++;
    1207      239174 :                 if (i < max)
    1208       80441 :                         i = mb_find_next_zero_bit(bitmap, max, i);
    1209             :         }
    1210      160468 :         grp->bb_fragments = fragments;
    1211             : 
    1212      160468 :         if (free != grp->bb_free) {
    1213           0 :                 ext4_grp_locked_error(sb, group, 0, 0,
    1214             :                                       "block bitmap and bg descriptor "
    1215             :                                       "inconsistent: %u vs %u free clusters",
    1216             :                                       free, grp->bb_free);
    1217             :                 /*
    1218             :                  * If we intend to continue, we consider group descriptor
    1219             :                  * corrupt and update bb_free using bitmap value
    1220             :                  */
    1221           0 :                 grp->bb_free = free;
    1222           0 :                 ext4_mark_group_bitmap_corrupted(sb, group,
    1223             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    1224             :         }
    1225      160468 :         mb_set_largest_free_order(sb, grp);
    1226      160468 :         mb_update_avg_fragment_size(sb, grp);
    1227             : 
    1228      160468 :         clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
    1229             : 
    1230      160468 :         period = get_cycles() - period;
    1231      160468 :         atomic_inc(&sbi->s_mb_buddies_generated);
    1232      160467 :         atomic64_add(period, &sbi->s_mb_generation_time);
    1233      160468 : }
    1234             : 
    1235             : /* The buddy information is attached to the buddy cache inode
    1236             :  * for convenience. The information regarding each group
    1237             :  * is loaded via ext4_mb_load_buddy. The information involves
    1238             :  * block bitmap and buddy information. The information is
    1239             :  * stored in the inode as
    1240             :  *
    1241             :  * {                        page                        }
    1242             :  * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
    1243             :  *
    1244             :  *
    1245             :  * one block each for bitmap and buddy information.
    1246             :  * So for each group we take up 2 blocks. A page can
    1247             :  * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
    1248             :  * So it can have information regarding groups_per_page which
    1249             :  * is blocks_per_page/2
    1250             :  *
    1251             :  * Locking note:  This routine takes the block group lock of all groups
    1252             :  * for this page; do not hold this lock when calling this routine!
    1253             :  */
    1254             : 
    1255      310963 : static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
    1256             : {
    1257      310963 :         ext4_group_t ngroups;
    1258      310963 :         int blocksize;
    1259      310963 :         int blocks_per_page;
    1260      310963 :         int groups_per_page;
    1261      310963 :         int err = 0;
    1262      310963 :         int i;
    1263      310963 :         ext4_group_t first_group, group;
    1264      310963 :         int first_block;
    1265      310963 :         struct super_block *sb;
    1266      310963 :         struct buffer_head *bhs;
    1267      310963 :         struct buffer_head **bh = NULL;
    1268      310963 :         struct inode *inode;
    1269      310963 :         char *data;
    1270      310963 :         char *bitmap;
    1271      310963 :         struct ext4_group_info *grinfo;
    1272             : 
    1273      310963 :         inode = page->mapping->host;
    1274      310963 :         sb = inode->i_sb;
    1275      310963 :         ngroups = ext4_get_groups_count(sb);
    1276      310963 :         blocksize = i_blocksize(inode);
    1277      310963 :         blocks_per_page = PAGE_SIZE / blocksize;
    1278             : 
    1279      310963 :         mb_debug(sb, "init page %lu\n", page->index);
    1280             : 
    1281      310963 :         groups_per_page = blocks_per_page >> 1;
    1282      310963 :         if (groups_per_page == 0)
    1283             :                 groups_per_page = 1;
    1284             : 
    1285             :         /* allocate buffer_heads to read bitmaps */
    1286        3424 :         if (groups_per_page > 1) {
    1287        3408 :                 i = sizeof(struct buffer_head *) * groups_per_page;
    1288        3408 :                 bh = kzalloc(i, gfp);
    1289        3408 :                 if (bh == NULL)
    1290             :                         return -ENOMEM;
    1291             :         } else
    1292             :                 bh = &bhs;
    1293             : 
    1294      310963 :         first_group = page->index * blocks_per_page / 2;
    1295             : 
    1296             :         /* read all groups the page covers into the cache */
    1297      625067 :         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
    1298      314371 :                 if (group >= ngroups)
    1299             :                         break;
    1300             : 
    1301      314370 :                 grinfo = ext4_get_group_info(sb, group);
    1302      314371 :                 if (!grinfo)
    1303           0 :                         continue;
    1304             :                 /*
    1305             :                  * If page is uptodate then we came here after online resize
    1306             :                  * which added some new uninitialized group info structs, so
    1307             :                  * we must skip all initialized uptodate buddies on the page,
    1308             :                  * which may be currently in use by an allocating task.
    1309             :                  */
    1310      314371 :                 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
    1311           0 :                         bh[i] = NULL;
    1312           0 :                         continue;
    1313             :                 }
    1314      314371 :                 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
    1315      314370 :                 if (IS_ERR(bh[i])) {
    1316         266 :                         err = PTR_ERR(bh[i]);
    1317         266 :                         bh[i] = NULL;
    1318         266 :                         goto out;
    1319             :                 }
    1320             :                 mb_debug(sb, "read bitmap for group %u\n", group);
    1321             :         }
    1322             : 
    1323             :         /* wait for I/O completion */
    1324      624802 :         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
    1325      314105 :                 int err2;
    1326             : 
    1327      314105 :                 if (!bh[i])
    1328           1 :                         continue;
    1329      314104 :                 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
    1330      314104 :                 if (!err)
    1331      314105 :                         err = err2;
    1332             :         }
    1333             : 
    1334      310697 :         first_block = page->index * blocks_per_page;
    1335      631633 :         for (i = 0; i < blocks_per_page; i++) {
    1336      320936 :                 group = (first_block + i) >> 1;
    1337      320936 :                 if (group >= ngroups)
    1338             :                         break;
    1339             : 
    1340      320935 :                 if (!bh[group - first_group])
    1341             :                         /* skip initialized uptodate buddy */
    1342           0 :                         continue;
    1343             : 
    1344      641870 :                 if (!buffer_verified(bh[group - first_group]))
    1345             :                         /* Skip faulty bitmaps */
    1346           0 :                         continue;
    1347      320935 :                 err = 0;
    1348             : 
    1349             :                 /*
    1350             :                  * data carry information regarding this
    1351             :                  * particular group in the format specified
    1352             :                  * above
    1353             :                  *
    1354             :                  */
    1355      320935 :                 data = page_address(page) + (i * blocksize);
    1356      320935 :                 bitmap = bh[group - first_group]->b_data;
    1357             : 
    1358             :                 /*
    1359             :                  * We place the buddy block and bitmap block
    1360             :                  * close together
    1361             :                  */
    1362      320935 :                 if ((first_block + i) & 1) {
    1363             :                         /* this is block of buddy */
    1364      160468 :                         BUG_ON(incore == NULL);
    1365      160468 :                         mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
    1366             :                                 group, page->index, i * blocksize);
    1367      160468 :                         trace_ext4_mb_buddy_bitmap_load(sb, group);
    1368      160468 :                         grinfo = ext4_get_group_info(sb, group);
    1369      160468 :                         if (!grinfo) {
    1370           0 :                                 err = -EFSCORRUPTED;
    1371           0 :                                 goto out;
    1372             :                         }
    1373      160468 :                         grinfo->bb_fragments = 0;
    1374      160468 :                         memset(grinfo->bb_counters, 0,
    1375             :                                sizeof(*grinfo->bb_counters) *
    1376             :                                (MB_NUM_ORDERS(sb)));
    1377             :                         /*
    1378             :                          * incore got set to the group block bitmap below
    1379             :                          */
    1380      160468 :                         ext4_lock_group(sb, group);
    1381             :                         /* init the buddy */
    1382      160468 :                         memset(data, 0xff, blocksize);
    1383      160468 :                         ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
    1384      160468 :                         ext4_unlock_group(sb, group);
    1385      160468 :                         incore = NULL;
    1386             :                 } else {
    1387             :                         /* this is block of bitmap */
    1388      160467 :                         BUG_ON(incore != NULL);
    1389      160467 :                         mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
    1390             :                                 group, page->index, i * blocksize);
    1391      160467 :                         trace_ext4_mb_bitmap_load(sb, group);
    1392             : 
    1393             :                         /* see comments in ext4_mb_put_pa() */
    1394      160467 :                         ext4_lock_group(sb, group);
    1395      320934 :                         memcpy(data, bitmap, blocksize);
    1396             : 
    1397             :                         /* mark all preallocated blks used in in-core bitmap */
    1398      160467 :                         ext4_mb_generate_from_pa(sb, data, group);
    1399      160468 :                         ext4_mb_generate_from_freelist(sb, data, group);
    1400      160468 :                         ext4_unlock_group(sb, group);
    1401             : 
    1402             :                         /* set incore so that the buddy information can be
    1403             :                          * generated using this
    1404             :                          */
    1405      160468 :                         incore = data;
    1406             :                 }
    1407             :         }
    1408      310698 :         SetPageUptodate(page);
    1409             : 
    1410      310964 : out:
    1411      310964 :         if (bh) {
    1412      625336 :                 for (i = 0; i < groups_per_page; i++)
    1413      314372 :                         brelse(bh[i]);
    1414      310964 :                 if (bh != &bhs)
    1415        3408 :                         kfree(bh);
    1416             :         }
    1417             :         return err;
    1418             : }
    1419             : 
    1420             : /*
    1421             :  * Lock the buddy and bitmap pages. This makes sure other parallel init_group
    1422             :  * on the same buddy page doesn't happen while holding the buddy page lock.
    1423             :  * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
    1424             :  * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
    1425             :  */
    1426      157744 : static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
    1427             :                 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
    1428             : {
    1429      157744 :         struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
    1430      157744 :         int block, pnum, poff;
    1431      157744 :         int blocks_per_page;
    1432      157744 :         struct page *page;
    1433             : 
    1434      157744 :         e4b->bd_buddy_page = NULL;
    1435      157744 :         e4b->bd_bitmap_page = NULL;
    1436             : 
    1437      157744 :         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
    1438             :         /*
    1439             :          * the buddy cache inode stores the block bitmap
    1440             :          * and buddy information in consecutive blocks.
    1441             :          * So for each group we need two blocks.
    1442             :          */
    1443      157744 :         block = group * 2;
    1444      157744 :         pnum = block / blocks_per_page;
    1445      157744 :         poff = block % blocks_per_page;
    1446      157744 :         page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1447      157745 :         if (!page)
    1448             :                 return -ENOMEM;
    1449      157745 :         BUG_ON(page->mapping != inode->i_mapping);
    1450      157745 :         e4b->bd_bitmap_page = page;
    1451      157745 :         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
    1452             : 
    1453      157745 :         if (blocks_per_page >= 2) {
    1454             :                 /* buddy and bitmap are on the same page */
    1455             :                 return 0;
    1456             :         }
    1457             : 
    1458      154321 :         block++;
    1459      154321 :         pnum = block / blocks_per_page;
    1460      154321 :         page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1461      154321 :         if (!page)
    1462             :                 return -ENOMEM;
    1463      154321 :         BUG_ON(page->mapping != inode->i_mapping);
    1464      154321 :         e4b->bd_buddy_page = page;
    1465      154321 :         return 0;
    1466             : }
    1467             : 
    1468      157745 : static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
    1469             : {
    1470      157745 :         if (e4b->bd_bitmap_page) {
    1471      157745 :                 unlock_page(e4b->bd_bitmap_page);
    1472      157744 :                 put_page(e4b->bd_bitmap_page);
    1473             :         }
    1474      157744 :         if (e4b->bd_buddy_page) {
    1475      154320 :                 unlock_page(e4b->bd_buddy_page);
    1476      154321 :                 put_page(e4b->bd_buddy_page);
    1477             :         }
    1478      157744 : }
    1479             : 
    1480             : /*
    1481             :  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
    1482             :  * block group lock of all groups for this page; do not hold the BG lock when
    1483             :  * calling this routine!
    1484             :  */
    1485             : static noinline_for_stack
    1486      157744 : int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
    1487             : {
    1488             : 
    1489      157744 :         struct ext4_group_info *this_grp;
    1490      157744 :         struct ext4_buddy e4b;
    1491      157744 :         struct page *page;
    1492      157744 :         int ret = 0;
    1493             : 
    1494      157744 :         might_sleep();
    1495      157744 :         mb_debug(sb, "init group %u\n", group);
    1496      157744 :         this_grp = ext4_get_group_info(sb, group);
    1497      157744 :         if (!this_grp)
    1498             :                 return -EFSCORRUPTED;
    1499             : 
    1500             :         /*
    1501             :          * This ensures that we don't reinit the buddy cache
    1502             :          * page which maps to the group from which we are already
    1503             :          * allocating. If we are looking at the buddy cache we would
    1504             :          * have taken a reference using ext4_mb_load_buddy and that
    1505             :          * would have pinned buddy page to page cache.
    1506             :          * The call to ext4_mb_get_buddy_page_lock will mark the
    1507             :          * page accessed.
    1508             :          */
    1509      157744 :         ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
    1510      157745 :         if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
    1511             :                 /*
    1512             :                  * somebody initialized the group
    1513             :                  * return without doing anything
    1514             :                  */
    1515         530 :                 goto err;
    1516             :         }
    1517             : 
    1518      157215 :         page = e4b.bd_bitmap_page;
    1519      157215 :         ret = ext4_mb_init_cache(page, NULL, gfp);
    1520      157215 :         if (ret)
    1521         266 :                 goto err;
    1522      156949 :         if (!PageUptodate(page)) {
    1523           0 :                 ret = -EIO;
    1524           0 :                 goto err;
    1525             :         }
    1526             : 
    1527      156949 :         if (e4b.bd_buddy_page == NULL) {
    1528             :                 /*
    1529             :                  * If both the bitmap and buddy are in
    1530             :                  * the same page we don't need to force
    1531             :                  * init the buddy
    1532             :                  */
    1533        3424 :                 ret = 0;
    1534        3424 :                 goto err;
    1535             :         }
    1536             :         /* init buddy cache */
    1537      153525 :         page = e4b.bd_buddy_page;
    1538      153525 :         ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
    1539      153525 :         if (ret)
    1540           0 :                 goto err;
    1541      153525 :         if (!PageUptodate(page)) {
    1542           0 :                 ret = -EIO;
    1543           0 :                 goto err;
    1544             :         }
    1545      153525 : err:
    1546      157745 :         ext4_mb_put_buddy_page_lock(&e4b);
    1547      157745 :         return ret;
    1548             : }
    1549             : 
                       /*
                        * ext4_mb_load_buddy_gfp() - fill in @e4b with the bitmap and buddy
                        * of @group.
                        *
                        * Looks up the group info (returning -EFSCORRUPTED when there is none),
                        * runs ext4_mb_init_group() first if the group is still flagged
                        * EXT4_MB_GRP_NEED_INIT, and then fetches two blocks from the buddy
                        * cache inode's mapping: block group * 2 (bitmap) and block
                        * group * 2 + 1 (buddy).  Each page is first looked up with
                        * find_get_page_flags(); only when it is missing or not uptodate is it
                        * obtained again with find_or_create_page() and initialised with
                        * ext4_mb_init_cache() before being unlocked.
                        *
                        * Returns 0 with a reference held on both pages, stored in
                        * e4b->bd_bitmap_page and e4b->bd_buddy_page (ext4_mb_unload_buddy()
                        * puts them).  On failure returns -ENOMEM (no page), -EIO (page not
                        * uptodate), -EINVAL (page belongs to another mapping) or the error
                        * from ext4_mb_init_group()/ext4_mb_init_cache(); the err path drops
                        * the page references taken here.
                        */
    1550             : /*
    1551             :  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
    1552             :  * block group lock of all groups for this page; do not hold the BG lock when
    1553             :  * calling this routine!
    1554             :  */
    1555             : static noinline_for_stack int
    1556    21420778 : ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
    1557             :                        struct ext4_buddy *e4b, gfp_t gfp)
    1558             : {
    1559    21420778 :         int blocks_per_page;
    1560    21420778 :         int block;
    1561    21420778 :         int pnum;
    1562    21420778 :         int poff;
    1563    21420778 :         struct page *page;
    1564    21420778 :         int ret;
    1565    21420778 :         struct ext4_group_info *grp;
    1566    21420778 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    1567    21420778 :         struct inode *inode = sbi->s_buddy_cache;
    1568             : 
    1569    21420778 :         might_sleep();
    1570    21430385 :         mb_debug(sb, "load group %u\n", group);
    1571             : 
    1572    21430385 :         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
    1573    21430385 :         grp = ext4_get_group_info(sb, group);
    1574    21430751 :         if (!grp)
    1575             :                 return -EFSCORRUPTED;
    1576             : 
                               /*
                                * Start from a clean descriptor: both page pointers are NULL so
                                * the err path below can tell which references were taken.
                                */
    1577    21430751 :         e4b->bd_blkbits = sb->s_blocksize_bits;
    1578    21430751 :         e4b->bd_info = grp;
    1579    21430751 :         e4b->bd_sb = sb;
    1580    21430751 :         e4b->bd_group = group;
    1581    21430751 :         e4b->bd_buddy_page = NULL;
    1582    21430751 :         e4b->bd_bitmap_page = NULL;
    1583             : 
    1584    21430751 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    1585             :                 /*
    1586             :                  * we need full data about the group
    1587             :                  * to make a good selection
    1588             :                  */
    1589        3558 :                 ret = ext4_mb_init_group(sb, group, gfp);
    1590        3559 :                 if (ret)
    1591             :                         return ret;
    1592             :         }
    1593             : 
    1594             :         /*
    1595             :          * the buddy cache inode stores the block bitmap
    1596             :          * and buddy information in consecutive blocks.
    1597             :          * So for each group we need two blocks.
    1598             :          */
    1599    21430752 :         block = group * 2;
    1600    21430752 :         pnum = block / blocks_per_page;
    1601    21430752 :         poff = block % blocks_per_page;
    1602             : 
    1603             :         /* we could use find_or_create_page(), but it locks the page,
    1604             :          * which we'd like to avoid in the fast path ... */
    1605    21430752 :         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
    1606    21439754 :         if (page == NULL || !PageUptodate(page)) {
    1607         112 :                 if (page)
    1608             :                         /*
    1609             :                          * drop the page reference and try
    1610             :                          * to get the page with lock. If we
    1611             :                          * are not uptodate that implies
    1612             :                          * somebody just created the page but
    1613             :                          * is yet to initialize the same. So
    1614             :                          * wait for it to initialize.
    1615             :                          */
    1616           0 :                         put_page(page);
    1617         112 :                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1618         112 :                 if (page) {
    1619         112 :                         if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
    1620             :         "ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
    1621             :                                 /* should never happen */
    1622           0 :                                 unlock_page(page);
    1623           0 :                                 ret = -EINVAL;
    1624           0 :                                 goto err;
    1625             :                         }
    1626         112 :                         if (!PageUptodate(page)) {
                                                       /* NULL: no already-loaded bitmap to hand over for this page */
    1627         112 :                                 ret = ext4_mb_init_cache(page, NULL, gfp);
    1628         112 :                                 if (ret) {
    1629           0 :                                         unlock_page(page);
    1630           0 :                                         goto err;
    1631             :                                 }
    1632             :                                 mb_cmp_bitmaps(e4b, page_address(page) +
    1633             :                                                (poff * sb->s_blocksize));
    1634             :                         }
    1635         112 :                         unlock_page(page);
    1636             :                 }
    1637             :         }
    1638    21433896 :         if (page == NULL) {
    1639           0 :                 ret = -ENOMEM;
    1640           0 :                 goto err;
    1641             :         }
    1642    21433896 :         if (!PageUptodate(page)) {
    1643           0 :                 ret = -EIO;
    1644           0 :                 goto err;
    1645             :         }
    1646             : 
    1647             :         /* Pages marked accessed already */
    1648    21431644 :         e4b->bd_bitmap_page = page;
    1649    21431644 :         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
    1650             : 
                               /* the buddy lives in the block right after the bitmap */
    1651    21431644 :         block++;
    1652    21431644 :         pnum = block / blocks_per_page;
    1653    21431644 :         poff = block % blocks_per_page;
    1654             : 
    1655    21431644 :         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
    1656    21444747 :         if (page == NULL || !PageUptodate(page)) {
    1657         113 :                 if (page)
    1658           1 :                         put_page(page);
    1659         113 :                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
    1660         113 :                 if (page) {
    1661         113 :                         if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
    1662             :         "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
    1663             :                                 /* should never happen */
    1664           0 :                                 unlock_page(page);
    1665           0 :                                 ret = -EINVAL;
    1666           0 :                                 goto err;
    1667             :                         }
    1668         113 :                         if (!PageUptodate(page)) {
                                                       /* pass the bitmap loaded above to the buddy initialisation */
    1669         112 :                                 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
    1670             :                                                          gfp);
    1671         112 :                                 if (ret) {
    1672           0 :                                         unlock_page(page);
    1673           0 :                                         goto err;
    1674             :                                 }
    1675             :                         }
    1676         113 :                         unlock_page(page);
    1677             :                 }
    1678             :         }
    1679    21442101 :         if (page == NULL) {
    1680           0 :                 ret = -ENOMEM;
    1681           0 :                 goto err;
    1682             :         }
    1683    21442101 :         if (!PageUptodate(page)) {
    1684           0 :                 ret = -EIO;
    1685           0 :                 goto err;
    1686             :         }
    1687             : 
    1688             :         /* Pages marked accessed already */
    1689    21435251 :         e4b->bd_buddy_page = page;
    1690    21435251 :         e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
    1691             : 
    1692    21435251 :         return 0;
    1693             : 
                       /*
                        * Error unwind: "page" is the page being worked on when the failure
                        * happened (possibly NULL); bd_bitmap_page is non-NULL only if the
                        * bitmap half had already succeeded.
                        * NOTE(review): bd_bitmap_page itself is not reset to NULL here, so
                        * presumably callers must not call ext4_mb_unload_buddy() after a
                        * failed load - confirm against the callers.
                        */
    1694           0 : err:
    1695           0 :         if (page)
    1696           0 :                 put_page(page);
    1697           0 :         if (e4b->bd_bitmap_page)
    1698           0 :                 put_page(e4b->bd_bitmap_page);
    1699             : 
    1700           0 :         e4b->bd_buddy = NULL;
    1701           0 :         e4b->bd_bitmap = NULL;
    1702           0 :         return ret;
    1703             : }
    1704             : 
                       /*
                        * Convenience wrapper: load the buddy for @group into @e4b using
                        * GFP_NOFS for any allocation.  Returns whatever
                        * ext4_mb_load_buddy_gfp() returns.
                        */
    1705             : static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
    1706             :                               struct ext4_buddy *e4b)
    1707             : {
    1708    18324438 :         return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
    1709             : }
    1710             : 
                       /*
                        * Drop the page references held in @e4b: one on the bitmap page and one
                        * on the buddy page, each only if the pointer is non-NULL.  The pointers
                        * themselves are left untouched.
                        */
    1711    21448637 : static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
    1712             : {
    1713    21448637 :         if (e4b->bd_bitmap_page)
    1714    21448637 :                 put_page(e4b->bd_bitmap_page);
    1715    21449445 :         if (e4b->bd_buddy_page)
    1716    21449445 :                 put_page(e4b->bd_buddy_page);
    1717    21447267 : }
    1718             : 
    1719             : 
                       /*
                        * Find the order of the buddy chunk that covers @block.
                        *
                        * Walks the buddy bitmaps from order 1 up to bd_blkbits + 1 and returns
                        * the first order whose bit for (block >> order) is clear.  Returns 0
                        * when no order in that range has a clear bit for the block.
                        *
                        * BUGs if the bitmap and buddy pointers are the same or if @block is
                        * not below 1 << (bd_blkbits + 3).
                        */
    1720   500207666 : static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
    1721             : {
    1722   500207666 :         int order = 1, max;
    1723   500207666 :         void *bb;
    1724             : 
    1725   500207666 :         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
    1726   500207666 :         BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
    1727             : 
    1728  2651480684 :         while (order <= e4b->bd_blkbits + 1) {
    1729  2519067202 :                 bb = mb_find_buddy(e4b, order, &max);
    1730  2468578794 :                 if (!mb_test_bit(block >> order, bb)) {
    1731             :                         /* this block is part of buddy of order 'order' */
    1732   370389820 :                         return order;
    1733             :                 }
    1734  2151273018 :                 order++;
    1735             :         }
    1736             :         return 0;
    1737             : }
    1738             : 
                       /*
                        * Clear @len bits of bitmap @bm starting at bit @cur.
                        *
                        * Whenever the current position is 32-bit aligned and at least 32 bits
                        * remain, a whole __u32 is zeroed with one store; everything else goes
                        * through mb_clear_bit() one bit at a time.
                        */
    1739     2644773 : static void mb_clear_bits(void *bm, int cur, int len)
    1740             : {
    1741     2644773 :         __u32 *addr;
    1742             : 
                               /* from here on "len" is the exclusive end index, not a count */
    1743     2644773 :         len = cur + len;
    1744    32159553 :         while (cur < len) {
    1745    29514769 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1746             :                         /* fast path: clear whole word at once */
                                               /* cur >> 3 is the byte offset of bit cur within bm */
    1747     7001371 :                         addr = bm + (cur >> 3);
    1748     7001371 :                         *addr = 0;
    1749     7001371 :                         cur += 32;
    1750     7001371 :                         continue;
    1751             :                 }
    1752    22513398 :                 mb_clear_bit(cur, bm);
    1753    22513409 :                 cur++;
    1754             :         }
    1755     2644784 : }
    1756             : 
    1757             : /* clear bits in given range
    1758             :  * will return first found zero bit if any, -1 otherwise
    1759             :  */
    1760     2838366 : static int mb_test_and_clear_bits(void *bm, int cur, int len)
    1761             : {
    1762     2838366 :         __u32 *addr;
                               /* index of the first bit found already clear; -1 = none seen yet */
    1763     2838366 :         int zero_bit = -1;
    1764             : 
                               /* from here on "len" is the exclusive end index, not a count */
    1765     2838366 :         len = cur + len;
    1766    67768396 :         while (cur < len) {
    1767    64930022 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1768             :                         /* fast path: clear whole word at once */
    1769    24702319 :                         addr = bm + (cur >> 3);
                                               /*
                                                * A word that is not all ones contains an already
                                                * clear bit; locate it inside the word and convert
                                                * it to an index relative to bm.
                                                */
    1770    24702319 :                         if (*addr != (__u32)(-1) && zero_bit == -1)
    1771           0 :                                 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
    1772    24702319 :                         *addr = 0;
    1773    24702319 :                         cur += 32;
    1774    24702319 :                         continue;
    1775             :                 }
                                       /* slow path: a zero return means the bit was already clear */
    1776    40227703 :                 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
    1777           0 :                         zero_bit = cur;
    1778    40227711 :                 cur++;
    1779             :         }
    1780             : 
    1781     2838374 :         return zero_bit;
    1782             : }
    1783             : 
                       /*
                        * Set @len bits of bitmap @bm starting at bit @cur.
                        *
                        * Mirror image of the clearing helper: a 32-bit aligned run of at
                        * least 32 bits is written as one 0xffffffff word, the rest is set bit
                        * by bit with mb_set_bit().  Not static, so it is visible outside this
                        * file.
                        */
    1784     6317915 : void mb_set_bits(void *bm, int cur, int len)
    1785             : {
    1786     6317915 :         __u32 *addr;
    1787             : 
                               /* from here on "len" is the exclusive end index, not a count */
    1788     6317915 :         len = cur + len;
    1789   100922173 :         while (cur < len) {
    1790    94604334 :                 if ((cur & 31) == 0 && (len - cur) >= 32) {
    1791             :                         /* fast path: set whole word at once */
    1792    35109457 :                         addr = bm + (cur >> 3);
    1793    35109457 :                         *addr = 0xffffffff;
    1794    35109457 :                         cur += 32;
    1795    35109457 :                         continue;
    1796             :                 }
    1797    59494877 :                 mb_set_bit(cur, bm);
    1798    59494801 :                 cur++;
    1799             :         }
    1800     6317839 : }
    1801             : 
                       /*
                        * Adjust one border of a range being freed within a single buddy
                        * bitmap.
                        *
                        * @bit:    in/out index of the border bit
                        * @bitmap: buddy bitmap of the current order
                        * @side:   -1 to look at the left neighbour, +1 for the right one
                        *
                        * If the neighbour bit (*bit + side) is set, the border bit is cleared
                        * and *bit moves inwards by one (away from the neighbour); returns 1.
                        * Otherwise *bit moves onto the neighbour, that bit is set, and -1 is
                        * returned.  The caller adds the return value to bb_counters[] of the
                        * current order.
                        */
    1802     9643900 : static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
    1803             : {
    1804     9643900 :         if (mb_test_bit(*bit + side, bitmap)) {
    1805     7340588 :                 mb_clear_bit(*bit, bitmap);
    1806     7340607 :                 (*bit) -= side;
    1807     7340607 :                 return 1;
    1808             :         }
    1809             :         else {
    1810     2303322 :                 (*bit) += side;
    1811     2303322 :                 mb_set_bit(*bit, bitmap);
    1812     2303322 :                 return -1;
    1813             :         }
    1814             : }
    1815             : 
                       /*
                        * Propagate a freed range upwards through the buddy bitmaps.
                        *
                        * @first and @last are inclusive indices in the order-1 buddy (the
                        * caller in mb_free_blocks() passes its order-0 range shifted right by
                        * one).  At each order the two borders are fixed up with
                        * mb_buddy_adjust_border(), then the range is halved and the next order
                        * is processed.  The walk ends when the range becomes empty or when
                        * there is no higher-order buddy, in which case the remaining range is
                        * cleared in the current buddy and counted in bb_counters[].
                        */
    1816     2577456 : static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
    1817             : {
    1818     2577456 :         int max;
    1819     2577456 :         int order = 1;
    1820     2577456 :         void *buddy = mb_find_buddy(e4b, order, &max);
    1821             : 
    1822    10997852 :         while (buddy) {
    1823    10997841 :                 void *buddy2;
    1824             : 
    1825             :                 /* Bits in range [first; last] are known to be set since
    1826             :                  * corresponding blocks were allocated. Bits in range
    1827             :                  * (first; last) will stay set because they form buddies on
    1828             :                  * upper layer. We just deal with borders if they don't
    1829             :                  * align with upper layer and then go up.
    1830             :                  * Releasing entire group is all about clearing
    1831             :                  * single bit of highest order buddy.
    1832             :                  */
    1833             : 
    1834             :                 /* Example:
    1835             :                  * ---------------------------------
    1836             :                  * |   1   |   1   |   1   |   1   |
    1837             :                  * ---------------------------------
    1838             :                  * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
    1839             :                  * ---------------------------------
    1840             :                  *   0   1   2   3   4   5   6   7
    1841             :                  *      \_____________________/
    1842             :                  *
    1843             :                  * Neither [1] nor [6] is aligned to above layer.
    1844             :                  * Left neighbour [0] is free, so mark it busy,
    1845             :                  * decrease bb_counters and extend range to
    1846             :                  * [0; 6]
    1847             :                  * Right neighbour [7] is busy. It can't be coalesced with [6], so
    1848             :                  * mark [6] free, increase bb_counters and shrink range to
    1849             :                  * [0; 5].
    1850             :                  * Then shift range to [0; 2], go up and do the same.
    1851             :                  */
    1852             : 
    1853             : 
                                       /* an odd "first" / even "last" does not line up with the next order */
    1854    10997841 :                 if (first & 1)
    1855     5139832 :                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
    1856    10997827 :                 if (!(last & 1))
    1857     4504492 :                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
                                       /* the border fix-ups consumed the whole range: nothing to carry up */
    1858    10997800 :                 if (first > last)
    1859             :                         break;
    1860     8440519 :                 order++;
    1861             : 
    1862     8440519 :                 buddy2 = mb_find_buddy(e4b, order, &max);
    1863     8440549 :                 if (!buddy2) {
                                               /*
                                                * No higher order exists: free the remaining range in
                                                * the current buddy ("order" was already incremented,
                                                * hence order - 1).
                                                */
    1864       20153 :                         mb_clear_bits(buddy, first, last - first + 1);
    1865       20153 :                         e4b->bd_info->bb_counters[order - 1] += last - first + 1;
    1866       20153 :                         break;
    1867             :                 }
    1868     8420396 :                 first >>= 1;
    1869     8420396 :                 last >>= 1;
    1870     8420396 :                 buddy = buddy2;
    1871             :         }
    1872     2577434 : }
    1873             : 
                       /*
                        * Mark @count blocks starting at bit @first as free in the loaded buddy
                        * @e4b.
                        *
                        * @inode is only used for its inode number when reporting an error and
                        * may be NULL.  The group lock must be held (asserted below).  Nothing
                        * is done when @count is 0 (with a warning) or when the group's block
                        * bitmap is already flagged corrupt.
                        *
                        * Updates bb_free, bb_first_free, bb_fragments and bb_counters[], clears
                        * the bits in bd_bitmap, propagates the change into the higher-order
                        * buddies and finally refreshes the largest-free-order and
                        * average-fragment-size bookkeeping.
                        */
    1874     2838341 : static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
    1875             :                            int first, int count)
    1876             : {
    1877     2838341 :         int left_is_free = 0;
    1878     2838341 :         int right_is_free = 0;
    1879     2838341 :         int block;
    1880     2838341 :         int last = first + count - 1;
    1881     2838341 :         struct super_block *sb = e4b->bd_sb;
    1882             : 
    1883     2838341 :         if (WARN_ON(count == 0))
    1884             :                 return;
    1885     2838341 :         BUG_ON(last >= (sb->s_blocksize << 3));
    1886     2838341 :         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
    1887             :         /* Don't bother if the block group is corrupt. */
    1888     2838341 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
    1889             :                 return;
    1890             : 
    1891     2838341 :         mb_check_buddy(e4b);
    1892     2838341 :         mb_free_blocks_double(inode, e4b, first, count);
    1893             : 
    1894     2838341 :         this_cpu_inc(discard_pa_seq);
    1895     2838332 :         e4b->bd_info->bb_free += count;
    1896     2838332 :         if (first < e4b->bd_info->bb_first_free)
    1897       90494 :                 e4b->bd_info->bb_first_free = first;
    1898             : 
    1899             :         /* access memory sequentially: check left neighbour,
    1900             :          * clear range and then check right neighbour
    1901             :          */
    1902     2838332 :         if (first != 0)
    1903     2816726 :                 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
                               /* block = first bit in the range that was already clear, or -1 */
    1904     2838361 :         block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
    1905     2838382 :         if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
    1906     2815523 :                 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
    1907             : 
                               /*
                                * Part of the range was already free.  Outside fast-commit replay
                                * this is reported and the group's bitmap is marked corrupt.
                                * NOTE(review): bb_free and bb_first_free were already updated
                                * above, while the fragment and buddy updates below are skipped on
                                * this path - confirm that leaving the counters adjusted is
                                * intended.
                                */
    1908     2838374 :         if (unlikely(block != -1)) {
    1909           0 :                 struct ext4_sb_info *sbi = EXT4_SB(sb);
    1910           0 :                 ext4_fsblk_t blocknr;
    1911             : 
    1912           0 :                 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
    1913           0 :                 blocknr += EXT4_C2B(sbi, block);
    1914           0 :                 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
    1915           0 :                         ext4_grp_locked_error(sb, e4b->bd_group,
    1916             :                                               inode ? inode->i_ino : 0,
    1917             :                                               blocknr,
    1918             :                                               "freeing already freed block (bit %u); block bitmap corrupt.",
    1919             :                                               block);
    1920           0 :                         ext4_mark_group_bitmap_corrupted(
    1921             :                                 sb, e4b->bd_group,
    1922             :                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    1923             :                 }
    1924           0 :                 goto done;
    1925             :         }
    1926             : 
    1927             :         /* let's maintain fragments counter */
                               /*
                                * both neighbours free: two fragments merge into one (-1);
                                * neither free: a new isolated fragment appears (+1);
                                * exactly one free: an existing fragment just grows (no change).
                                */
    1928     2838374 :         if (left_is_free && right_is_free)
    1929      481031 :                 e4b->bd_info->bb_fragments--;
    1930     2357343 :         else if (!left_is_free && !right_is_free)
    1931     1608482 :                 e4b->bd_info->bb_fragments++;
    1932             : 
    1933             :         /* buddy[0] == bd_bitmap is a special case, so handle
    1934             :          * it right away and let mb_buddy_mark_free stay free of
    1935             :          * zero order checks.
    1936             :          * Check if neighbours are to be coalesced,
    1937             :          * adjust bitmap bb_counters and borders appropriately.
    1938             :          */
    1939     2838374 :         if (first & 1) {
    1940     1203968 :                 first += !left_is_free;
    1941     2158767 :                 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
    1942             :         }
    1943     2838374 :         if (!(last & 1)) {
    1944     1164142 :                 last -= !right_is_free;
    1945     1909039 :                 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
    1946             :         }
    1947             : 
                               /* hand the remaining range to the order-1 level and above */
    1948     2838374 :         if (first <= last)
    1949     2577458 :                 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
    1950             : 
    1951      260916 : done:
    1952     2838348 :         mb_set_largest_free_order(sb, e4b->bd_info);
    1953     2838376 :         mb_update_avg_fragment_size(sb, e4b->bd_info);
    1954     2838390 :         mb_check_buddy(e4b);
    1955             : }
    1956             : 
                       /*
                        * Find the free extent that starts at @block and is as close to @needed
                        * blocks long as the buddy data allows.
                        *
                        * The result is written to @ex (fe_start, fe_len, fe_group); the return
                        * value is ex->fe_len.  If @block itself is in use, @ex is zeroed and 0
                        * is returned.  Otherwise the extent begins with the remainder of the
                        * buddy chunk containing @block and is extended chunk by chunk while
                        * it is still shorter than @needed and the following block is free.
                        *
                        * The group lock must be held (asserted below).
                        */
    1957   156390378 : static int mb_find_extent(struct ext4_buddy *e4b, int block,
    1958             :                                 int needed, struct ext4_free_extent *ex)
    1959             : {
    1960   156390378 :         int next = block;
    1961   156390378 :         int max, order;
    1962   156390378 :         void *buddy;
    1963             : 
    1964   156390378 :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
    1965   156390378 :         BUG_ON(ex == NULL);
    1966             : 
    1967   156390378 :         buddy = mb_find_buddy(e4b, 0, &max);
    1968   156267465 :         BUG_ON(buddy == NULL);
    1969   156267465 :         BUG_ON(block >= max);
    1970   156267465 :         if (mb_test_bit(block, buddy)) {
    1971      964723 :                 ex->fe_len = 0;
    1972      964723 :                 ex->fe_start = 0;
    1973      964723 :                 ex->fe_group = 0;
    1974      964723 :                 return 0;
    1975             :         }
    1976             : 
    1977             :         /* find actual order */
    1978   155240608 :         order = mb_find_order_for_block(e4b, block);
                               /* "block" now indexes the chunk at that order, not a single block */
    1979   154691550 :         block = block >> order;
    1980             : 
    1981   154691550 :         ex->fe_len = 1 << order;
    1982   154691550 :         ex->fe_start = block << order;
    1983   154691550 :         ex->fe_group = e4b->bd_group;
    1984             : 
    1985             :         /* calc difference from given start */
    1986   154691550 :         next = next - ex->fe_start;
    1987   154691550 :         ex->fe_len -= next;
    1988   154691550 :         ex->fe_start += next;
    1989             : 
    1990   976209383 :         while (needed > ex->fe_len &&
    1991   486244875 :                mb_find_buddy(e4b, order, &max)) {
    1992             : 
                                       /* the current chunk is the last one at this order */
    1993   480903520 :                 if (block + 1 >= max)
    1994             :                         break;
    1995             : 
                                       /* first block right after the current chunk */
    1996   480493982 :                 next = (block + 1) * (1 << order);
    1997   480493982 :                 if (mb_test_bit(next, e4b->bd_bitmap))
    1998             :                         break;
    1999             : 
    2000   340932720 :                 order = mb_find_order_for_block(e4b, next);
    2001             : 
    2002   340614313 :                 block = next >> order;
    2003   340614313 :                 ex->fe_len += 1 << order;
    2004             :         }
    2005             : 
    2006   154767360 :         if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
    2007             :                 /* Should never happen! (but apparently sometimes does?!?) */
                                       /*
                                        * NOTE(review): the message below prints ex->fe_logical,
                                        * which this function never assigns; its value is whatever
                                        * the caller left in *ex - confirm that is intended.
                                        */
    2008           0 :                 WARN_ON(1);
    2009           0 :                 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
    2010             :                         "corruption or bug in mb_find_extent "
    2011             :                         "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
    2012             :                         block, order, needed, ex->fe_group, ex->fe_start,
    2013             :                         ex->fe_len, ex->fe_logical);
    2014           0 :                 ex->fe_len = 0;
    2015           0 :                 ex->fe_start = 0;
    2016           0 :                 ex->fe_group = 0;
    2017             :         }
    2018   154767360 :         return ex->fe_len;
    2019             : }
    2020             : 
                       /*
                        * Mark the extent @ex (fe_start, fe_len) as allocated in the loaded
                        * buddy @e4b.
                        *
                        * Updates bb_free, bb_first_free, bb_fragments and bb_counters[], sets
                        * the matching bits in the buddy bitmaps (splitting larger chunks where
                        * the extent does not cover a whole chunk) and finally sets the bits in
                        * bd_bitmap.  The group lock must be held (asserted below) and
                        * ex->fe_group must equal e4b->bd_group.
                        *
                        * Returns 0 when no chunk had to be split.  Otherwise returns the value
                        * recorded at the first split: the length still to be marked in the low
                        * 16 bits and the order of the chunk being split in the bits above.
                        */
    2021     2697199 : static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
    2022             : {
    2023     2697199 :         int ord;
    2024     2697199 :         int mlen = 0;
    2025     2697199 :         int max = 0;
    2026     2697199 :         int cur;
    2027     2697199 :         int start = ex->fe_start;
    2028     2697199 :         int len = ex->fe_len;
    2029     2697199 :         unsigned ret = 0;
    2030     2697199 :         int len0 = len;
    2031     2697199 :         void *buddy;
    2032     2697199 :         bool split = false;
    2033             : 
    2034     2697199 :         BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
    2035     2697199 :         BUG_ON(e4b->bd_group != ex->fe_group);
    2036     2697199 :         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
    2037     2697199 :         mb_check_buddy(e4b);
    2038     2697199 :         mb_mark_used_double(e4b, start, len);
    2039             : 
    2040     2697199 :         this_cpu_inc(discard_pa_seq);
    2041     2697182 :         e4b->bd_info->bb_free -= len;
    2042     2697182 :         if (e4b->bd_info->bb_first_free == start)
    2043      560015 :                 e4b->bd_info->bb_first_free += len;
    2044             : 
    2045             :         /* let's maintain fragments counter */
                               /*
                                * mlen and max are used here as "left neighbour is free" and
                                * "right neighbour is free" flags; both get their real meaning
                                * (chunk length, buddy size) only inside the loop below.
                                */
    2046     2697182 :         if (start != 0)
    2047     2673633 :                 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
    2048     2697170 :         if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
    2049     2672826 :                 max = !mb_test_bit(start + len, e4b->bd_bitmap);
    2050     2697180 :         if (mlen && max)
    2051      176816 :                 e4b->bd_info->bb_fragments++;
    2052     2520364 :         else if (!mlen && !max)
    2053     1050992 :                 e4b->bd_info->bb_fragments--;
    2054             : 
    2055             :         /* let's maintain buddy itself */
    2056    14177665 :         while (len) {
                                       /*
                                        * Right after a split, ord and buddy already describe the
                                        * lower order set up at the bottom of the previous
                                        * iteration, so the lookup is skipped.
                                        */
    2057    11480472 :                 if (!split)
    2058     8481605 :                         ord = mb_find_order_for_block(e4b, start);
    2059             : 
                                       /* start is aligned to the chunk and the chunk fits in what is left */
    2060    11480419 :                 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
    2061             :                         /* the whole chunk may be allocated at once! */
    2062     8481844 :                         mlen = 1 << ord;
    2063     8481844 :                         if (!split)
    2064     6514848 :                                 buddy = mb_find_buddy(e4b, ord, &max);
    2065             :                         else
    2066             :                                 split = false;
    2067     8481805 :                         BUG_ON((start >> ord) >= max);
    2068     8481805 :                         mb_set_bit(start >> ord, buddy);
    2069     8481720 :                         e4b->bd_info->bb_counters[ord]--;
    2070     8481720 :                         start += mlen;
    2071     8481720 :                         len -= mlen;
    2072     8481720 :                         BUG_ON(len < 0);
    2073     8481720 :                         continue;
    2074             :                 }
    2075             : 
    2076             :                 /* store for history */
                                       /* only the first split is recorded: remaining length | order << 16 */
    2077     2998575 :                 if (ret == 0)
    2078      921151 :                         ret = len | (ord << 16);
    2079             : 
    2080             :                 /* we have to split large buddy */
    2081     2998575 :                 BUG_ON(ord <= 0);
                                       /* take the chunk out of its current order ... */
    2082     2998575 :                 buddy = mb_find_buddy(e4b, ord, &max);
    2083     2998570 :                 mb_set_bit(start >> ord, buddy);
    2084     2998565 :                 e4b->bd_info->bb_counters[ord]--;
    2085             : 
                                       /* ... and add its two halves as free chunks one order below */
    2086     2998565 :                 ord--;
    2087     2998565 :                 cur = (start >> ord) & ~1U;
    2088     2998565 :                 buddy = mb_find_buddy(e4b, ord, &max);
    2089     2998569 :                 mb_clear_bit(cur, buddy);
    2090     2998567 :                 mb_clear_bit(cur + 1, buddy);
    2091     2998765 :                 e4b->bd_info->bb_counters[ord]++;
    2092     2998765 :                 e4b->bd_info->bb_counters[ord]++;
    2093     2998765 :                 split = true;
    2094             :         }
    2095     2697193 :         mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
    2096             : 
    2097     2697146 :         mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
                               /* len is 0 by now; len0 still holds the original extent length */
    2098     2697113 :         mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
    2099     2697089 :         mb_check_buddy(e4b);
    2100             : 
    2101     2697089 :         return ret;
    2102             : }
    2103             : 
    2104             : /*
    2105             :  * Must be called under group lock!
    2106             :  */
    2107     2679660 : static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
    2108             :                                         struct ext4_buddy *e4b)
    2109             : {
    2110     2679660 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    2111     2679660 :         int ret;
    2112             : 
    2113     2679660 :         BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
    2114     2679660 :         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
    2115             : 
    2116     2679660 :         ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
    2117     2679660 :         ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
    2118     2679660 :         ret = mb_mark_used(e4b, &ac->ac_b_ex);
    2119             : 
    2120             :         /* preallocation can change ac_b_ex, thus we store actually
    2121             :          * allocated blocks for history */
    2122     2679575 :         ac->ac_f_ex = ac->ac_b_ex;
    2123             : 
    2124     2679575 :         ac->ac_status = AC_STATUS_FOUND;
    2125     2679575 :         ac->ac_tail = ret & 0xffff;
    2126     2679575 :         ac->ac_buddy = ret >> 16;
    2127             : 
    2128             :         /*
    2129             :          * Take the page reference. We want the page to be pinned
    2130             :          * so that we don't get an ext4_mb_init_cache() call for this
    2131             :          * group until we update the bitmap; that would mean we
    2132             :          * double-allocate blocks. The reference is dropped
    2133             :          * in ext4_mb_release_context().
    2134             :          */
    2135     2679575 :         ac->ac_bitmap_page = e4b->bd_bitmap_page;
    2136     2679575 :         get_page(ac->ac_bitmap_page);
    2137     2679657 :         ac->ac_buddy_page = e4b->bd_buddy_page;
    2138     2679657 :         get_page(ac->ac_buddy_page);
    2139             :         /* store last allocated for subsequent stream allocation */
    2140     2679687 :         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
    2141     2036807 :                 spin_lock(&sbi->s_md_lock);
    2142     2036824 :                 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
    2143     2036824 :                 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
    2144     2036824 :                 spin_unlock(&sbi->s_md_lock);
    2145             :         }
    2146             :         /*
    2147             :          * As we've just preallocated more space than
    2148             :          * user requested originally, we store allocated
    2149             :          * space in a special descriptor.
    2150             :          */
    2151     2679688 :         if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
    2152      716958 :                 ext4_mb_new_preallocation(ac);
    2153             : 
    2154     2679677 : }
    2155             : 
    2156   160740519 : static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
    2157             :                                         struct ext4_buddy *e4b,
    2158             :                                         int finish_group)
    2159             : {
    2160   160740519 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    2161   160740519 :         struct ext4_free_extent *bex = &ac->ac_b_ex;
    2162   160740519 :         struct ext4_free_extent *gex = &ac->ac_g_ex;
    2163             : 
    2164   160740519 :         if (ac->ac_status == AC_STATUS_FOUND)
    2165             :                 return;
    2166             :         /*
    2167             :          * We don't want to scan for a whole year
    2168             :          */
    2169   159632007 :         if (ac->ac_found > sbi->s_mb_max_to_scan &&
    2170     1321092 :                         !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    2171     1321098 :                 ac->ac_status = AC_STATUS_BREAK;
    2172     1321098 :                 return;
    2173             :         }
    2174             : 
    2175             :         /*
    2176             :          * Haven't found good chunk so far, let's continue
    2177             :          */
    2178   158310909 :         if (bex->fe_len < gex->fe_len)
    2179             :                 return;
    2180             : 
    2181     6944925 :         if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
    2182      678089 :                 ext4_mb_use_best_found(ac, e4b);
    2183             : }
    2184             : 
    2185             : /*
    2186             :  * The routine checks whether found extent is good enough. If it is,
    2187             :  * then the extent gets marked used and flag is set to the context
    2188             :  * to stop scanning. Otherwise, the extent is compared with the
    2189             :  * previous found extent and if new one is better, then it's stored
    2190             :  * in the context. Later, the best found extent will be used, if
    2191             :  * mballoc can't find good enough extent.
    2192             :  *
    2193             :  * The algorithm used is roughly as follows:
    2194             :  *
    2195             :  * * If free extent found is exactly as big as goal, then
    2196             :  *   stop the scan and use it immediately
    2197             :  *
    2198             :  * * If the free extent found is smaller than goal, then keep retrying
    2199             :  *   up to a max of sbi->s_mb_max_to_scan times (default 200). After
    2200             :  *   that, stop scanning and use whatever we have.
    2201             :  *
    2202             :  * * If the free extent found is bigger than goal, then keep retrying
    2203             :  *   up to a max of sbi->s_mb_min_to_scan times (default 10) before
    2204             :  *   stopping the scan and using the extent.
    2205             :  *
    2206             :  *
    2207             :  * FIXME: real allocation policy is to be designed yet!
    2208             :  */
    2209   153782114 : static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
    2210             :                                         struct ext4_free_extent *ex,
    2211             :                                         struct ext4_buddy *e4b)
    2212             : {
    2213   153782114 :         struct ext4_free_extent *bex = &ac->ac_b_ex;
    2214   153782114 :         struct ext4_free_extent *gex = &ac->ac_g_ex;
    2215             : 
    2216   153782114 :         BUG_ON(ex->fe_len <= 0);
    2217   153782114 :         BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
    2218   153782114 :         BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
    2219   153782114 :         BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
    2220             : 
    2221   153782114 :         ac->ac_found++;
    2222   153782114 :         ac->ac_cX_found[ac->ac_criteria]++;
    2223             : 
    2224             :         /*
    2225             :          * The special case - take what you catch first
    2226             :          */
    2227   153824574 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    2228      136227 :                 *bex = *ex;
    2229      136227 :                 ext4_mb_use_best_found(ac, e4b);
    2230      136227 :                 return;
    2231             :         }
    2232             : 
    2233             :         /*
    2234             :          * Let's check whether the chunk is good enough
    2235             :          */
    2236   153688347 :         if (ex->fe_len == gex->fe_len) {
    2237      588039 :                 *bex = *ex;
    2238      588039 :                 ext4_mb_use_best_found(ac, e4b);
    2239      588039 :                 return;
    2240             :         }
    2241             : 
    2242             :         /*
    2243             :          * If this is first found extent, just store it in the context
    2244             :          */
    2245   153100308 :         if (bex->fe_len == 0) {
    2246     1784501 :                 *bex = *ex;
    2247     1784501 :                 return;
    2248             :         }
    2249             : 
    2250             :         /*
    2251             :          * If new found extent is better, store it in the context
    2252             :          */
    2253   151315807 :         if (bex->fe_len < gex->fe_len) {
    2254             :                 /* if the request isn't satisfied, any found extent
    2255             :                  * larger than previous best one is better */
    2256   144699726 :                 if (ex->fe_len > bex->fe_len)
    2257     3361865 :                         *bex = *ex;
    2258     6616081 :         } else if (ex->fe_len > gex->fe_len) {
    2259             :                 /* if the request is satisfied, then we try to find
    2260             :                  * an extent that still satisfies the request, but is
    2261             :                  * smaller than the previous one */
    2262     6578651 :                 if (ex->fe_len < bex->fe_len)
    2263      670887 :                         *bex = *ex;
    2264             :         }
    2265             : 
    2266   151315807 :         ext4_mb_check_limits(ac, e4b, 0);
    2267             : }
    2268             : 
    2269             : static noinline_for_stack
    2270      811570 : void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
    2271             :                                         struct ext4_buddy *e4b)
    2272             : {
    2273      811570 :         struct ext4_free_extent ex = ac->ac_b_ex;
    2274      811570 :         ext4_group_t group = ex.fe_group;
    2275      811570 :         int max;
    2276      811570 :         int err;
    2277             : 
    2278      811570 :         BUG_ON(ex.fe_len <= 0);
    2279      811570 :         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
    2280      811572 :         if (err)
    2281           0 :                 return;
    2282             : 
    2283      811572 :         ext4_lock_group(ac->ac_sb, group);
    2284      811598 :         max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
    2285             : 
    2286      811566 :         if (max > 0) {
    2287      675340 :                 ac->ac_b_ex = ex;
    2288      675340 :                 ext4_mb_use_best_found(ac, e4b);
    2289             :         }
    2290             : 
    2291      811556 :         ext4_unlock_group(ac->ac_sb, group);
    2292      811599 :         ext4_mb_unload_buddy(e4b);
    2293             : }
    2294             : 
    2295             : static noinline_for_stack
    2296     2685004 : int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
    2297             :                                 struct ext4_buddy *e4b)
    2298             : {
    2299     2685004 :         ext4_group_t group = ac->ac_g_ex.fe_group;
    2300     2685004 :         int max;
    2301     2685004 :         int err;
    2302     2685004 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    2303     2685004 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    2304     2685015 :         struct ext4_free_extent ex;
    2305             : 
    2306     2685015 :         if (!grp)
    2307             :                 return -EFSCORRUPTED;
    2308     2685015 :         if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
    2309             :                 return 0;
    2310     1330075 :         if (grp->bb_free == 0)
    2311             :                 return 0;
    2312             : 
    2313     1229232 :         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
    2314     1229281 :         if (err)
    2315             :                 return err;
    2316             : 
    2317     1229281 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
    2318           0 :                 ext4_mb_unload_buddy(e4b);
    2319           0 :                 return 0;
    2320             :         }
    2321             : 
    2322     1229281 :         ext4_lock_group(ac->ac_sb, group);
    2323     1229354 :         max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
    2324             :                              ac->ac_g_ex.fe_len, &ex);
    2325     1229358 :         ex.fe_logical = 0xDEADFA11; /* debug value */
    2326             : 
    2327     1229358 :         if (max >= ac->ac_g_ex.fe_len &&
    2328      238735 :             ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
    2329         229 :                 ext4_fsblk_t start;
    2330             : 
    2331         229 :                 start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
    2332             :                 /* use do_div to get remainder (would be 64-bit modulo) */
    2333         229 :                 if (do_div(start, sbi->s_stripe) == 0) {
    2334          96 :                         ac->ac_found++;
    2335          96 :                         ac->ac_b_ex = ex;
    2336          96 :                         ext4_mb_use_best_found(ac, e4b);
    2337             :                 }
    2338     1229129 :         } else if (max >= ac->ac_g_ex.fe_len) {
    2339      238506 :                 BUG_ON(ex.fe_len <= 0);
    2340      238506 :                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
    2341      238506 :                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
    2342      238506 :                 ac->ac_found++;
    2343      238506 :                 ac->ac_b_ex = ex;
    2344      238506 :                 ext4_mb_use_best_found(ac, e4b);
    2345      990623 :         } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
    2346             :                 /* Sometimes the caller may want to merge even a small
    2347             :                  * number of blocks into an existing extent */
    2348           0 :                 BUG_ON(ex.fe_len <= 0);
    2349           0 :                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
    2350           0 :                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
    2351           0 :                 ac->ac_found++;
    2352           0 :                 ac->ac_b_ex = ex;
    2353           0 :                 ext4_mb_use_best_found(ac, e4b);
    2354             :         }
    2355     1229358 :         ext4_unlock_group(ac->ac_sb, group);
    2356     1229420 :         ext4_mb_unload_buddy(e4b);
    2357             : 
    2358     1229420 :         return 0;
    2359             : }
    2360             : 
    2361             : /*
    2362             :  * The routine scans buddy structures (not bitmap!) from given order
    2363             :  * to max order and tries to find big enough chunk to satisfy the req
    2364             :  */
    2365             : static noinline_for_stack
    2366      350723 : void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
    2367             :                                         struct ext4_buddy *e4b)
    2368             : {
    2369      350723 :         struct super_block *sb = ac->ac_sb;
    2370      350723 :         struct ext4_group_info *grp = e4b->bd_info;
    2371      350723 :         void *buddy;
    2372      350723 :         int i;
    2373      350723 :         int k;
    2374      350723 :         int max;
    2375             : 
    2376      350723 :         BUG_ON(ac->ac_2order <= 0);
    2377      524913 :         for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
    2378      524913 :                 if (grp->bb_counters[i] == 0)
    2379      174190 :                         continue;
    2380             : 
    2381      350723 :                 buddy = mb_find_buddy(e4b, i, &max);
    2382      350723 :                 if (WARN_RATELIMIT(buddy == NULL,
    2383             :                          "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
    2384           0 :                         continue;
    2385             : 
    2386      350723 :                 k = mb_find_next_zero_bit(buddy, max, 0);
    2387      350723 :                 if (k >= max) {
    2388           0 :                         ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
    2389             :                                 "%d free clusters of order %d. But found 0",
    2390             :                                 grp->bb_counters[i], i);
    2391           0 :                         ext4_mark_group_bitmap_corrupted(ac->ac_sb,
    2392             :                                          e4b->bd_group,
    2393             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    2394           0 :                         break;
    2395             :                 }
    2396      350723 :                 ac->ac_found++;
    2397      350723 :                 ac->ac_cX_found[ac->ac_criteria]++;
    2398             : 
    2399      350723 :                 ac->ac_b_ex.fe_len = 1 << i;
    2400      350723 :                 ac->ac_b_ex.fe_start = k << i;
    2401      350723 :                 ac->ac_b_ex.fe_group = e4b->bd_group;
    2402             : 
    2403      350723 :                 ext4_mb_use_best_found(ac, e4b);
    2404             : 
    2405      350722 :                 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
    2406             : 
    2407      350722 :                 if (EXT4_SB(sb)->s_mb_stats)
    2408           0 :                         atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
    2409             : 
    2410             :                 break;
    2411             :         }
    2412      350722 : }
    2413             : 
    2414             : /*
    2415             :  * The routine scans the group and measures all found extents.
    2416             :  * To bound the scan, it uses the group's free cluster count
    2417             :  * (bd_info->bb_free) as an upper limit on what can be found.
    2418             :  */
    2419             : static noinline_for_stack
    2420     9416469 : void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
    2421             :                                         struct ext4_buddy *e4b)
    2422             : {
    2423     9416469 :         struct super_block *sb = ac->ac_sb;
    2424     9416469 :         void *bitmap = e4b->bd_bitmap;
    2425     9416469 :         struct ext4_free_extent ex;
    2426     9416469 :         int i, j, freelen;
    2427     9416469 :         int free;
    2428             : 
    2429     9416469 :         free = e4b->bd_info->bb_free;
    2430     9416469 :         if (WARN_ON(free <= 0))
    2431           0 :                 return;
    2432             : 
    2433     9416469 :         i = e4b->bd_info->bb_first_free;
    2434             : 
    2435   177364502 :         while (free && ac->ac_status == AC_STATUS_CONTINUE) {
    2436   167950114 :                 i = mb_find_next_zero_bit(bitmap,
    2437   167950114 :                                                 EXT4_CLUSTERS_PER_GROUP(sb), i);
    2438   168071339 :                 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
    2439             :                         /*
    2440             :                          * If we have a corrupt bitmap, we won't find any
    2441             :                          * free blocks even though group info says we
    2442             :                          * have free blocks.
    2443             :                          */
    2444           0 :                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
    2445             :                                         "%d free clusters as per "
    2446             :                                         "group info. But bitmap says 0",
    2447             :                                         free);
    2448           0 :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
    2449             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    2450           0 :                         break;
    2451             :                 }
    2452             : 
    2453   168071339 :                 if (ac->ac_criteria < CR_FAST) {
    2454             :                         /*
    2455             :                          * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
    2456             :                          * sure that this group will have a large enough
    2457             :                          * continuous free extent, so skip over the smaller free
    2458             :                          * extents
    2459             :                          */
    2460    21951288 :                         j = mb_find_next_bit(bitmap,
    2461             :                                                 EXT4_CLUSTERS_PER_GROUP(sb), i);
    2462    21950674 :                         freelen = j - i;
    2463             : 
    2464    21950674 :                         if (freelen < ac->ac_g_ex.fe_len) {
    2465    14079302 :                                 i = j;
    2466    14079302 :                                 free -= freelen;
    2467    14079302 :                                 continue;
    2468             :                         }
    2469             :                 }
    2470             : 
    2471   153991423 :                 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
    2472   153720019 :                 if (WARN_ON(ex.fe_len <= 0))
    2473             :                         break;
    2474   153720019 :                 if (free < ex.fe_len) {
    2475           0 :                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
    2476             :                                         "%d free clusters as per "
    2477             :                                         "group info. But got %d blocks",
    2478             :                                         free, ex.fe_len);
    2479           0 :                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
    2480             :                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    2481             :                         /*
    2482             :                          * The number of free blocks differs. This most likely
    2483             :                          * indicates that the bitmap is corrupt, so exit
    2484             :                          * without claiming the space.
    2485             :                          */
    2486           0 :                         break;
    2487             :                 }
    2488   153720019 :                 ex.fe_logical = 0xDEADC0DE; /* debug value */
    2489   153720019 :                 ext4_mb_measure_extent(ac, &ex, e4b);
    2490             : 
    2491   153868731 :                 i += ex.fe_len;
    2492   153868731 :                 free -= ex.fe_len;
    2493             :         }
    2494             : 
    2495     9414388 :         ext4_mb_check_limits(ac, e4b, 1);
    2496             : }
    2497             : 
    2498             : /*
    2499             :  * This is a special case for storage like RAID5:
    2500             :  * we try to find stripe-aligned chunks for stripe-size-multiple requests.
    2501             :  */
    2502             : static noinline_for_stack
    2503      151172 : void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
    2504             :                                  struct ext4_buddy *e4b)
    2505             : {
    2506      151172 :         struct super_block *sb = ac->ac_sb;
    2507      151172 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2508      151172 :         void *bitmap = e4b->bd_bitmap;
    2509      151172 :         struct ext4_free_extent ex;
    2510      151172 :         ext4_fsblk_t first_group_block;
    2511      151172 :         ext4_fsblk_t a;
    2512      151172 :         ext4_grpblk_t i, stripe;
    2513      151172 :         int max;
    2514             : 
    2515      151172 :         BUG_ON(sbi->s_stripe == 0);
    2516             : 
    2517             :         /* find first stripe-aligned block in group */
    2518      151172 :         first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
    2519             : 
    2520      151172 :         a = first_group_block + sbi->s_stripe - 1;
    2521      151172 :         do_div(a, sbi->s_stripe);
    2522      151172 :         i = (a * sbi->s_stripe) - first_group_block;
    2523             : 
    2524      151172 :         stripe = EXT4_B2C(sbi, sbi->s_stripe);
    2525      151172 :         i = EXT4_B2C(sbi, i);
    2526    18246095 :         while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
    2527    18107600 :                 if (!mb_test_bit(i, bitmap)) {
    2528      261121 :                         max = mb_find_extent(e4b, i, stripe, &ex);
    2529      261121 :                         if (max >= stripe) {
    2530       12677 :                                 ac->ac_found++;
    2531       12677 :                                 ac->ac_cX_found[ac->ac_criteria]++;
    2532       12677 :                                 ex.fe_logical = 0xDEADF00D; /* debug value */
    2533       12677 :                                 ac->ac_b_ex = ex;
    2534       12677 :                                 ext4_mb_use_best_found(ac, e4b);
    2535       12677 :                                 break;
    2536             :                         }
    2537             :                 }
    2538    18094923 :                 i += stripe;
    2539             :         }
    2540      151172 : }
    2541             : 
    2542             : /*
    2543             :  * This is also called BEFORE we load the buddy bitmap.
    2544             :  * Returns true if the group is suitable for the allocation and
    2545             :  * false otherwise.
    2546             :  */
    2547    35083170 : static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
    2548             :                                 ext4_group_t group, enum criteria cr)
    2549             : {
    2550    35083170 :         ext4_grpblk_t free, fragments;
    2551    35083170 :         int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
    2552    35078447 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    2553             : 
    2554    35115534 :         BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
    2555             : 
    2556    35115534 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
    2557             :                 return false;
    2558             : 
    2559    35115534 :         free = grp->bb_free;
    2560    35115534 :         if (free == 0)
    2561             :                 return false;
    2562             : 
    2563    25516619 :         fragments = grp->bb_fragments;
    2564    25516619 :         if (fragments == 0)
    2565             :                 return false;
    2566             : 
    2567    25516580 :         switch (cr) {
    2568     1168541 :         case CR_POWER2_ALIGNED:
    2569     1168541 :                 BUG_ON(ac->ac_2order == 0);
    2570             : 
    2571             :                 /* Avoid using the first bg of a flexgroup for data files */
    2572     1168541 :                 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
    2573     1167815 :                     (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
    2574     1167815 :                     ((group % flex_size) == 0))
    2575             :                         return false;
    2576             : 
    2577      996586 :                 if (free < ac->ac_g_ex.fe_len)
    2578             :                         return false;
    2579             : 
    2580      996581 :                 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
    2581             :                         return true;
    2582             : 
    2583      996581 :                 if (grp->bb_largest_free_order < ac->ac_2order)
    2584      293317 :                         return false;
    2585             : 
    2586             :                 return true;
    2587     7948914 :         case CR_GOAL_LEN_FAST:
    2588             :         case CR_BEST_AVAIL_LEN:
    2589     7948914 :                 if ((free / fragments) >= ac->ac_g_ex.fe_len)
    2590     2852884 :                         return true;
    2591             :                 break;
    2592     3641626 :         case CR_GOAL_LEN_SLOW:
    2593     3641626 :                 if (free >= ac->ac_g_ex.fe_len)
    2594     3639796 :                         return true;
    2595             :                 break;
    2596             :         case CR_ANY_FREE:
    2597             :                 return true;
    2598           0 :         default:
    2599           0 :                 BUG();
    2600             :         }
    2601             : 
    2602             :         return false;
    2603             : }
    2604             : 
    2605             : /*
    2606             :  * This could return negative error code if something goes wrong
    2607             :  * during ext4_mb_init_group(). This should not be called with
    2608             :  * ext4_lock_group() held.
    2609             :  *
    2610             :  * Note: because we are conditionally operating with the group lock in
    2611             :  * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
    2612             :  * function using __acquire and __release.  This means we need to be
    2613             :  * super careful before messing with the error path handling via "goto
    2614             :  * out"!
    2615             :  */
    2616   168821892 : static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
    2617             :                                      ext4_group_t group, enum criteria cr)
    2618             : {
    2619   168821892 :         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
    2620   169528355 :         struct super_block *sb = ac->ac_sb;
    2621   169528355 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    2622   169528355 :         bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
    2623   169528355 :         ext4_grpblk_t free;
    2624   169528355 :         int ret = 0;
    2625             : 
    2626   169528355 :         if (!grp)
    2627             :                 return -EFSCORRUPTED;
    2628   169528355 :         if (sbi->s_mb_stats)
    2629           0 :                 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
    2630   169528355 :         if (should_lock) {
    2631       67623 :                 ext4_lock_group(sb, group);
    2632   169528829 :                 __release(ext4_group_lock_ptr(sb, group));
    2633             :         }
    2634   169528829 :         free = grp->bb_free;
    2635   169528829 :         if (free == 0)
    2636   112750066 :                 goto out;
    2637    56778763 :         if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
    2638    42637842 :                 goto out;
    2639    14140921 :         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
    2640         268 :                 goto out;
    2641    14140653 :         if (should_lock) {
    2642          83 :                 __acquire(ext4_group_lock_ptr(sb, group));
    2643          83 :                 ext4_unlock_group(sb, group);
    2644             :         }
    2645             : 
    2646             :         /* We only do this if the grp has never been initialized */
    2647    14140653 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    2648        4601 :                 struct ext4_group_desc *gdp =
    2649        4601 :                         ext4_get_group_desc(sb, group, NULL);
    2650        4601 :                 int ret;
    2651             : 
    2652             :                 /*
    2653             :                  * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
    2654             :                  * search to find large good chunks almost for free. If buddy
    2655             :                  * data is not ready, then this optimization makes no sense. But
    2656             :                  * we never skip the first block group in a flex_bg, since this
    2657             :                  * gets used for metadata block allocation, and we want to make
    2658             :                  * sure we locate metadata blocks in the first block group in
    2659             :                  * the flex_bg if possible.
    2660             :                  */
    2661        4601 :                 if (cr < CR_FAST &&
    2662        4267 :                     (!sbi->s_log_groups_per_flex ||
    2663        5697 :                      ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
    2664        2687 :                     !(ext4_has_group_desc_csum(sb) &&
    2665        1443 :                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
    2666             :                         return 0;
    2667        2424 :                 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
    2668        2424 :                 if (ret)
    2669             :                         return ret;
    2670             :         }
    2671             : 
    2672    14138211 :         if (should_lock) {
    2673          83 :                 ext4_lock_group(sb, group);
    2674    14138211 :                 __release(ext4_group_lock_ptr(sb, group));
    2675             :         }
    2676    14138211 :         ret = ext4_mb_good_group(ac, group, cr);
    2677   169522639 : out:
    2678   169522639 :         if (should_lock) {
    2679       67731 :                 __acquire(ext4_group_lock_ptr(sb, group));
    2680       67731 :                 ext4_unlock_group(sb, group);
    2681             :         }
    2682             :         return ret;
    2683             : }
    2684             : 
    2685             : /*
    2686             :  * Start prefetching @nr block bitmaps starting at @group.
    2687             :  * Return the next group which needs to be prefetched.
                     :  *
                     :  * @sb:    superblock whose block groups are walked
                     :  * @group: first group to look at; the walk wraps back to group 0 once
                     :  *         it runs past the last group
                     :  * @nr:    number of consecutive groups to walk
                     :  * @cnt:   optional counter (may be NULL); bumped once for every bitmap
                     :  *         buffer that is not yet uptodate after the nowait read call
                     :  *
                     :  * All reads of one call are issued between blk_start_plug() and
                     :  * blk_finish_plug().  Failed or missing buffers are silently skipped.
    2688             :  */
    2689    15448382 : ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
    2690             :                               unsigned int nr, int *cnt)
    2691             : {
    2692    15448382 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    2693    15449331 :         struct buffer_head *bh;
    2694    15449331 :         struct blk_plug plug;
    2695             : 
    2696    15449331 :         blk_start_plug(&plug);
    2697   225185399 :         while (nr-- > 0) {
    2698   209738107 :                 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
    2699             :                                                                   NULL);
    2700   209067975 :                 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    2701             : 
    2702             :                 /*
    2703             :                  * Prefetch block groups with free blocks; but don't
    2704             :                  * bother if it is marked uninitialized on disk, since
    2705             :                  * it won't require I/O to read.  Also only try to
    2706             :                  * prefetch once, so we avoid getblk() call, which can
    2707             :                  * be expensive.
    2708             :                  */
    2709   208414200 :                 if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
    2710      314723 :                     EXT4_MB_GRP_NEED_INIT(grp) &&
    2711      156788 :                     ext4_free_group_clusters(sb, gdp) > 0 ) {
    2712      154939 :                         bh = ext4_read_block_bitmap_nowait(sb, group, true);
    2713      154938 :                         if (bh && !IS_ERR(bh)) {
    2714      309845 :                                 if (!buffer_uptodate(bh) && cnt)
    2715       69143 :                                         (*cnt)++;
    2716      154923 :                                 brelse(bh);
    2717             :                         }
    2718             :                 }
    2719   209736068 :                 if (++group >= ngroups)
    2720     1339136 :                         group = 0;
    2721             :         }
    2722    15448904 :         blk_finish_plug(&plug);
    2723    15449766 :         return group;
    2724             : }
    2725             : 
    2726             : /*
    2727             :  * Prefetching reads the block bitmap into the buffer cache; but we
    2728             :  * need to make sure that the buddy bitmap in the page cache has been
    2729             :  * initialized.  Note that ext4_mb_init_group() will block if the I/O
    2730             :  * is not yet completed, or indeed if ext4_mb_prefetch() did not
    2731             :  * start the I/O at all.
    2732             :  *
    2733             :  * TODO: We should actually kick off the buddy bitmap setup in a work
    2734             :  * queue when the buffer I/O is completed, so that we don't block
    2735             :  * waiting for the block allocation bitmap read to finish when
    2736             :  * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
                     :  *
                     :  * @sb:    superblock whose block groups are walked
                     :  * @group: group just past the range to finish; the walk goes backwards
                     :  *         from @group - 1 and wraps from group 0 to the last group
                     :  * @nr:    number of groups to walk
                     :  *
                     :  * Only groups that still need init and have free clusters are
                     :  * initialized.  The walk stops at the first ext4_mb_init_group()
                     :  * failure; the error itself is not reported to the caller.
    2737             :  */
    2738     2448467 : void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
    2739             :                            unsigned int nr)
    2740             : {
    2741     2448467 :         struct ext4_group_desc *gdp;
    2742     2448467 :         struct ext4_group_info *grp;
    2743             : 
    2744    32195759 :         while (nr-- > 0) {
    2745    29747152 :                 if (!group)
    2746      252580 :                         group = ext4_get_groups_count(sb);
    2747    29747152 :                 group--;
    2748    29747152 :                 gdp = ext4_get_group_desc(sb, group, NULL);
    2749    29747831 :                 grp = ext4_get_group_info(sb, group);
    2750             : 
    2751    59772814 :                 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
    2752      278216 :                     ext4_free_group_clusters(sb, gdp) > 0) {
    2753      149383 :                         if (ext4_mb_init_group(sb, group, GFP_NOFS))
    2754             :                                 break;
    2755             :                 }
    2756             :         }
    2757     2448608 : }
    2758             : /*
                     :  * Group-scanning allocator for the request described by @ac.
                     :  *
                     :  * The goal extent is tried first via ext4_mb_find_by_goal().  If that
                     :  * returns an error, finds the blocks, or EXT4_MB_HINT_GOAL_ONLY is set,
                     :  * no scan is done.  Otherwise the groups are scanned once per
                     :  * allocation criteria, from the starting criteria up to
                     :  * EXT4_MB_NUM_CRS, for as long as ac->ac_status stays
                     :  * AC_STATUS_CONTINUE.  Within a pass ext4_mb_choose_next_group() picks
                     :  * the next group and may also change the criteria, in which case the
                     :  * pass is restarted with the new criteria.
                     :  *
                     :  * If the scan ends with a best extent recorded but not allocated, and
                     :  * EXT4_MB_HINT_FIRST is not set, ext4_mb_try_best_found() is tried; if
                     :  * that loses the race, the scan is repeated once more at CR_ANY_FREE
                     :  * with EXT4_MB_HINT_FIRST set.
                     :  *
                     :  * Returns err as set by ext4_mb_find_by_goal() / ext4_mb_load_buddy().
                     :  * When err is 0 and nothing was found, the first non-zero value
                     :  * recorded from ext4_mb_good_group_nolock() is returned instead.
                     :  * Before returning, ext4_mb_prefetch_fini() is run for the last
                     :  * prefetch window if any prefetch was issued.
                     :  */
    2759             : static noinline_for_stack int
    2760     2685015 : ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
    2761             : {
    2762     2685015 :         ext4_group_t prefetch_grp = 0, ngroups, group, i;
    2763     2685015 :         enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
    2764     2685015 :         int err = 0, first_err = 0;
    2765     2685015 :         unsigned int nr = 0, prefetch_ios = 0;
    2766     2685015 :         struct ext4_sb_info *sbi;
    2767     2685015 :         struct super_block *sb;
    2768     2685015 :         struct ext4_buddy e4b;
    2769     2685015 :         int lost;
    2770             : 
    2771     2685015 :         sb = ac->ac_sb;
    2772     2685015 :         sbi = EXT4_SB(sb);
    2773     2685015 :         ngroups = ext4_get_groups_count(sb);
    2774             :         /* non-extent files are limited to low blocks/groups */
    2775     2684752 :         if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
    2776         690 :                 ngroups = sbi->s_blockfile_groups;
    2777             : 
    2778     2684752 :         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
    2779             : 
    2780             :         /* first, try the goal */
    2781     2684752 :         err = ext4_mb_find_by_goal(ac, &e4b);
    2782     2685007 :         if (err || ac->ac_status == AC_STATUS_FOUND)
    2783      238601 :                 goto out;
    2784             : 
    2785     2446406 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    2786           0 :                 goto out;
    2787             : 
    2788             :         /*
    2789             :          * ac->ac_2order is set only if the fe_len is a power of 2;
    2790             :          * if ac->ac_2order is set we also start at CR_POWER2_ALIGNED so
    2791             :          * that we try exact allocation using buddy.
    2792             :          */
    2793     2446406 :         i = fls(ac->ac_g_ex.fe_len);
    2794     2446406 :         ac->ac_2order = 0;
    2795             :         /*
    2796             :          * We search using buddy data only if the order of the request
    2797             :          * is greater than or equal to sbi->s_mb_order2_reqs.
    2798             :          * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
    2799             :          * We also support searching for power-of-two requests only for
    2800             :          * requests up to the maximum buddy size we have constructed.
    2801             :          */
    2802     2446406 :         if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
    2803             :                 /*
    2804             :                  * This tells whether fe_len is exactly a power of 2:
                     :                  * i - 1 is the index of its highest set bit, so
                     :                  * clearing that bit must leave zero.
    2805             :                  */
    2806     1846750 :                 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
    2807      493359 :                         ac->ac_2order = array_index_nospec(i - 1,
    2808             :                                                            MB_NUM_ORDERS(sb));
    2809             :         }
    2810             : 
    2811             :         /* if stream allocation is enabled, use global goal */
    2812     2446399 :         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
    2813             :                 /* TBD: may be hot point */
    2814     1803638 :                 spin_lock(&sbi->s_md_lock);
    2815     1804023 :                 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
    2816     1804023 :                 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
    2817     1804023 :                 spin_unlock(&sbi->s_md_lock);
    2818             :         }
    2819             : 
    2820             :         /*
    2821             :          * Let's just scan groups to find more or less suitable blocks. We
    2822             :          * start with CR_GOAL_LEN_FAST, unless it is power of 2
    2823             :          * aligned, in which case let's do that faster approach first.
    2824             :          */
    2825     2446758 :         if (ac->ac_2order)
    2826      493448 :                 cr = CR_POWER2_ALIGNED;
    2827     1953310 : repeat:
    2828    10751362 :         for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
    2829     4908277 :                 ac->ac_criteria = cr;
    2830             :                 /*
    2831             :                  * searching for the right group start
    2832             :                  * from the goal value specified
    2833             :                  */
    2834     4908277 :                 group = ac->ac_g_ex.fe_group;
    2835     4908277 :                 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
    2836     4908277 :                 prefetch_grp = group;
    2837             : 
    2838   170906260 :                 for (i = 0, new_cr = cr; i < ngroups; i++,
    2839   166708041 :                      ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
    2840   170073613 :                         int ret = 0;
    2841             : 
    2842   170073613 :                         cond_resched();
    2843   170351957 :                         if (new_cr != cr) {
    2844     1650514 :                                 cr = new_cr;
    2845     1650514 :                                 goto repeat;
    2846             :                         }
    2847             : 
    2848             :                         /*
    2849             :                          * Batch reads of the block allocation bitmaps
    2850             :                          * to get multiple READs in flight; limit
    2851             :                          * prefetching for cr < CR_FAST, otherwise mballoc
    2852             :                          * can spend a lot of time loading imperfect groups
    2853             :                          */
    2854   168701443 :                         if ((prefetch_grp == group) &&
    2855     3881213 :                             (cr >= CR_FAST ||
    2856     3881213 :                              prefetch_ios < sbi->s_mb_prefetch_limit)) {
    2857    15439392 :                                 nr = sbi->s_mb_prefetch;
    2858    15439392 :                                 if (ext4_has_feature_flex_bg(sb)) {
    2859    15438520 :                                         nr = 1 << sbi->s_log_groups_per_flex;
    2860    15438520 :                                         nr -= group & (nr - 1);
    2861    15438520 :                                         nr = min(nr, sbi->s_mb_prefetch);
    2862             :                                 }
    2863    15439392 :                                 prefetch_grp = ext4_mb_prefetch(sb, group,
    2864             :                                                         nr, &prefetch_ios);
    2865             :                         }
    2866             : 
    2867             :                         /* This now checks without needing the buddy page */
    2868   168708063 :                         ret = ext4_mb_good_group_nolock(ac, group, cr);
    2869   169126258 :                         if (ret <= 0) {
    2870   159211758 :                                 if (!first_err)
    2871   159231511 :                                         first_err = ret;
    2872   159211758 :                                 continue;
    2873             :                         }
    2874             : 
    2875     9914500 :                         err = ext4_mb_load_buddy(sb, group, &e4b);
    2876     9918663 :                         if (err)
    2877           0 :                                 goto out;
    2878             : 
    2879     9918663 :                         ext4_lock_group(sb, group);
    2880             : 
    2881             :                         /*
    2882             :                          * We need to check again after locking the
    2883             :                          * block group
    2884             :                          */
    2885     9922468 :                         ret = ext4_mb_good_group(ac, group, cr);
    2886     9917586 :                         if (ret == 0) {
    2887         866 :                                 ext4_unlock_group(sb, group);
    2888         866 :                                 ext4_mb_unload_buddy(&e4b);
    2889         866 :                                 continue;
    2890             :                         }
    2891             : 
    2892     9916720 :                         ac->ac_groups_scanned++;
    2893     9916720 :                         if (cr == CR_POWER2_ALIGNED)
    2894      350723 :                                 ext4_mb_simple_scan_group(ac, &e4b);
    2895     9565997 :                         else if ((cr == CR_GOAL_LEN_FAST ||
    2896     1348819 :                                  cr == CR_BEST_AVAIL_LEN) &&
    2897     1348819 :                                  sbi->s_stripe &&
    2898     1232181 :                                  !(ac->ac_g_ex.fe_len %
    2899     1232181 :                                  EXT4_B2C(sbi, sbi->s_stripe)))
    2900      151172 :                                 ext4_mb_scan_aligned(ac, &e4b);
    2901             :                         else
    2902     9414825 :                                 ext4_mb_complex_scan_group(ac, &e4b);
    2903             : 
    2904     9915687 :                         ext4_unlock_group(sb, group);
    2905     9924997 :                         ext4_mb_unload_buddy(&e4b);
    2906             : 
    2907     9921687 :                         if (ac->ac_status != AC_STATUS_CONTINUE)
    2908             :                                 break;
    2909             :                 }
    2910             :                 /* Processed all groups and haven't found blocks */
    2911     3258917 :                 if (sbi->s_mb_stats && i == ngroups)
    2912           0 :                         atomic64_inc(&sbi->s_bal_cX_failed[cr]);
    2913             : 
    2914     3258917 :                 if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
    2915             :                         /* Reset goal length to original goal length before
    2916             :                          * falling into CR_GOAL_LEN_SLOW */
    2917      117701 :                         ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
    2918             :         }
    2919             : 
    2920     2584168 :         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
    2921      812615 :             !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
    2922             :                 /*
    2923             :                  * We've been searching too long. Let's try to allocate
    2924             :                  * the best chunk we've found so far
    2925             :                  */
    2926      812615 :                 ext4_mb_try_best_found(ac, &e4b);
    2927      811581 :                 if (ac->ac_status != AC_STATUS_FOUND) {
    2928             :                         /*
    2929             :                          * Someone more lucky has already allocated it.
    2930             :                          * The only thing we can do is just take first
    2931             :                          * found block(s)
    2932             :                          */
    2933      136249 :                         lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
    2934      136256 :                         mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
    2935             :                                  ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
    2936             :                                  ac->ac_b_ex.fe_len, lost);
    2937             : 
    2938      136256 :                         ac->ac_b_ex.fe_group = 0;
    2939      136256 :                         ac->ac_b_ex.fe_start = 0;
    2940      136256 :                         ac->ac_b_ex.fe_len = 0;
    2941      136256 :                         ac->ac_status = AC_STATUS_CONTINUE;
    2942      136256 :                         ac->ac_flags |= EXT4_MB_HINT_FIRST;
    2943      136256 :                         cr = CR_ANY_FREE;
    2944      136256 :                         goto repeat;
    2945             :                 }
    2946             :         }
    2947             : 
    2948     2446886 :         if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
    2949           0 :                 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
    2950     2446886 : out:
    2951     2685487 :         if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
    2952           1 :                 err = first_err;
    2953             : 
    2954     2685487 :         mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
    2955             :                  ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
    2956             :                  ac->ac_flags, cr, err);
    2957             : 
    2958     2685487 :         if (nr)
    2959     2446876 :                 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
    2960             : 
    2961     2685612 :         return err;
    2962             : }
    2963             : /*
                     :  * seq_file ->start() callback of ext4_mb_seq_groups_ops.
                     :  *
                     :  * Returns NULL when *pos is outside [0, groups count); otherwise
                     :  * returns the group number plus one, cast to a pointer.  The +1 offset
                     :  * is undone again in ext4_mb_seq_groups_show(); presumably it exists so
                     :  * that group 0 does not become a NULL iterator value.
                     :  */
    2964          47 : static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
    2965             : {
    2966          47 :         struct super_block *sb = pde_data(file_inode(seq->file));
    2967          47 :         ext4_group_t group;
    2968             : 
    2969          94 :         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
    2970           0 :                 return NULL;
    2971          47 :         group = *pos + 1;
    2972          47 :         return (void *) ((unsigned long) group);
    2973             : }
    2974             : /*
                     :  * seq_file ->next() callback of ext4_mb_seq_groups_ops.
                     :  *
                     :  * Advances *pos by one and applies the same range check and encoding
                     :  * as ext4_mb_seq_groups_start(): NULL when the new position is out of
                     :  * range, otherwise the group number plus one cast to a pointer.  @v
                     :  * (the previous iterator value) is not used.
                     :  */
    2975        1615 : static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
    2976             : {
    2977        1615 :         struct super_block *sb = pde_data(file_inode(seq->file));
    2978        1615 :         ext4_group_t group;
    2979             : 
    2980        1615 :         ++*pos;
    2981        3230 :         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
    2982           0 :                 return NULL;
    2983        1615 :         group = *pos + 1;
    2984        1615 :         return (void *) ((unsigned long) group);
    2985             : }
    2986             : /*
                     :  * seq_file ->show() callback of ext4_mb_seq_groups_ops: print one line
                     :  * of per-group allocator state.
                     :  *
                     :  * @v carries the group number plus one (see the start/next callbacks);
                     :  * it is decremented here.  For group 0 a column header is printed
                     :  * first.  The group info and its leading bb_counters[] entries are
                     :  * copied into an on-stack snapshot, then free, fragments, first free
                     :  * and 14 per-order counters (2^0 .. 2^13) are printed; orders above
                     :  * blocksize_bits + 1 are printed as 0.
                     :  *
                     :  * If the group still needs init, the buddy is loaded around the copy.
                     :  * A load failure is reported as an "I/O error" line in the output, not
                     :  * as a return value.  Always returns 0; nothing is printed when no
                     :  * group info exists.
                     :  */
    2987        1662 : static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
    2988             : {
    2989        1662 :         struct super_block *sb = pde_data(file_inode(seq->file));
    2990        1662 :         ext4_group_t group = (ext4_group_t) ((unsigned long) v);
    2991        1662 :         int i;
    2992        1662 :         int err, buddy_loaded = 0;
    2993        1662 :         struct ext4_buddy e4b;
    2994        1662 :         struct ext4_group_info *grinfo;
    2995        1662 :         unsigned char blocksize_bits = min_t(unsigned char,
    2996             :                                              sb->s_blocksize_bits,
    2997             :                                              EXT4_MAX_BLOCK_LOG_SIZE);
    2998        1662 :         struct sg {
    2999             :                 struct ext4_group_info info;
    3000             :                 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
    3001             :         } sg;
    3002             : 
    3003        1662 :         group--;
    3004        1662 :         if (group == 0)
    3005          30 :                 seq_puts(seq, "#group: free  frags first ["
    3006             :                               " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
    3007             :                               " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
    3008             : 
    3009        1662 :         i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
    3010             :                 sizeof(struct ext4_group_info);
    3011             : 
    3012        1662 :         grinfo = ext4_get_group_info(sb, group);
    3013        1662 :         if (!grinfo)
    3014             :                 return 0;
    3015             :         /* Load the group info in memory only if not already loaded. */
    3016        1662 :         if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
    3017        1645 :                 err = ext4_mb_load_buddy(sb, group, &e4b);
    3018        1645 :                 if (err) {
    3019           0 :                         seq_printf(seq, "#%-5u: I/O error\n", group);
    3020           0 :                         return 0;
    3021             :                 }
    3022             :                 buddy_loaded = 1;
    3023             :         }
    3024             : 
    3025        1662 :         memcpy(&sg, grinfo, i);
    3026             : 
    3027        1662 :         if (buddy_loaded)
    3028        1645 :                 ext4_mb_unload_buddy(&e4b);
    3029             : 
    3030        1662 :         seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
    3031             :                         sg.info.bb_fragments, sg.info.bb_first_free);
    3032       26592 :         for (i = 0; i <= 13; i++)
    3033       23268 :                 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
    3034             :                                 sg.info.bb_counters[i] : 0);
    3035        1662 :         seq_puts(seq, " ]\n");
    3036             : 
    3037        1662 :         return 0;
    3038             : }
    3039             : /*
                     :  * seq_file ->stop() callback of ext4_mb_seq_groups_ops.  Intentionally
                     :  * empty: the start/next callbacks visible here acquire nothing that
                     :  * would have to be released.
                     :  */
    3040          47 : static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
    3041             : {
    3042          47 : }
    3043             : /*
                     :  * seq_file iterator that emits one line per block group; the iterator
                     :  * value passed between the callbacks is the group number plus one.
                     :  */
    3044             : const struct seq_operations ext4_mb_seq_groups_ops = {
    3045             :         .start  = ext4_mb_seq_groups_start,
    3046             :         .next   = ext4_mb_seq_groups_next,
    3047             :         .stop   = ext4_mb_seq_groups_stop,
    3048             :         .show   = ext4_mb_seq_groups_show,
    3049             : };
    3050             : /*
                     :  * Print the mballoc statistics counters kept in the ext4_sb_info of
                     :  * the superblock stored in seq->private.
                     :  *
                     :  * When sbi->s_mb_stats is zero only a short notice explaining how to
                     :  * enable collection is printed.  Otherwise the global counters are
                     :  * followed by one section per allocation criteria (hits, groups
                     :  * considered, extents scanned, useless loops and, for the first three
                     :  * criteria, bad suggestions) and by the aggregate counters.
                     :  *
                     :  * @offset is not used.  Always returns 0.
                     :  */
    3051           0 : int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
    3052             : {
    3053           0 :         struct super_block *sb = seq->private;
    3054           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3055             : 
    3056           0 :         seq_puts(seq, "mballoc:\n");
    3057           0 :         if (!sbi->s_mb_stats) {
    3058           0 :                 seq_puts(seq, "\tmb stats collection turned off.\n");
    3059           0 :                 seq_puts(
    3060             :                         seq,
    3061             :                         "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
    3062           0 :                 return 0;
    3063             :         }
    3064           0 :         seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
    3065           0 :         seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
    3066             : 
    3067           0 :         seq_printf(seq, "\tgroups_scanned: %u\n",
    3068             :                    atomic_read(&sbi->s_bal_groups_scanned));
    3069             : 
    3070             :         /* CR_POWER2_ALIGNED stats */
    3071           0 :         seq_puts(seq, "\tcr_p2_aligned_stats:\n");
    3072           0 :         seq_printf(seq, "\t\thits: %llu\n",
    3073             :                    atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
    3074           0 :         seq_printf(
    3075             :                 seq, "\t\tgroups_considered: %llu\n",
    3076             :                 atomic64_read(
    3077             :                         &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
    3078           0 :         seq_printf(seq, "\t\textents_scanned: %u\n",
    3079             :                    atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
    3080           0 :         seq_printf(seq, "\t\tuseless_loops: %llu\n",
    3081             :                    atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
    3082           0 :         seq_printf(seq, "\t\tbad_suggestions: %u\n",
    3083             :                    atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
    3084             : 
    3085             :         /* CR_GOAL_LEN_FAST stats */
    3086           0 :         seq_puts(seq, "\tcr_goal_fast_stats:\n");
    3087           0 :         seq_printf(seq, "\t\thits: %llu\n",
    3088             :                    atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
    3089           0 :         seq_printf(seq, "\t\tgroups_considered: %llu\n",
    3090             :                    atomic64_read(
    3091             :                            &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
    3092           0 :         seq_printf(seq, "\t\textents_scanned: %u\n",
    3093             :                    atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
    3094           0 :         seq_printf(seq, "\t\tuseless_loops: %llu\n",
    3095             :                    atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
    3096           0 :         seq_printf(seq, "\t\tbad_suggestions: %u\n",
    3097             :                    atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
    3098             : 
    3099             :         /* CR_BEST_AVAIL_LEN stats */
    3100           0 :         seq_puts(seq, "\tcr_best_avail_stats:\n");
    3101           0 :         seq_printf(seq, "\t\thits: %llu\n",
    3102             :                    atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
    3103           0 :         seq_printf(
    3104             :                 seq, "\t\tgroups_considered: %llu\n",
    3105             :                 atomic64_read(
    3106             :                         &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
    3107           0 :         seq_printf(seq, "\t\textents_scanned: %u\n",
    3108             :                    atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
    3109           0 :         seq_printf(seq, "\t\tuseless_loops: %llu\n",
    3110             :                    atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
    3111           0 :         seq_printf(seq, "\t\tbad_suggestions: %u\n",
    3112             :                    atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
    3113             : 
    3114             :         /* CR_GOAL_LEN_SLOW stats */
    3115           0 :         seq_puts(seq, "\tcr_goal_slow_stats:\n");
    3116           0 :         seq_printf(seq, "\t\thits: %llu\n",
    3117             :                    atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
    3118           0 :         seq_printf(seq, "\t\tgroups_considered: %llu\n",
    3119             :                    atomic64_read(
    3120             :                            &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
    3121           0 :         seq_printf(seq, "\t\textents_scanned: %u\n",
    3122             :                    atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
    3123           0 :         seq_printf(seq, "\t\tuseless_loops: %llu\n",
    3124             :                    atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
    3125             : 
    3126             :         /* CR_ANY_FREE stats */
    3127           0 :         seq_puts(seq, "\tcr_any_free_stats:\n");
    3128           0 :         seq_printf(seq, "\t\thits: %llu\n",
    3129             :                    atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
    3130           0 :         seq_printf(
    3131             :                 seq, "\t\tgroups_considered: %llu\n",
    3132             :                 atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
    3133           0 :         seq_printf(seq, "\t\textents_scanned: %u\n",
    3134             :                    atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
    3135           0 :         seq_printf(seq, "\t\tuseless_loops: %llu\n",
    3136             :                    atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
    3137             : 
    3138             :         /* Aggregates */
    3139           0 :         seq_printf(seq, "\textents_scanned: %u\n",
    3140             :                    atomic_read(&sbi->s_bal_ex_scanned));
    3141           0 :         seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
    3142           0 :         seq_printf(seq, "\t\tlen_goal_hits: %u\n",
    3143             :                    atomic_read(&sbi->s_bal_len_goals));
    3144           0 :         seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
    3145           0 :         seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
    3146           0 :         seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
    3147           0 :         seq_printf(seq, "\tbuddies_generated: %u/%u\n",
    3148             :                    atomic_read(&sbi->s_mb_buddies_generated),
    3149             :                    ext4_get_groups_count(sb));
    3150           0 :         seq_printf(seq, "\tbuddies_time_used: %llu\n",
    3151             :                    atomic64_read(&sbi->s_mb_generation_time));
    3152           0 :         seq_printf(seq, "\tpreallocated: %u\n",
    3153             :                    atomic_read(&sbi->s_mb_preallocated));
    3154           0 :         seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
    3155           0 :         return 0;
    3156             : }
    3157             : 
    3158           0 : static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
    3159             : __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
    3160             : {
    3161           0 :         struct super_block *sb = pde_data(file_inode(seq->file));
    3162           0 :         unsigned long position;
    3163             : 
    3164           0 :         if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
    3165             :                 return NULL;
    3166           0 :         position = *pos + 1;
    3167           0 :         return (void *) ((unsigned long) position);
    3168             : }
    3169             : 
    3170           0 : static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
    3171             : {
    3172           0 :         struct super_block *sb = pde_data(file_inode(seq->file));
    3173           0 :         unsigned long position;
    3174             : 
    3175           0 :         ++*pos;
    3176           0 :         if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
    3177             :                 return NULL;
    3178           0 :         position = *pos + 1;
    3179           0 :         return (void *) ((unsigned long) position);
    3180             : }
    3181             : 
    3182           0 : static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
    3183             : {
    3184           0 :         struct super_block *sb = pde_data(file_inode(seq->file));
    3185           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3186           0 :         unsigned long position = ((unsigned long) v);
    3187           0 :         struct ext4_group_info *grp;
    3188           0 :         unsigned int count;
    3189             : 
    3190           0 :         position--;
    3191           0 :         if (position >= MB_NUM_ORDERS(sb)) {
    3192           0 :                 position -= MB_NUM_ORDERS(sb);
    3193           0 :                 if (position == 0)
    3194           0 :                         seq_puts(seq, "avg_fragment_size_lists:\n");
    3195             : 
    3196           0 :                 count = 0;
    3197           0 :                 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
    3198           0 :                 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
    3199             :                                     bb_avg_fragment_size_node)
    3200           0 :                         count++;
    3201           0 :                 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
    3202           0 :                 seq_printf(seq, "\tlist_order_%u_groups: %u\n",
    3203             :                                         (unsigned int)position, count);
    3204           0 :                 return 0;
    3205             :         }
    3206             : 
    3207           0 :         if (position == 0) {
    3208           0 :                 seq_printf(seq, "optimize_scan: %d\n",
    3209           0 :                            test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
    3210           0 :                 seq_puts(seq, "max_free_order_lists:\n");
    3211             :         }
    3212           0 :         count = 0;
    3213           0 :         read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
    3214           0 :         list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
    3215             :                             bb_largest_free_order_node)
    3216           0 :                 count++;
    3217           0 :         read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
    3218           0 :         seq_printf(seq, "\tlist_order_%u_groups: %u\n",
    3219             :                    (unsigned int)position, count);
    3220             : 
    3221           0 :         return 0;
    3222             : }
    3223             : 
    /*
     * seq_file ->stop callback.  Intentionally empty: the iterator cookie is a
     * plain integer and ->show drops every lock it takes before returning.
     */
    static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
    {
    }
    3227             : 
    3228             : const struct seq_operations ext4_mb_seq_structs_summary_ops = {
    3229             :         .start  = ext4_mb_seq_structs_summary_start,
    3230             :         .next   = ext4_mb_seq_structs_summary_next,
    3231             :         .stop   = ext4_mb_seq_structs_summary_stop,
    3232             :         .show   = ext4_mb_seq_structs_summary_show,
    3233             : };
    3234             : 
    3235             : static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
    3236             : {
    3237      859087 :         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
    3238     1718174 :         struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
    3239             : 
    3240      859087 :         BUG_ON(!cachep);
    3241      859087 :         return cachep;
    3242             : }
    3243             : 
    3244             : /*
    3245             :  * Allocate the top-level s_group_info array for the specified number
    3246             :  * of groups
    3247             :  */
    3248        2561 : int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
    3249             : {
    3250        2561 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3251        2561 :         unsigned size;
    3252        2561 :         struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
    3253             : 
    3254           0 :         size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
    3255        2561 :                 EXT4_DESC_PER_BLOCK_BITS(sb);
    3256        2561 :         if (size <= sbi->s_group_info_size)
    3257             :                 return 0;
    3258             : 
    3259        2547 :         size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
    3260        2547 :         new_groupinfo = kvzalloc(size, GFP_KERNEL);
    3261        2547 :         if (!new_groupinfo) {
    3262           0 :                 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
    3263           0 :                 return -ENOMEM;
    3264             :         }
    3265        2547 :         rcu_read_lock();
    3266        2547 :         old_groupinfo = rcu_dereference(sbi->s_group_info);
    3267        2547 :         if (old_groupinfo)
    3268          22 :                 memcpy(new_groupinfo, old_groupinfo,
    3269             :                        sbi->s_group_info_size * sizeof(*sbi->s_group_info));
    3270        2547 :         rcu_read_unlock();
    3271        2547 :         rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
    3272        2547 :         sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
    3273        2547 :         if (old_groupinfo)
    3274          11 :                 ext4_kvfree_array_rcu(old_groupinfo);
    3275             :         ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
    3276             :                    sbi->s_group_info_size);
    3277             :         return 0;
    3278             : }
    3279             : 
    3280             : /* Create and initialize ext4_group_info data for the given group. */
    3281      856551 : int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
    3282             :                           struct ext4_group_desc *desc)
    3283             : {
    3284      856551 :         int i;
    3285      856551 :         int metalen = 0;
    3286      856551 :         int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
    3287      856551 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3288      856551 :         struct ext4_group_info **meta_group_info;
    3289      856551 :         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    3290             : 
    3291             :         /*
    3292             :          * First check if this group is the first of a reserved block.
    3293             :          * If it's true, we have to allocate a new table of pointers
    3294             :          * to ext4_group_info structures
    3295             :          */
    3296      856551 :         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
    3297       16459 :                 metalen = sizeof(*meta_group_info) <<
    3298       16459 :                         EXT4_DESC_PER_BLOCK_BITS(sb);
    3299       16459 :                 meta_group_info = kmalloc(metalen, GFP_NOFS);
    3300       16459 :                 if (meta_group_info == NULL) {
    3301           0 :                         ext4_msg(sb, KERN_ERR, "can't allocate mem "
    3302             :                                  "for a buddy group");
    3303           0 :                         return -ENOMEM;
    3304             :                 }
    3305       16459 :                 rcu_read_lock();
    3306       16459 :                 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
    3307       16459 :                 rcu_read_unlock();
    3308             :         }
    3309             : 
    3310      856551 :         meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
    3311      856551 :         i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
    3312             : 
    3313      856551 :         meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
    3314      856551 :         if (meta_group_info[i] == NULL) {
    3315           0 :                 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
    3316           0 :                 goto exit_group_info;
    3317             :         }
    3318      856551 :         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
    3319      856551 :                 &(meta_group_info[i]->bb_state));
    3320             : 
    3321             :         /*
    3322             :          * initialize bb_free to be able to skip
    3323             :          * empty groups without initialization
    3324             :          */
    3325      856551 :         if (ext4_has_group_desc_csum(sb) &&
    3326      853736 :             (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    3327     1390588 :                 meta_group_info[i]->bb_free =
    3328      695294 :                         ext4_free_clusters_after_init(sb, group, desc);
    3329             :         } else {
    3330      322514 :                 meta_group_info[i]->bb_free =
    3331      161257 :                         ext4_free_group_clusters(sb, desc);
    3332             :         }
    3333             : 
    3334      856551 :         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
    3335      856551 :         init_rwsem(&meta_group_info[i]->alloc_sem);
    3336      856551 :         meta_group_info[i]->bb_free_root = RB_ROOT;
    3337      856551 :         INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
    3338      856551 :         INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
    3339      856551 :         meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
    3340      856551 :         meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
    3341      856551 :         meta_group_info[i]->bb_group = group;
    3342             : 
    3343      856551 :         mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
    3344      856551 :         return 0;
    3345             : 
    3346             : exit_group_info:
    3347             :         /* If a meta_group_info table has been allocated, release it now */
    3348           0 :         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
    3349           0 :                 struct ext4_group_info ***group_info;
    3350             : 
    3351           0 :                 rcu_read_lock();
    3352           0 :                 group_info = rcu_dereference(sbi->s_group_info);
    3353           0 :                 kfree(group_info[idx]);
    3354           0 :                 group_info[idx] = NULL;
    3355           0 :                 rcu_read_unlock();
    3356             :         }
    3357             :         return -ENOMEM;
    3358             : } /* ext4_mb_add_groupinfo */
    3359             : 
    3360        2536 : static int ext4_mb_init_backend(struct super_block *sb)
    3361             : {
    3362        2536 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    3363        2536 :         ext4_group_t i;
    3364        2536 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3365        2536 :         int err;
    3366        2536 :         struct ext4_group_desc *desc;
    3367        2536 :         struct ext4_group_info ***group_info;
    3368        2536 :         struct kmem_cache *cachep;
    3369             : 
    3370        2536 :         err = ext4_mb_alloc_groupinfo(sb, ngroups);
    3371        2536 :         if (err)
    3372             :                 return err;
    3373             : 
    3374        2536 :         sbi->s_buddy_cache = new_inode(sb);
    3375        2536 :         if (sbi->s_buddy_cache == NULL) {
    3376           0 :                 ext4_msg(sb, KERN_ERR, "can't get new inode");
    3377           0 :                 goto err_freesgi;
    3378             :         }
    3379             :         /* To avoid potentially colliding with an valid on-disk inode number,
    3380             :          * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
    3381             :          * not in the inode hash, so it should never be found by iget(), but
    3382             :          * this will avoid confusion if it ever shows up during debugging. */
    3383        2536 :         sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
    3384        2536 :         EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
    3385      854586 :         for (i = 0; i < ngroups; i++) {
    3386      852050 :                 cond_resched();
    3387      852050 :                 desc = ext4_get_group_desc(sb, i, NULL);
    3388      852050 :                 if (desc == NULL) {
    3389           0 :                         ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
    3390           0 :                         goto err_freebuddy;
    3391             :                 }
    3392      852050 :                 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
    3393           0 :                         goto err_freebuddy;
    3394             :         }
    3395             : 
    3396        2536 :         if (ext4_has_feature_flex_bg(sb)) {
    3397             :                 /* a single flex group is supposed to be read by a single IO.
    3398             :                  * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
    3399             :                  * unsigned integer, so the maximum shift is 32.
    3400             :                  */
    3401        2345 :                 if (sbi->s_es->s_log_groups_per_flex >= 32) {
    3402           0 :                         ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
    3403           0 :                         goto err_freebuddy;
    3404             :                 }
    3405        2345 :                 sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
    3406             :                         BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
    3407        2345 :                 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
    3408             :         } else {
    3409         191 :                 sbi->s_mb_prefetch = 32;
    3410             :         }
    3411        2536 :         if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
    3412         701 :                 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
    3413             :         /* now many real IOs to prefetch within a single allocation at cr=0
    3414             :          * given cr=0 is an CPU-related optimization we shouldn't try to
    3415             :          * load too many groups, at some point we should start to use what
    3416             :          * we've got in memory.
    3417             :          * with an average random access time 5ms, it'd take a second to get
    3418             :          * 200 groups (* N with flex_bg), so let's make this limit 4
    3419             :          */
    3420        2536 :         sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
    3421        2536 :         if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
    3422        2518 :                 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
    3423             : 
    3424             :         return 0;
    3425             : 
    3426           0 : err_freebuddy:
    3427           0 :         cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    3428           0 :         while (i-- > 0) {
    3429           0 :                 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
    3430             : 
    3431           0 :                 if (grp)
    3432           0 :                         kmem_cache_free(cachep, grp);
    3433             :         }
    3434           0 :         i = sbi->s_group_info_size;
    3435           0 :         rcu_read_lock();
    3436           0 :         group_info = rcu_dereference(sbi->s_group_info);
    3437           0 :         while (i-- > 0)
    3438           0 :                 kfree(group_info[i]);
    3439           0 :         rcu_read_unlock();
    3440           0 :         iput(sbi->s_buddy_cache);
    3441           0 : err_freesgi:
    3442           0 :         rcu_read_lock();
    3443           0 :         kvfree(rcu_dereference(sbi->s_group_info));
    3444           0 :         rcu_read_unlock();
    3445           0 :         return -ENOMEM;
    3446             : }
    3447             : 
    3448           0 : static void ext4_groupinfo_destroy_slabs(void)
    3449             : {
    3450           0 :         int i;
    3451             : 
    3452           0 :         for (i = 0; i < NR_GRPINFO_CACHES; i++) {
    3453           0 :                 kmem_cache_destroy(ext4_groupinfo_caches[i]);
    3454           0 :                 ext4_groupinfo_caches[i] = NULL;
    3455             :         }
    3456           0 : }
    3457             : 
    3458        2536 : static int ext4_groupinfo_create_slab(size_t size)
    3459             : {
    3460        2536 :         static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
    3461        2536 :         int slab_size;
    3462        2536 :         int blocksize_bits = order_base_2(size);
    3463        2536 :         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
    3464        2536 :         struct kmem_cache *cachep;
    3465             : 
    3466        2536 :         if (cache_index >= NR_GRPINFO_CACHES)
    3467             :                 return -EINVAL;
    3468             : 
    3469        2536 :         if (unlikely(cache_index < 0))
    3470           0 :                 cache_index = 0;
    3471             : 
    3472        2536 :         mutex_lock(&ext4_grpinfo_slab_create_mutex);
    3473        2536 :         if (ext4_groupinfo_caches[cache_index]) {
    3474        2533 :                 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
    3475        2533 :                 return 0;       /* Already created */
    3476             :         }
    3477             : 
    3478           3 :         slab_size = offsetof(struct ext4_group_info,
    3479             :                                 bb_counters[blocksize_bits + 2]);
    3480             : 
    3481           3 :         cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
    3482             :                                         slab_size, 0, SLAB_RECLAIM_ACCOUNT,
    3483             :                                         NULL);
    3484             : 
    3485           3 :         ext4_groupinfo_caches[cache_index] = cachep;
    3486             : 
    3487           3 :         mutex_unlock(&ext4_grpinfo_slab_create_mutex);
    3488           3 :         if (!cachep) {
    3489           0 :                 printk(KERN_EMERG
    3490             :                        "EXT4-fs: no memory for groupinfo slab cache\n");
    3491           0 :                 return -ENOMEM;
    3492             :         }
    3493             : 
    3494             :         return 0;
    3495             : }
    3496             : 
    3497           0 : static void ext4_discard_work(struct work_struct *work)
    3498             : {
    3499           0 :         struct ext4_sb_info *sbi = container_of(work,
    3500             :                         struct ext4_sb_info, s_discard_work);
    3501           0 :         struct super_block *sb = sbi->s_sb;
    3502           0 :         struct ext4_free_data *fd, *nfd;
    3503           0 :         struct ext4_buddy e4b;
    3504           0 :         struct list_head discard_list;
    3505           0 :         ext4_group_t grp, load_grp;
    3506           0 :         int err = 0;
    3507             : 
    3508           0 :         INIT_LIST_HEAD(&discard_list);
    3509           0 :         spin_lock(&sbi->s_md_lock);
    3510           0 :         list_splice_init(&sbi->s_discard_list, &discard_list);
    3511           0 :         spin_unlock(&sbi->s_md_lock);
    3512             : 
    3513           0 :         load_grp = UINT_MAX;
    3514           0 :         list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
    3515             :                 /*
    3516             :                  * If filesystem is umounting or no memory or suffering
    3517             :                  * from no space, give up the discard
    3518             :                  */
    3519           0 :                 if ((sb->s_flags & SB_ACTIVE) && !err &&
    3520             :                     !atomic_read(&sbi->s_retry_alloc_pending)) {
    3521           0 :                         grp = fd->efd_group;
    3522           0 :                         if (grp != load_grp) {
    3523           0 :                                 if (load_grp != UINT_MAX)
    3524           0 :                                         ext4_mb_unload_buddy(&e4b);
    3525             : 
    3526           0 :                                 err = ext4_mb_load_buddy(sb, grp, &e4b);
    3527           0 :                                 if (err) {
    3528           0 :                                         kmem_cache_free(ext4_free_data_cachep, fd);
    3529           0 :                                         load_grp = UINT_MAX;
    3530           0 :                                         continue;
    3531             :                                 } else {
    3532             :                                         load_grp = grp;
    3533             :                                 }
    3534             :                         }
    3535             : 
    3536           0 :                         ext4_lock_group(sb, grp);
    3537           0 :                         ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
    3538           0 :                                                 fd->efd_start_cluster + fd->efd_count - 1, 1);
    3539           0 :                         ext4_unlock_group(sb, grp);
    3540             :                 }
    3541           0 :                 kmem_cache_free(ext4_free_data_cachep, fd);
    3542             :         }
    3543             : 
    3544           0 :         if (load_grp != UINT_MAX)
    3545           0 :                 ext4_mb_unload_buddy(&e4b);
    3546           0 : }
    3547             : 
    3548        2536 : int ext4_mb_init(struct super_block *sb)
    3549             : {
    3550        2536 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3551        2536 :         unsigned i, j;
    3552        2536 :         unsigned offset, offset_incr;
    3553        2536 :         unsigned max;
    3554        2536 :         int ret;
    3555             : 
    3556        2536 :         i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
    3557             : 
    3558        2536 :         sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
    3559        2536 :         if (sbi->s_mb_offsets == NULL) {
    3560           0 :                 ret = -ENOMEM;
    3561           0 :                 goto out;
    3562             :         }
    3563             : 
    3564        2536 :         i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
    3565        2536 :         sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
    3566        2536 :         if (sbi->s_mb_maxs == NULL) {
    3567           0 :                 ret = -ENOMEM;
    3568           0 :                 goto out;
    3569             :         }
    3570             : 
    3571        2536 :         ret = ext4_groupinfo_create_slab(sb->s_blocksize);
    3572        2536 :         if (ret < 0)
    3573           0 :                 goto out;
    3574             : 
    3575             :         /* order 0 is regular bitmap */
    3576        2536 :         sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
    3577        2536 :         sbi->s_mb_offsets[0] = 0;
    3578             : 
    3579        2536 :         i = 1;
    3580        2536 :         offset = 0;
    3581        2536 :         offset_incr = 1 << (sb->s_blocksize_bits - 1);
    3582        2536 :         max = sb->s_blocksize << 2;
    3583       32953 :         do {
    3584       32953 :                 sbi->s_mb_offsets[i] = offset;
    3585       32953 :                 sbi->s_mb_maxs[i] = max;
    3586       32953 :                 offset += offset_incr;
    3587       32953 :                 offset_incr = offset_incr >> 1;
    3588       32953 :                 max = max >> 1;
    3589       32953 :                 i++;
    3590       32953 :         } while (i < MB_NUM_ORDERS(sb));
    3591             : 
    3592        5072 :         sbi->s_mb_avg_fragment_size =
    3593        2536 :                 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
    3594             :                         GFP_KERNEL);
    3595        2536 :         if (!sbi->s_mb_avg_fragment_size) {
    3596           0 :                 ret = -ENOMEM;
    3597           0 :                 goto out;
    3598             :         }
    3599        5072 :         sbi->s_mb_avg_fragment_size_locks =
    3600        2536 :                 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
    3601             :                         GFP_KERNEL);
    3602        2536 :         if (!sbi->s_mb_avg_fragment_size_locks) {
    3603           0 :                 ret = -ENOMEM;
    3604           0 :                 goto out;
    3605             :         }
    3606       38025 :         for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
    3607       35489 :                 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
    3608       35489 :                 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
    3609             :         }
    3610        5072 :         sbi->s_mb_largest_free_orders =
    3611        2536 :                 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
    3612             :                         GFP_KERNEL);
    3613        2536 :         if (!sbi->s_mb_largest_free_orders) {
    3614           0 :                 ret = -ENOMEM;
    3615           0 :                 goto out;
    3616             :         }
    3617        5072 :         sbi->s_mb_largest_free_orders_locks =
    3618        2536 :                 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
    3619             :                         GFP_KERNEL);
    3620        2536 :         if (!sbi->s_mb_largest_free_orders_locks) {
    3621           0 :                 ret = -ENOMEM;
    3622           0 :                 goto out;
    3623             :         }
    3624       38025 :         for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
    3625       35489 :                 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
    3626       35489 :                 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
    3627             :         }
    3628             : 
    3629        2536 :         spin_lock_init(&sbi->s_md_lock);
    3630        2536 :         sbi->s_mb_free_pending = 0;
    3631        2536 :         INIT_LIST_HEAD(&sbi->s_freed_data_list);
    3632        2536 :         INIT_LIST_HEAD(&sbi->s_discard_list);
    3633        2536 :         INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
    3634        2536 :         atomic_set(&sbi->s_retry_alloc_pending, 0);
    3635             : 
    3636        2536 :         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
    3637        2536 :         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
    3638        2536 :         sbi->s_mb_stats = MB_DEFAULT_STATS;
    3639        2536 :         sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
    3640        2536 :         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
    3641        2536 :         sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
    3642             : 
    3643             :         /*
    3644             :          * The default group preallocation is 512, which for 4k block
    3645             :          * sizes translates to 2 megabytes.  However for bigalloc file
    3646             :          * systems, this is probably too big (i.e, if the cluster size
    3647             :          * is 1 megabyte, then group preallocation size becomes half a
    3648             :          * gigabyte!).  As a default, we will keep a two megabyte
    3649             :          * group pralloc size for cluster sizes up to 64k, and after
    3650             :          * that, we will force a minimum group preallocation size of
    3651             :          * 32 clusters.  This translates to 8 megs when the cluster
    3652             :          * size is 256k, and 32 megs when the cluster size is 1 meg,
    3653             :          * which seems reasonable as a default.
    3654             :          */
    3655        2536 :         sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
    3656             :                                        sbi->s_cluster_bits, 32);
    3657             :         /*
    3658             :          * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
    3659             :          * to the lowest multiple of s_stripe which is bigger than
    3660             :          * the s_mb_group_prealloc as determined above. We want
    3661             :          * the preallocation size to be an exact multiple of the
    3662             :          * RAID stripe size so that preallocations don't fragment
    3663             :          * the stripes.
    3664             :          */
    3665        2536 :         if (sbi->s_stripe > 1) {
    3666        2493 :                 sbi->s_mb_group_prealloc = roundup(
    3667             :                         sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
    3668             :         }
    3669             : 
    3670        2536 :         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
    3671        2536 :         if (sbi->s_locality_groups == NULL) {
    3672           0 :                 ret = -ENOMEM;
    3673           0 :                 goto out;
    3674             :         }
    3675       12680 :         for_each_possible_cpu(i) {
    3676       10144 :                 struct ext4_locality_group *lg;
    3677       10144 :                 lg = per_cpu_ptr(sbi->s_locality_groups, i);
    3678       10144 :                 mutex_init(&lg->lg_mutex);
    3679      121728 :                 for (j = 0; j < PREALLOC_TB_SIZE; j++)
    3680      101440 :                         INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
    3681       10144 :                 spin_lock_init(&lg->lg_prealloc_lock);
    3682             :         }
    3683             : 
    3684        2536 :         if (bdev_nonrot(sb->s_bdev))
    3685           6 :                 sbi->s_mb_max_linear_groups = 0;
    3686             :         else
    3687        2530 :                 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
    3688             :         /* init file for buddy data */
    3689        2536 :         ret = ext4_mb_init_backend(sb);
    3690        2536 :         if (ret != 0)
    3691           0 :                 goto out_free_locality_groups;
    3692             : 
    3693             :         return 0;
    3694             : 
    3695             : out_free_locality_groups:
    3696           0 :         free_percpu(sbi->s_locality_groups);
    3697           0 :         sbi->s_locality_groups = NULL;
    3698           0 : out:
    3699           0 :         kfree(sbi->s_mb_avg_fragment_size);
    3700           0 :         kfree(sbi->s_mb_avg_fragment_size_locks);
    3701           0 :         kfree(sbi->s_mb_largest_free_orders);
    3702           0 :         kfree(sbi->s_mb_largest_free_orders_locks);
    3703           0 :         kfree(sbi->s_mb_offsets);
    3704           0 :         sbi->s_mb_offsets = NULL;
    3705           0 :         kfree(sbi->s_mb_maxs);
    3706           0 :         sbi->s_mb_maxs = NULL;
    3707           0 :         return ret;
    3708             : }
    3709             : 
    3710             : /* Needs to be called with the ext4 group lock held. Frees every PA still linked on grp->bb_prealloc_list and returns how many were freed. */
    3711      856551 : static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
    3712             : {
    3713      856551 :         struct ext4_prealloc_space *pa;
    3714      856551 :         struct list_head *cur, *tmp;
    3715      856551 :         int count = 0;
    3716             : 
    3717      857324 :         list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { /* _safe walk: each entry is unlinked and freed inside the loop */
    3718         773 :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
    3719         773 :                 list_del(&pa->pa_group_list);
    3720         773 :                 count++;
    3721         773 :                 kmem_cache_free(ext4_pspace_cachep, pa); /* the PA goes back to ext4_pspace_cachep */
    3722             :         }
    3723      856551 :         return count;
    3724             : }
    3725             : 
    3726        2536 : int ext4_mb_release(struct super_block *sb) /* Frees the mballoc state reachable from sb's ext4_sb_info (group infos, per-order arrays, buddy cache inode, locality groups); always returns 0. */
    3727             : {
    3728        2536 :         ext4_group_t ngroups = ext4_get_groups_count(sb);
    3729        2536 :         ext4_group_t i;
    3730        2536 :         int num_meta_group_infos;
    3731        2536 :         struct ext4_group_info *grinfo, ***group_info;
    3732        2536 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3733        2536 :         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
    3734        2536 :         int count;
    3735             : 
    3736        2536 :         if (test_opt(sb, DISCARD)) {
    3737             :                 /*
    3738             :                  * wait for the discard work to drain all of ext4_free_data
    3739             :                  */
    3740           8 :                 flush_work(&sbi->s_discard_work);
    3741           8 :                 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
    3742             :         }
    3743             : 
    3744        2536 :         if (sbi->s_group_info) {
    3745      859087 :                 for (i = 0; i < ngroups; i++) { /* first pass: release each group's info structure */
    3746      856551 :                         cond_resched();
    3747      856551 :                         grinfo = ext4_get_group_info(sb, i);
    3748      856551 :                         if (!grinfo)
    3749           0 :                                 continue;
    3750      856551 :                         mb_group_bb_bitmap_free(grinfo);
    3751      856551 :                         ext4_lock_group(sb, i); /* ext4_mb_cleanup_pa() is documented as needing the group lock */
    3752      856551 :                         count = ext4_mb_cleanup_pa(grinfo);
    3753      856551 :                         if (count)
    3754             :                                 mb_debug(sb, "mballoc: %d PAs left\n",
    3755             :                                          count);
    3756      856551 :                         ext4_unlock_group(sb, i);
    3757      856551 :                         kmem_cache_free(cachep, grinfo);
    3758             :                 }
    3759        2536 :                 num_meta_group_infos = (ngroups + /* ngroups / EXT4_DESC_PER_BLOCK rounded up, assuming the _BITS macro is its log2 */
    3760           0 :                                 EXT4_DESC_PER_BLOCK(sb) - 1) >>
    3761        2536 :                         EXT4_DESC_PER_BLOCK_BITS(sb);
    3762        2536 :                 rcu_read_lock();
    3763        2536 :                 group_info = rcu_dereference(sbi->s_group_info);
    3764       18995 :                 for (i = 0; i < num_meta_group_infos; i++) /* second pass: free each second-level array, then the top-level table */
    3765       16459 :                         kfree(group_info[i]);
    3766        2536 :                 kvfree(group_info);
    3767        2536 :                 rcu_read_unlock();
    3768             :         }
    3769        2536 :         kfree(sbi->s_mb_avg_fragment_size);
    3770        2536 :         kfree(sbi->s_mb_avg_fragment_size_locks);
    3771        2536 :         kfree(sbi->s_mb_largest_free_orders);
    3772        2536 :         kfree(sbi->s_mb_largest_free_orders_locks);
    3773        2536 :         kfree(sbi->s_mb_offsets);
    3774        2536 :         kfree(sbi->s_mb_maxs);
    3775        2536 :         iput(sbi->s_buddy_cache);
    3776        2536 :         if (sbi->s_mb_stats) { /* log the accumulated allocator statistics when s_mb_stats is set */
    3777           0 :                 ext4_msg(sb, KERN_INFO,
    3778             :                        "mballoc: %u blocks %u reqs (%u success)",
    3779             :                                 atomic_read(&sbi->s_bal_allocated),
    3780             :                                 atomic_read(&sbi->s_bal_reqs),
    3781             :                                 atomic_read(&sbi->s_bal_success));
    3782           0 :                 ext4_msg(sb, KERN_INFO,
    3783             :                       "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
    3784             :                                 "%u 2^N hits, %u breaks, %u lost",
    3785             :                                 atomic_read(&sbi->s_bal_ex_scanned),
    3786             :                                 atomic_read(&sbi->s_bal_groups_scanned),
    3787             :                                 atomic_read(&sbi->s_bal_goals),
    3788             :                                 atomic_read(&sbi->s_bal_2orders),
    3789             :                                 atomic_read(&sbi->s_bal_breaks),
    3790             :                                 atomic_read(&sbi->s_mb_lost_chunks));
    3791           0 :                 ext4_msg(sb, KERN_INFO,
    3792             :                        "mballoc: %u generated and it took %llu",
    3793             :                                 atomic_read(&sbi->s_mb_buddies_generated),
    3794             :                                 atomic64_read(&sbi->s_mb_generation_time));
    3795           0 :                 ext4_msg(sb, KERN_INFO,
    3796             :                        "mballoc: %u preallocated, %u discarded",
    3797             :                                 atomic_read(&sbi->s_mb_preallocated),
    3798             :                                 atomic_read(&sbi->s_mb_discarded));
    3799             :         }
    3800             : 
    3801        2536 :         free_percpu(sbi->s_locality_groups);
    3802             : 
    3803        2536 :         return 0;
    3804             : }
    3805             : 
    3806       17545 : static inline int ext4_issue_discard(struct super_block *sb, /* Discards 'count' clusters starting at 'cluster' inside block_group; returns whatever the underlying discard call returns. */
    3807             :                 ext4_group_t block_group, ext4_grpblk_t cluster, int count,
    3808             :                 struct bio **biop)
    3809             : {
    3810       17545 :         ext4_fsblk_t discard_block;
    3811             : 
    3812       17545 :         discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + /* absolute fs block: cluster offset converted to blocks plus the group's first block */
    3813             :                          ext4_group_first_block_no(sb, block_group));
    3814       17545 :         count = EXT4_C2B(EXT4_SB(sb), count); /* from here on 'count' is in blocks, not clusters */
    3815       17545 :         trace_ext4_discard_blocks(sb,
    3816             :                         (unsigned long long) discard_block, count);
    3817       17545 :         if (biop) { /* caller supplied a bio pointer: pass it to __blkdev_issue_discard() */
    3818           0 :                 return __blkdev_issue_discard(sb->s_bdev,
    3819           0 :                         (sector_t)discard_block << (sb->s_blocksize_bits - 9), /* fs blocks -> 512-byte units */
    3820           0 :                         (sector_t)count << (sb->s_blocksize_bits - 9),
    3821             :                         GFP_NOFS, biop);
    3822             :         } else
    3823       17545 :                 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
    3824             : }
    3825             : 
    3826     1664427 : static void ext4_free_data_in_buddy(struct super_block *sb, /* Gives the extent described by 'entry' back to its group's buddy data and drops it from the pending-free accounting. */
    3827             :                                     struct ext4_free_data *entry)
    3828             : {
    3829     1664427 :         struct ext4_buddy e4b;
    3830     1664427 :         struct ext4_group_info *db;
    3831     1664427 :         int err, count = 0;
    3832             : 
    3833     1664427 :         mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
    3834             :                  entry->efd_count, entry->efd_group, entry);
    3835             : 
    3836     1664427 :         err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
    3837             :         /* we expect to find existing buddy because it's pinned */
    3838     1664427 :         BUG_ON(err != 0);
    3839             : 
    3840     1664427 :         spin_lock(&EXT4_SB(sb)->s_md_lock); /* s_md_lock is held around the s_mb_free_pending update */
    3841     1664427 :         EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
    3842     1664427 :         spin_unlock(&EXT4_SB(sb)->s_md_lock);
    3843             : 
    3844     1664427 :         db = e4b.bd_info;
    3845             :         /* there are blocks to put in buddy to make them really free */
    3846     1664427 :         count += entry->efd_count; /* 'count' is only used by the debug message at the end */
    3847     1664427 :         ext4_lock_group(sb, entry->efd_group);
    3848             :         /* Take it out of per group rb tree */
    3849     1664427 :         rb_erase(&entry->efd_node, &(db->bb_free_root));
    3850     1664427 :         mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
    3851             : 
    3852             :         /*
    3853             :          * Clear the trimmed flag for the group so that the next
    3854             :          * ext4_trim_fs can trim it.
    3855             :          * If the volume is mounted with -o discard, online discard
    3856             :          * is supported and the free blocks will be trimmed online.
    3857             :          */
    3858     1664427 :         if (!test_opt(sb, DISCARD))
    3859     1664427 :                 EXT4_MB_GRP_CLEAR_TRIMMED(db);
    3860             : 
    3861     1664427 :         if (!db->bb_free_root.rb_node) {
    3862             :                 /* No more items in the per group rb tree;
    3863             :                  * balance refcounts from ext4_mb_free_metadata()
    3864             :                  */
    3865      212339 :                 put_page(e4b.bd_buddy_page);
    3866      212339 :                 put_page(e4b.bd_bitmap_page);
    3867             :         }
    3868     1664427 :         ext4_unlock_group(sb, entry->efd_group);
    3869     1664427 :         ext4_mb_unload_buddy(&e4b);
    3870             : 
    3871     1664427 :         mb_debug(sb, "freed %d blocks in 1 structures\n", count);
    3872     1664427 : }
    3873             : 
    3874             : /*
    3875             :  * This function is called by the jbd2 layer once the commit has finished,
    3876             :  * so we know we can free the blocks that were released with that commit.
    3877             :  */
    3878      206991 : void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
    3879             : {
    3880      206991 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    3881      206991 :         struct ext4_free_data *entry, *tmp;
    3882      206991 :         struct list_head freed_data_list;
    3883      206991 :         struct list_head *cut_pos = NULL;
    3884      206991 :         bool wake;
    3885             : 
    3886      206991 :         INIT_LIST_HEAD(&freed_data_list);
    3887             : 
    3888      206991 :         spin_lock(&sbi->s_md_lock);
    3889     1871418 :         list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { /* find the end of the leading run of entries whose tid equals commit_tid */
    3890     1673997 :                 if (entry->efd_tid != commit_tid)
    3891             :                         break;
    3892     1664427 :                 cut_pos = &entry->efd_list;
    3893             :         }
    3894      206991 :         if (cut_pos) /* move that leading run onto the local list so it can be processed without s_md_lock */
    3895      136905 :                 list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
    3896             :                                   cut_pos);
    3897      206991 :         spin_unlock(&sbi->s_md_lock);
    3898             : 
    3899     1871418 :         list_for_each_entry(entry, &freed_data_list, efd_list)
    3900     1664427 :                 ext4_free_data_in_buddy(sb, entry);
    3901             : 
    3902      206991 :         if (test_opt(sb, DISCARD)) { /* DISCARD set: splice the entries onto s_discard_list instead of freeing them here */
    3903           0 :                 spin_lock(&sbi->s_md_lock);
    3904           0 :                 wake = list_empty(&sbi->s_discard_list); /* queue the work only if the list was empty before this splice */
    3905           0 :                 list_splice_tail(&freed_data_list, &sbi->s_discard_list);
    3906           0 :                 spin_unlock(&sbi->s_md_lock);
    3907           0 :                 if (wake)
    3908           0 :                         queue_work(system_unbound_wq, &sbi->s_discard_work);
    3909             :         } else {
    3910     1871418 :                 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) /* _safe walk: each entry is freed while iterating */
    3911     1664427 :                         kmem_cache_free(ext4_free_data_cachep, entry);
    3912             :         }
    3913      206991 : }
    3914             : 
    3915          12 : int __init ext4_init_mballoc(void) /* Creates the three mballoc slab caches; returns 0 on success, or -ENOMEM after destroying any cache already created. */
    3916             : {
    3917          12 :         ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
    3918             :                                         SLAB_RECLAIM_ACCOUNT);
    3919          12 :         if (ext4_pspace_cachep == NULL)
    3920           0 :                 goto out;
    3921             : 
    3922          12 :         ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
    3923             :                                     SLAB_RECLAIM_ACCOUNT);
    3924          12 :         if (ext4_ac_cachep == NULL)
    3925           0 :                 goto out_pa_free;
    3926             : 
    3927          12 :         ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
    3928             :                                            SLAB_RECLAIM_ACCOUNT);
    3929          12 :         if (ext4_free_data_cachep == NULL)
    3930           0 :                 goto out_ac_free;
    3931             : 
    3932             :         return 0;
    3933             : 
    3934             : out_ac_free: /* unwind in reverse order of creation; each label falls through to the next */
    3935           0 :         kmem_cache_destroy(ext4_ac_cachep);
    3936           0 : out_pa_free:
    3937           0 :         kmem_cache_destroy(ext4_pspace_cachep);
    3938             : out:
    3939             :         return -ENOMEM;
    3940             : }
    3941             : 
    3942           0 : void ext4_exit_mballoc(void) /* Destroys the three mballoc slab caches and then the group-info slabs. */
    3943             : {
    3944             :         /*
    3945             :          * Wait for completion of call_rcu()'s on ext4_pspace_cachep
    3946             :          * before destroying the slab cache.
    3947             :          */
    3948           0 :         rcu_barrier();
    3949           0 :         kmem_cache_destroy(ext4_pspace_cachep);
    3950           0 :         kmem_cache_destroy(ext4_ac_cachep);
    3951           0 :         kmem_cache_destroy(ext4_free_data_cachep);
    3952           0 :         ext4_groupinfo_destroy_slabs();
    3953           0 : }
    3954             : 
    3955             : 
    3956             : /*
    3957             :  * Mark the chosen space (ac->ac_b_ex) non-free in the block bitmap and update the group descriptor and the free/dirty cluster counters. NOTE(review): the old text said "Check quota", but no quota check is visible in this function.
    3958             :  * Returns 0 on success or an error code
    3959             :  */
    3960             : static noinline_for_stack int
    3961     3619061 : ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
    3962             :                                 handle_t *handle, unsigned int reserv_clstrs)
    3963             : {
    3964     3619061 :         struct buffer_head *bitmap_bh = NULL;
    3965     3619061 :         struct ext4_group_desc *gdp;
    3966     3619061 :         struct buffer_head *gdp_bh;
    3967     3619061 :         struct ext4_sb_info *sbi;
    3968     3619061 :         struct super_block *sb;
    3969     3619061 :         ext4_fsblk_t block;
    3970     3619061 :         int err, len;
    3971             : 
    3972     3619061 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    3973     3619061 :         BUG_ON(ac->ac_b_ex.fe_len <= 0);
    3974             : 
    3975     3619061 :         sb = ac->ac_sb;
    3976     3619061 :         sbi = EXT4_SB(sb);
    3977             : 
    3978     3619061 :         bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
    3979     3619141 :         if (IS_ERR(bitmap_bh)) {
    3980           0 :                 return PTR_ERR(bitmap_bh);
    3981             :         }
    3982             : 
    3983     3619141 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    3984     3619141 :         err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
    3985             :                                             EXT4_JTR_NONE);
    3986     3619851 :         if (err)
    3987           0 :                 goto out_err;
    3988             : 
    3989     3619851 :         err = -EIO; /* returned if the group descriptor lookup below fails */
    3990     3619851 :         gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
    3991     3619899 :         if (!gdp)
    3992           0 :                 goto out_err;
    3993             : 
    3994     3619899 :         ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
    3995             :                         ext4_free_group_clusters(sb, gdp));
    3996             : 
    3997     3619899 :         BUFFER_TRACE(gdp_bh, "get_write_access");
    3998     3619899 :         err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
    3999     3619879 :         if (err)
    4000           0 :                 goto out_err;
    4001             : 
    4002     3619879 :         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    4003             : 
    4004     3619693 :         len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); /* extent length in blocks, used for the validity check */
    4005     3619693 :         if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
    4006           0 :                 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
    4007             :                            "fs metadata", block, block+len);
    4008             :                 /* File system mounted not to panic on error
    4009             :                  * Fix the bitmap and return EFSCORRUPTED
    4010             :                  * We leak some of the blocks here.
    4011             :                  */
    4012           0 :                 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
    4013           0 :                 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
    4014             :                               ac->ac_b_ex.fe_len);
    4015           0 :                 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
    4016           0 :                 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    4017           0 :                 if (!err) /* keep a dirty-metadata failure if there was one, otherwise report the corruption */
    4018           0 :                         err = -EFSCORRUPTED;
    4019           0 :                 goto out_err;
    4020             :         }
    4021             : 
    4022     3619683 :         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
    4023             : #ifdef AGGRESSIVE_CHECK
    4024             :         {
    4025             :                 int i;
    4026             :                 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
    4027             :                         BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
    4028             :                                                 bitmap_bh->b_data));
    4029             :                 }
    4030             :         }
    4031             : #endif
    4032     3620016 :         mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
    4033             :                       ac->ac_b_ex.fe_len);
    4034     3619947 :         if (ext4_has_group_desc_csum(sb) &&
    4035     3616123 :             (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { /* first allocation in a BLOCK_UNINIT group: clear the flag and seed the free count */
    4036        3980 :                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
    4037        3980 :                 ext4_free_group_clusters_set(sb, gdp,
    4038             :                                              ext4_free_clusters_after_init(sb,
    4039             :                                                 ac->ac_b_ex.fe_group, gdp));
    4040             :         }
    4041     3619821 :         len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; /* 'len' is reused here as the group's new free-cluster count */
    4042     3619951 :         ext4_free_group_clusters_set(sb, gdp, len);
    4043     3619794 :         ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
    4044     3619289 :         ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
    4045             : 
    4046     3619660 :         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
    4047     3619864 :         percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
    4048             :         /*
    4049             :          * Now reduce the dirty block count also. Should not go negative
    4050             :          */
    4051     3619878 :         if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
    4052             :                 /* release all the reserved blocks if non delalloc */
    4053     2732966 :                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
    4054             :                                    reserv_clstrs);
    4055             : 
    4056     3619863 :         if (sbi->s_log_groups_per_flex) {
    4057     3615605 :                 ext4_group_t flex_group = ext4_flex_group(sbi,
    4058             :                                                           ac->ac_b_ex.fe_group);
    4059     7231113 :                 atomic64_sub(ac->ac_b_ex.fe_len,
    4060     3615549 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    4061             :                                                   flex_group)->free_clusters);
    4062             :         }
    4063             : 
    4064     3619836 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    4065     3619834 :         if (err)
    4066           0 :                 goto out_err;
    4067     3619834 :         err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
    4068             : 
    4069     3619890 : out_err: /* reached on success as well as on every failure after the bitmap was read */
    4070     3619890 :         brelse(bitmap_bh);
    4071             :         return err;
    4072             : }
    4073             : 
    4074             : /*
    4075             :  * Idempotent helper for Ext4 fast commit replay path to set the state of
    4076             :  * blocks in bitmaps and update counters. A non-zero 'state' marks [block, block + len) in use, zero marks it free; errors are not reported to the caller.
    4077             :  */
    4078           0 : void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
    4079             :                         int len, int state)
    4080             : {
    4081           0 :         struct buffer_head *bitmap_bh = NULL;
    4082           0 :         struct ext4_group_desc *gdp;
    4083           0 :         struct buffer_head *gdp_bh;
    4084           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    4085           0 :         ext4_group_t group;
    4086           0 :         ext4_grpblk_t blkoff;
    4087           0 :         int i, err = 0; /* must start at 0: len <= 0 or a first-iteration break on the validity check reaches the final 'if (err)' without assigning it */
    4088           0 :         int already;
    4089           0 :         unsigned int clen, clen_changed, thisgrp_len;
    4090             : 
    4091           0 :         while (len > 0) {
    4092           0 :                 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
    4093             : 
    4094             :                 /*
    4095             :                  * Check to see if we are marking blocks across a group
    4096             :                  * boundary.
    4097             :                  * In case of flex_bg, this can happen that (block, len) may
    4098             :                  * span across more than one group. In that case we need to
    4099             :                  * get the corresponding group metadata to work with.
    4100             :                  * For this the enclosing while loop handles one group per pass.
    4101             :                  */
    4102           0 :                 thisgrp_len = min_t(unsigned int, (unsigned int)len,
    4103             :                         EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
    4104           0 :                 clen = EXT4_NUM_B2C(sbi, thisgrp_len);
    4105             : 
    4106           0 :                 if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
    4107           0 :                         ext4_error(sb, "Marking blocks in system zone - "
    4108             :                                    "Block = %llu, len = %u",
    4109             :                                    block, thisgrp_len);
    4110           0 :                         bitmap_bh = NULL; /* nothing is held at this point; make sure the exit path releases nothing */
    4111           0 :                         break;
    4112             :                 }
    4113             : 
    4114           0 :                 bitmap_bh = ext4_read_block_bitmap(sb, group);
    4115           0 :                 if (IS_ERR(bitmap_bh)) {
    4116           0 :                         err = PTR_ERR(bitmap_bh);
    4117           0 :                         bitmap_bh = NULL;
    4118           0 :                         break;
    4119             :                 }
    4120             : 
    4121           0 :                 err = -EIO; /* set in case the group descriptor lookup below fails */
    4122           0 :                 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
    4123           0 :                 if (!gdp)
    4124             :                         break;
    4125             : 
    4126           0 :                 ext4_lock_group(sb, group);
    4127           0 :                 already = 0;
    4128           0 :                 for (i = 0; i < clen; i++) /* count the clusters whose bit already matches the requested state */
    4129           0 :                         if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
    4130             :                                          !state)
    4131           0 :                                 already++;
    4132             : 
    4133           0 :                 clen_changed = clen - already; /* only clusters that actually flip affect the counters */
    4134           0 :                 if (state)
    4135           0 :                         mb_set_bits(bitmap_bh->b_data, blkoff, clen);
    4136             :                 else
    4137           0 :                         mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
    4138           0 :                 if (ext4_has_group_desc_csum(sb) &&
    4139           0 :                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    4140           0 :                         gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
    4141           0 :                         ext4_free_group_clusters_set(sb, gdp,
    4142             :                              ext4_free_clusters_after_init(sb, group, gdp));
    4143             :                 }
    4144           0 :                 if (state) /* from here 'clen' is reused as the group's new free-cluster count */
    4145           0 :                         clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
    4146             :                 else
    4147           0 :                         clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
    4148             : 
    4149           0 :                 ext4_free_group_clusters_set(sb, gdp, clen);
    4150           0 :                 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
    4151           0 :                 ext4_group_desc_csum_set(sb, group, gdp);
    4152             : 
    4153           0 :                 ext4_unlock_group(sb, group);
    4154             : 
    4155           0 :                 if (sbi->s_log_groups_per_flex) {
    4156           0 :                         ext4_group_t flex_group = ext4_flex_group(sbi, group);
    4157           0 :                         struct flex_groups *fg = sbi_array_rcu_deref(sbi,
    4158             :                                                    s_flex_groups, flex_group);
    4159             : 
    4160           0 :                         if (state)
    4161           0 :                                 atomic64_sub(clen_changed, &fg->free_clusters);
    4162             :                         else
    4163           0 :                                 atomic64_add(clen_changed, &fg->free_clusters);
    4164             : 
    4165             :                 }
    4166             : 
    4167           0 :                 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
    4168           0 :                 if (err)
    4169             :                         break;
    4170           0 :                 sync_dirty_buffer(bitmap_bh);
    4171           0 :                 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
    4172           0 :                 sync_dirty_buffer(gdp_bh);
    4173           0 :                 if (err)
    4174             :                         break;
    4175             : 
    4176           0 :                 block += thisgrp_len;
    4177           0 :                 len -= thisgrp_len;
    4178           0 :                 brelse(bitmap_bh);
    4179           0 :                 BUG_ON(len < 0);
    4180             :         }
    4181             : 
    4182           0 :         if (err) /* a successful pass already released its bitmap buffer inside the loop */
    4183           0 :                 brelse(bitmap_bh);
    4184           0 : }
    4185             : 
    4186             : /*
    4187             :  * here we normalize request for locality group
    4188             :  * Group requests are normalized to s_mb_group_prealloc, which goes to
    4189             :  * s_strip if we set the same via mount option.
    4190             :  * s_mb_group_prealloc can be configured via
    4191             :  * /sys/fs/ext4/<partition>/mb_group_prealloc
    4192             :  *
                     :  * Only the goal length (ac->ac_g_ex.fe_len) is rewritten here; no
                     :  * other field of the goal extent is touched.  ac->ac_lg must be
                     :  * non-NULL, which is enforced with BUG_ON().
                     :  *
    4193             :  * XXX: should we try to preallocate more than the group has now?
    4194             :  */
    4195             : static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
    4196             : {
    4197       27358 :         struct super_block *sb = ac->ac_sb;
    4198       27358 :         struct ext4_locality_group *lg = ac->ac_lg;
    4199             : 
    4200           0 :         BUG_ON(lg == NULL);
    4201       27358 :         ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
    4202       27358 :         mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
    4203             : }
    4204             : 
    4205             : /*
    4206             :  * This function returns the next element to look at during inode
    4207             :  * PA rbtree walk. We assume that we have held the inode PA rbtree lock
    4208             :  * (ei->i_prealloc_lock)
    4209             :  *
    4210             :  * new_start    The start of the range we want to compare
    4211             :  * cur_start    The existing start that we are comparing against
    4212             :  * node         The node of the rb_tree currently being visited
                     :  *
                     :  * Returns node->rb_left when new_start < cur_start, node->rb_right
                     :  * otherwise.  The callers in this file stop their walk when the
                     :  * returned pointer is NULL.
    4213             :  */
    4214             : static inline struct rb_node*
    4215             : ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
    4216             : {
    4217    13065709 :         if (new_start < cur_start)
    4218     3935445 :                 return node->rb_left;
    4219             :         else
    4220     9130264 :                 return node->rb_right;
    4221             : }
    4222             : 
    4223             : static inline void
    4224     1475377 : ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
    4225             :                           ext4_lblk_t start, ext4_lblk_t end)
    4226             : {       /*
                     :          * Sanity check: descend the inode PA rbtree towards "start" and
                     :          * BUG if any live (pa_deleted == 0) PA met on the way intersects
                     :          * [start, end); "end" is exclusive.  Only the nodes on the search
                     :          * path are examined, not the whole tree.  Takes
                     :          * ei->i_prealloc_lock for reading and each visited PA's pa_lock.
                     :          */
    4227     1475377 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4228     1475377 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    4229     1475377 :         struct ext4_prealloc_space *tmp_pa;
    4230     1475377 :         ext4_lblk_t tmp_pa_start, tmp_pa_end;
    4231     1475377 :         struct rb_node *iter;
    4232             : 
    4233     1475377 :         read_lock(&ei->i_prealloc_lock);
    4234     5088826 :         for (iter = ei->i_prealloc_node.rb_node; iter;
    4235     3613320 :              iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
    4236     3613376 :                 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4237             :                                   pa_node.inode_node);
    4238     3613376 :                 tmp_pa_start = tmp_pa->pa_lstart;
    4239     3613376 :                 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
    4240             : 
    4241     3613376 :                 spin_lock(&tmp_pa->pa_lock);
    4242     3611023 :                 if (tmp_pa->pa_deleted == 0) /* deleted PAs are allowed to overlap */
    4243     3610733 :                         BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
    4244     3611023 :                 spin_unlock(&tmp_pa->pa_lock);
    4245             :         }
    4246     1475450 :         read_unlock(&ei->i_prealloc_lock);
    4247     1475476 : }
    4248             : 
    4249             : /*
    4250             :  * Given an allocation context "ac" and a range "start", "end", check
    4251             :  * and adjust boundaries if the range overlaps with any of the existing
    4252             :  * preallocations stored in the corresponding inode of the allocation context.
    4253             :  *
    4254             :  * Parameters:
    4255             :  *      ac                      allocation context
    4256             :  *      start                   start of the new range (in/out)
    4257             :  *      end                     end of the new range (in/out, exclusive)
                     :  *
                     :  * The range is only ever shrunk: *start may be raised to the end of the
                     :  * nearest non-deleted PA to the left of ac->ac_o_ex.fe_logical and *end
                     :  * may be lowered to the start of the nearest non-deleted PA to its
                     :  * right.  BUGs if a non-deleted PA covers ac->ac_o_ex.fe_logical.
    4258             :  */
    4259             : static inline void
    4260     1474824 : ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
    4261             :                           ext4_lblk_t *start, ext4_lblk_t *end)
    4262             : {
    4263     1474824 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    4264     1474824 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4265     1474824 :         struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
    4266     1474824 :         struct rb_node *iter;
    4267     1474824 :         ext4_lblk_t new_start, new_end;
    4268     1474824 :         ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
    4269             : 
    4270     1474824 :         new_start = *start;
    4271     1474824 :         new_end = *end;
    4272             : 
    4273             :         /*
    4274             :          * Adjust the normalized range so that it doesn't overlap with any
    4275             :          * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
    4276             :          * so it doesn't change underneath us.
    4277             :          */
    4278     1474824 :         read_lock(&ei->i_prealloc_lock);
    4279             : 
    4280             :         /* Step 1: find any one immediate neighboring PA of the normalized range */
    4281     5088783 :         for (iter = ei->i_prealloc_node.rb_node; iter;
    4282     3613594 :              iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
    4283             :                                             tmp_pa_start, iter)) {
    4284     3613389 :                 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4285             :                                   pa_node.inode_node);
    4286     3613389 :                 tmp_pa_start = tmp_pa->pa_lstart;
    4287     3613389 :                 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
    4288             : 
    4289             :                 /* PA must not overlap original request */
    4290     3613389 :                 spin_lock(&tmp_pa->pa_lock);
    4291     3610515 :                 if (tmp_pa->pa_deleted == 0)
    4292     3610228 :                         BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
    4293             :                                  ac->ac_o_ex.fe_logical < tmp_pa_start));
    4294     3610515 :                 spin_unlock(&tmp_pa->pa_lock);
    4295             :         }
    4296             : 
    4297             :         /*
    4298             :          * Step 2: check if the found PA is left or right neighbor and
    4299             :          * get the other neighbor
                     :          *
                     :          * tmp_pa is the last node the walk above visited; it is still
                     :          * NULL when the tree was empty, in which case there are no
                     :          * neighbors and the range is left untouched.
    4300             :          */
    4301     1475394 :         if (tmp_pa) {
    4302      489427 :                 if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
    4303      279317 :                         struct rb_node *tmp;
    4304             : 
    4305      279317 :                         left_pa = tmp_pa;
    4306      279317 :                         tmp = rb_next(&left_pa->pa_node.inode_node);
    4307      279294 :                         if (tmp) {
    4308      126675 :                                 right_pa = rb_entry(tmp,
    4309             :                                                     struct ext4_prealloc_space,
    4310             :                                                     pa_node.inode_node);
    4311             :                         }
    4312             :                 } else {
    4313      210110 :                         struct rb_node *tmp;
    4314             : 
    4315      210110 :                         right_pa = tmp_pa;
    4316      210110 :                         tmp = rb_prev(&right_pa->pa_node.inode_node);
    4317      210113 :                         if (tmp) {
    4318             :                                 left_pa = rb_entry(tmp,
    4319             :                                                    struct ext4_prealloc_space,
    4320             :                                                    pa_node.inode_node);
    4321             :                         }
    4322             :                 }
    4323             :         }
    4324             : 
    4325             :         /* Step 3: get the non deleted neighbors */
    4326      392746 :         if (left_pa) {
    4327      392746 :                 for (iter = &left_pa->pa_node.inode_node;;
    4328          99 :                      iter = rb_prev(iter)) {
    4329      392845 :                         if (!iter) { /* ran off the left edge: no live left neighbor */
    4330             :                                 left_pa = NULL;
    4331             :                                 break;
    4332             :                         }
    4333             : 
    4334      392765 :                         tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4335             :                                           pa_node.inode_node);
    4336      392765 :                         left_pa = tmp_pa;
    4337      392765 :                         spin_lock(&tmp_pa->pa_lock);
    4338      392702 :                         if (tmp_pa->pa_deleted == 0) {
    4339      392603 :                                 spin_unlock(&tmp_pa->pa_lock);
    4340             :                                 break;
    4341             :                         }
    4342          99 :                         spin_unlock(&tmp_pa->pa_lock);
    4343             :                 }
    4344             :         }
    4345             : 
    4346     1475307 :         if (right_pa) {
    4347      336750 :                 for (iter = &right_pa->pa_node.inode_node;;
    4348          48 :                      iter = rb_next(iter)) {
    4349      336798 :                         if (!iter) { /* ran off the right edge: no live right neighbor */
    4350             :                                 right_pa = NULL;
    4351             :                                 break;
    4352             :                         }
    4353             : 
    4354      336881 :                         tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4355             :                                           pa_node.inode_node);
    4356      336881 :                         right_pa = tmp_pa;
    4357      336881 :                         spin_lock(&tmp_pa->pa_lock);
    4358      336777 :                         if (tmp_pa->pa_deleted == 0) {
    4359      336729 :                                 spin_unlock(&tmp_pa->pa_lock);
    4360             :                                 break;
    4361             :                         }
    4362          48 :                         spin_unlock(&tmp_pa->pa_lock);
    4363             :                 }
    4364             :         }
    4365             : 
    4366     1475246 :         if (left_pa) {
    4367      785184 :                 left_pa_end =
    4368      392592 :                         left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
    4369      392592 :                 BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
    4370             :         }
    4371             : 
    4372     1475246 :         if (right_pa) {
    4373      336723 :                 right_pa_start = right_pa->pa_lstart;
    4374      336723 :                 BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
    4375             :         }
    4376             : 
    4377             :         /* Step 4: trim our normalized range to not overlap with the neighbors */
    4378     1475246 :         if (left_pa) {
    4379      392602 :                 if (left_pa_end > new_start)
    4380             :                         new_start = left_pa_end;
    4381             :         }
    4382             : 
    4383     1475246 :         if (right_pa) {
    4384      336724 :                 if (right_pa_start < new_end)
    4385             :                         new_end = right_pa_start;
    4386             :         }
    4387     1475246 :         read_unlock(&ei->i_prealloc_lock);
    4388             : 
    4389             :         /* XXX: extra loop to check we really don't overlap preallocations */
    4390     1475425 :         ext4_mb_pa_assert_overlap(ac, new_start, new_end);
    4391             : 
    4392     1475458 :         *start = new_start;
    4393     1475458 :         *end = new_end;
    4394     1475458 : }
    4395             : 
    4396             : /*
    4397             :  * Normalization means making request better in terms of
    4398             :  * size and alignment
                     :  *
                     :  * Nothing is changed unless EXT4_MB_HINT_DATA is set and neither
                     :  * EXT4_MB_HINT_GOAL_ONLY nor EXT4_MB_HINT_NOPREALLOC is.  Requests
                     :  * flagged EXT4_MB_HINT_GROUP_ALLOC are handed to
                     :  * ext4_mb_normalize_group_request().  For everything else the
                     :  * function picks a size/alignment bucket from the predicted file
                     :  * size, trims the range against the neighbouring extents described
                     :  * by "ar" (lleft/lright), the group size and the inode's existing
                     :  * preallocations, and stores the result in ac->ac_g_ex.fe_logical,
                     :  * ac->ac_g_ex.fe_len and ac->ac_orig_goal_len.  When the range abuts
                     :  * a neighbouring extent it also sets the goal group/offset and
                     :  * EXT4_MB_HINT_TRY_GOAL so the allocation can be merged with it.
    4399             :  */
    4400             : static noinline_for_stack void
    4401     2678942 : ext4_mb_normalize_request(struct ext4_allocation_context *ac,
    4402             :                                 struct ext4_allocation_request *ar)
    4403             : {
    4404     2678942 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4405     2678942 :         struct ext4_super_block *es = sbi->s_es;
    4406     2678942 :         int bsbits, max;
    4407     2678942 :         ext4_lblk_t end;
    4408     2678942 :         loff_t size, start_off;
    4409     2678942 :         loff_t orig_size __maybe_unused;
    4410     2678942 :         ext4_lblk_t start;
    4411             : 
    4412             :         /* do normalize only data requests, metadata requests
    4413             :            do not need preallocation */
    4414     2678942 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    4415     1203935 :                 return;
    4416             : 
    4417             :         /* sometimes the caller may want exact blocks */
    4418     2147867 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    4419             :                 return;
    4420             : 
    4421             :         /* caller may indicate that preallocation isn't
    4422             :          * required (it's a tail, for example) */
    4423     2147867 :         if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
    4424             :                 return;
    4425             : 
    4426     1502365 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
    4427       27358 :                 ext4_mb_normalize_group_request(ac);
    4428       27358 :                 return ;
    4429             :         }
    4430             : 
    4431     1475007 :         bsbits = ac->ac_sb->s_blocksize_bits;
    4432             : 
    4433             :         /* first, let's learn actual file size
    4434             :          * given current request is allocated */
    4435     1475007 :         size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
    4436     1475007 :         size = size << bsbits;
    4437     1475007 :         if (size < i_size_read(ac->ac_inode))
    4438             :                 size = i_size_read(ac->ac_inode);
    4439     1475007 :         orig_size = size; /* only consumed by the mb_debug() at the end */
    4440             : 
    4441             :         /* max size of free chunks */
    4442     1475007 :         max = 2 << bsbits;
    4443             : 
    4444             : #define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
    4445             :                 (req <= (size) || max <= (chunk_size))
    4446             : 
    4447             :         /* first, try to predict filesize */
    4448             :         /* XXX: should this table be tunable? */
    4449     1475007 :         start_off = 0;
    4450     1475007 :         if (size <= 16 * 1024) {
    4451             :                 size = 16 * 1024;
    4452     1474893 :         } else if (size <= 32 * 1024) {
    4453             :                 size = 32 * 1024;
    4454     1474708 :         } else if (size <= 64 * 1024) {
    4455             :                 size = 64 * 1024;
    4456     1474673 :         } else if (size <= 128 * 1024) {
    4457             :                 size = 128 * 1024;
    4458     1466834 :         } else if (size <= 256 * 1024) {
    4459             :                 size = 256 * 1024;
    4460     1408806 :         } else if (size <= 512 * 1024) {
    4461             :                 size = 512 * 1024;
    4462     1278285 :         } else if (size <= 1024 * 1024) {
    4463             :                 size = 1024 * 1024;
    4464     1097414 :         } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
    4465           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    4466       73513 :                                                 (21 - bsbits)) << 21;
    4467       73513 :                 size = 2 * 1024 * 1024;
    4468     1023901 :         } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
    4469           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    4470       36993 :                                                         (22 - bsbits)) << 22;
    4471       36993 :                 size = 4 * 1024 * 1024;
    4472      986908 :         } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
    4473             :                                         (8<<20)>>bsbits, max, 8 * 1024)) {
    4474           0 :                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
    4475      986908 :                                                         (23 - bsbits)) << 23;
    4476      986908 :                 size = 8 * 1024 * 1024;
    4477             :         } else {
    4478           0 :                 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
    4479           0 :                 size      = (loff_t) EXT4_C2B(sbi,
    4480           0 :                                               ac->ac_o_ex.fe_len) << bsbits;
    4481             :         }
    4482     1475007 :         size = size >> bsbits; /* from here on, size and start are in blocks */
    4483     1475007 :         start = start_off >> bsbits;
    4484             : 
    4485             :         /*
    4486             :          * For tiny groups (smaller than 8MB) the chosen allocation
    4487             :          * alignment may be larger than group size. Make sure the
    4488             :          * alignment does not move allocation to a different group which
    4489             :          * makes mballoc fail assertions later.
    4490             :          */
    4491     1475007 :         start = max(start, rounddown(ac->ac_o_ex.fe_logical,
    4492             :                         (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
    4493             : 
    4494             :         /* don't cover already allocated blocks in selected range */
    4495     1475007 :         if (ar->pleft && start <= ar->lleft) {
    4496     1150979 :                 size -= ar->lleft + 1 - start;
    4497     1150979 :                 start = ar->lleft + 1;
    4498             :         }
    4499     1475007 :         if (ar->pright && start + size - 1 >= ar->lright)
    4500      482279 :                 size -= start + size - ar->lright;
    4501             : 
    4502             :         /*
    4503             :          * Trim allocation request for filesystems with artificially small
    4504             :          * groups.
    4505             :          */
    4506     1475007 :         if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
    4507          48 :                 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
    4508             : 
    4509     1475007 :         end = start + size; /* exclusive end of the candidate range */
    4510             : 
    4511     1475007 :         ext4_mb_pa_adjust_overlap(ac, &start, &end);
    4512             : 
    4513     1475472 :         size = end - start;
    4514             : 
    4515             :         /*
    4516             :          * In this function "start" and "size" are normalized for better
    4517             :          * alignment and length such that we could preallocate more blocks.
    4518             :          * This normalization is done such that original request of
    4519             :          * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
    4520             :          * "size" boundaries.
    4521             :          * (Note fe_len can be relaxed since FS block allocation API does not
    4522             :          * provide guarantee on number of contiguous blocks allocation since that
    4523             :          * depends upon free space left, etc).
    4524             :          * In case of inode pa, later we use the allocated blocks
    4525             :          * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
    4526             :          * range of goal/best blocks [start, size] to put it at the
    4527             :          * ac_o_ex.fe_logical extent of this inode.
    4528             :          * (See ext4_mb_use_inode_pa() for more details)
    4529             :          */
    4530     1475472 :         if (start + size <= ac->ac_o_ex.fe_logical ||
    4531             :                         start > ac->ac_o_ex.fe_logical) {
    4532           0 :                 ext4_msg(ac->ac_sb, KERN_ERR,
    4533             :                          "start %lu, size %lu, fe_logical %lu",
    4534             :                          (unsigned long) start, (unsigned long) size,
    4535             :                          (unsigned long) ac->ac_o_ex.fe_logical);
    4536           0 :                 BUG();
    4537             :         }
    4538     1475472 :         BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
    4539             : 
    4540             :         /* now prepare goal request */
    4541             : 
    4542             :         /* XXX: is it better to align blocks WRT to logical
    4543             :          * placement or satisfy big request as is */
    4544     1475472 :         ac->ac_g_ex.fe_logical = start;
    4545     1475472 :         ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
    4546     1475472 :         ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
    4547             : 
    4548             :         /* define goal start in order to merge */
    4549     1475472 :         if (ar->pright && (ar->lright == (start + size)) &&
    4550      482522 :             ar->pright >= size &&
    4551      482522 :             ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
    4552             :                 /* merge to the right */
    4553      482522 :                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
    4554             :                                                 &ac->ac_g_ex.fe_group,
    4555             :                                                 &ac->ac_g_ex.fe_start);
    4556      482515 :                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
    4557             :         }
    4558     2702646 :         if (ar->pleft && (ar->lleft + 1 == start) &&
    4559     1227181 :             ar->pleft + 1 < ext4_blocks_count(es)) {
    4560             :                 /* merge to the left; overrides a right-merge goal set just above */
    4561     1227123 :                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
    4562             :                                                 &ac->ac_g_ex.fe_group,
    4563             :                                                 &ac->ac_g_ex.fe_start);
    4564     1227125 :                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
    4565             :         }
    4566             : 
    4567     1475467 :         mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
    4568             :                  orig_size, start);
    4569             : }
    4570             : 
    4571     3620081 : static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
    4572             : {       /*
                     :          * When sbi->s_mb_stats is set and the goal length is at least 1,
                     :          * fold this context's results into the per-superblock s_bal_*
                     :          * counters.  Independently of s_mb_stats, emit
                     :          * trace_ext4_mballoc_alloc() when ac_op is EXT4_MB_HISTORY_ALLOC
                     :          * and trace_ext4_mballoc_prealloc() for every other ac_op.
                     :          */
    4573     3620081 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4574             : 
    4575     3620081 :         if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
    4576           0 :                 atomic_inc(&sbi->s_bal_reqs);
    4577           0 :                 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
    4578           0 :                 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) /* got at least what was asked for */
    4579           0 :                         atomic_inc(&sbi->s_bal_success);
    4580             : 
    4581           0 :                 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
    4582           0 :                 for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
    4583           0 :                         atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
    4584             :                 }
    4585             : 
    4586           0 :                 atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
    4587           0 :                 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && /* best extent sits at the goal position */
    4588           0 :                                 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
    4589           0 :                         atomic_inc(&sbi->s_bal_goals);
    4590             :                 /* did we allocate as much as normalizer originally wanted? */
    4591           0 :                 if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
    4592           0 :                         atomic_inc(&sbi->s_bal_len_goals);
    4593             : 
    4594           0 :                 if (ac->ac_found > sbi->s_mb_max_to_scan)
    4595           0 :                         atomic_inc(&sbi->s_bal_breaks);
    4596             :         }
    4597             : 
    4598     3620081 :         if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
    4599     2679829 :                 trace_ext4_mballoc_alloc(ac);
    4600             :         else
    4601      940252 :                 trace_ext4_mballoc_prealloc(ac);
    4602     3619796 : }
    4603             : 
    4604             : /*
    4605             :  * Called on failure; free up any blocks from the inode PA for this
    4606             :  * context.  We don't need this for MB_GROUP_PA because we only change
    4607             :  * pa_free in ext4_mb_release_context(), but on failure, we've already
    4608             :  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
                     :  *
                     :  * Two paths, selected by ac->ac_pa:
                     :  *  - no PA attached: if ac->ac_f_ex.fe_len is non-zero, the extent
                     :  *    ac->ac_f_ex is handed back with mb_free_blocks() under the group
                     :  *    lock; if it is zero there is nothing to undo.
                     :  *  - PA attached: for MB_INODE_PA, ac->ac_b_ex.fe_len is added back
                     :  *    to pa->pa_free under pa_lock; other PA types are left alone.
    4609             :  */
    4610           1 : static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
    4611             : {
    4612           1 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    4613           1 :         struct ext4_buddy e4b;
    4614           1 :         int err;
    4615             : 
    4616           1 :         if (pa == NULL) {
    4617           1 :                 if (ac->ac_f_ex.fe_len == 0)
    4618           1 :                         return;
    4619           0 :                 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
    4620           0 :                 if (WARN_RATELIMIT(err,
    4621             :                                    "ext4: mb_load_buddy failed (%d)", err))
    4622             :                         /*
    4623             :                          * This should never happen since we pin the
    4624             :                          * pages in the ext4_allocation_context so
    4625             :                          * ext4_mb_load_buddy() should never fail.
    4626             :                          */
    4627             :                         return;
    4628           0 :                 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
    4629           0 :                 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
    4630             :                                ac->ac_f_ex.fe_len);
    4631           0 :                 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
    4632           0 :                 ext4_mb_unload_buddy(&e4b);
    4633           0 :                 return;
    4634             :         }
    4635           0 :         if (pa->pa_type == MB_INODE_PA) {
    4636           0 :                 spin_lock(&pa->pa_lock);
    4637           0 :                 pa->pa_free += ac->ac_b_ex.fe_len;
    4638           0 :                 spin_unlock(&pa->pa_lock);
    4639             :         }
    4640             : }
    4641             : 
    4642             : /*
    4643             :  * use blocks preallocated to inode
                     :  *
                     :  * The physical start is pa_pstart shifted by the distance between the
                     :  * requested logical block (ac->ac_o_ex.fe_logical) and pa_lstart; the
                     :  * end is the requested end clamped to the end of the PA.  The chosen
                     :  * extent is recorded in ac->ac_b_ex, ac->ac_status becomes
                     :  * AC_STATUS_FOUND, ac->ac_pa points at "pa", and pa->pa_free is
                     :  * reduced by the number of clusters taken.
                     :  *
                     :  * NOTE(review): pa->pa_free is modified here without taking
                     :  * pa->pa_lock; presumably the caller already holds it -- confirm.
    4644             :  */
    4645     1390611 : static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
    4646             :                                 struct ext4_prealloc_space *pa)
    4647             : {
    4648     1390611 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4649     1390611 :         ext4_fsblk_t start;
    4650     1390611 :         ext4_fsblk_t end;
    4651     1390611 :         int len;
    4652             : 
    4653             :         /* found preallocated blocks, use them */
    4654     1390611 :         start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
    4655     1390611 :         end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
    4656             :                   start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
    4657     1390611 :         len = EXT4_NUM_B2C(sbi, end - start); /* blocks -> clusters */
    4658     1390611 :         ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
    4659             :                                         &ac->ac_b_ex.fe_start);
    4660     1390429 :         ac->ac_b_ex.fe_len = len;
    4661     1390429 :         ac->ac_status = AC_STATUS_FOUND;
    4662     1390429 :         ac->ac_pa = pa;
    4663             : 
    4664     1390429 :         BUG_ON(start < pa->pa_pstart);
    4665     1390429 :         BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
    4666     1390429 :         BUG_ON(pa->pa_free < len);
    4667     1390429 :         BUG_ON(ac->ac_b_ex.fe_len <= 0);
    4668     1390429 :         pa->pa_free -= len;
    4669             : 
    4670     1390429 :         mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
    4671     1390429 : }
    4672             : 
    4673             : /*
    4674             :  * use blocks preallocated to locality group
                     :  *
                     :  * Records an extent of ac->ac_o_ex.fe_len starting at pa->pa_pstart in
                     :  * ac->ac_b_ex, sets ac->ac_status to AC_STATUS_FOUND and attaches "pa"
                     :  * to the context.  The PA itself is not modified here (see the
                     :  * comment in the body for why).
    4675             :  */
    4676      266501 : static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
    4677             :                                 struct ext4_prealloc_space *pa)
    4678             : {
    4679      266501 :         unsigned int len = ac->ac_o_ex.fe_len;
    4680             : 
    4681      266501 :         ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
    4682             :                                         &ac->ac_b_ex.fe_group,
    4683             :                                         &ac->ac_b_ex.fe_start);
    4684      266501 :         ac->ac_b_ex.fe_len = len;
    4685      266501 :         ac->ac_status = AC_STATUS_FOUND;
    4686      266501 :         ac->ac_pa = pa;
    4687             : 
    4688             :         /* we don't correct pa_pstart or pa_len here to avoid
    4689             :          * possible race when the group is being loaded concurrently
    4690             :          * instead we correct pa later, after blocks are marked
    4691             :          * in on-disk bitmap -- see ext4_mb_release_context()
    4692             :          * Other CPUs are prevented from allocating from this pa by lg_mutex
    4693             :          */
    4694      266501 :         mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
    4695             :                  pa->pa_lstart, len, pa);
    4696      266501 : }
    4697             : 
    4698             : /*
    4699             :  * Return the prealloc space that has minimal distance
    4700             :  * from the goal block. @cpa is the prealloc
    4701             :  * space that currently has the known minimal distance
    4702             :  * from the goal block (NULL if none has been chosen yet).
                     :  *
                     :  * A pa_count reference is taken on @pa when it becomes the new
                     :  * choice, and the reference held on @cpa is dropped when @cpa is
                     :  * replaced. When @cpa is kept, no reference counts change.
    4703             :  */
    4704             : static struct ext4_prealloc_space *
    4705      272497 : ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
    4706             :                         struct ext4_prealloc_space *pa,
    4707             :                         struct ext4_prealloc_space *cpa)
    4708             : {
    4709      272497 :         ext4_fsblk_t cur_distance, new_distance;
    4710             : 
    4711      272497 :         if (cpa == NULL) {
    4712      264806 :                 atomic_inc(&pa->pa_count);
    4713      264806 :                 return pa;
    4714             :         }
    4715        7691 :         cur_distance = abs(goal_block - cpa->pa_pstart);
    4716        7691 :         new_distance = abs(goal_block - pa->pa_pstart);
    4717             : 
    4718        7691 :         if (cur_distance <= new_distance) /* a tie keeps the current choice */
    4719             :                 return cpa;
    4720             : 
    4721             :         /* drop the previous reference */
    4722        7442 :         atomic_dec(&cpa->pa_count);
    4723        7442 :         atomic_inc(&pa->pa_count);
    4724        7442 :         return pa;
    4725             : }
    4726             : 
    4727             : /*
    4728             :  * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
                     :  *
                     :  * Returns true when EXT4_MB_HINT_GOAL_ONLY is not set. Otherwise
                     :  * returns true only if the physical block that the pa maps to the
                     :  * goal's logical block equals the goal's physical block, and the goal
                     :  * length does not exceed pa_len minus the goal's cluster offset into
                     :  * the pa.
    4729             :  */
    4730             : static bool
    4731      674968 : ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
    4732             :                       struct ext4_prealloc_space *pa)
    4733             : {
    4734      674968 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4735      674968 :         ext4_fsblk_t start;
    4736             : 
    4737      674968 :         if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
    4738             :                 return true;
    4739             : 
    4740             :         /*
    4741             :          * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
    4742             :          * in ext4_mb_normalize_request and will stay the same as ac_o_ex
    4743             :          * from ext4_mb_initialize_context. Choose ac_g_ex here to stay
    4744             :          * consistent with ext4_mb_find_by_goal.
    4745             :          */
    4746           0 :         start = pa->pa_pstart +
    4747           0 :                 (ac->ac_g_ex.fe_logical - pa->pa_lstart);
    4748           0 :         if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
    4749             :                 return false;
    4750             : 
    4751           0 :         if (ac->ac_g_ex.fe_len > pa->pa_len -
    4752           0 :             EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
    4753           0 :                 return false;
    4754             : 
    4755             :         return true;
    4756             : }
    4757             : 
    4758             : /*
    4759             :  * search goal blocks in preallocated space
                     :  *
                     :  * Tries the inode's pa rbtree first and then, if
                     :  * EXT4_MB_HINT_GROUP_ALLOC is set and the context has a locality
                     :  * group, that group's pa lists. Returns true when @ac was satisfied
                     :  * from a preallocation, false otherwise.
    4760             :  */
    4761             : static noinline_for_stack bool
    4762     3618480 : ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
    4763             : {
    4764     3618480 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    4765     3618480 :         int order, i;
    4766     3618480 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    4767     3618480 :         struct ext4_locality_group *lg;
    4768     3618480 :         struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
    4769     3618480 :         loff_t tmp_pa_end;
    4770     3618480 :         struct rb_node *iter;
    4771     3618480 :         ext4_fsblk_t goal_block;
    4772             : 
    4773             :         /* only data can be preallocated */
    4774     3618480 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    4775             :                 return false;
    4776             : 
    4777             :         /*
    4778             :          * first, try per-file preallocation by searching the inode pa rbtree.
    4779             :          *
    4780             :          * Here, we can't do a direct traversal of the tree because
    4781             :          * ext4_mb_discard_group_preallocation() can mark the pa deleted
    4782             :          * in parallel and that can cause direct traversal to skip some entries.
    4783             :          */
    4784     3087229 :         read_lock(&ei->i_prealloc_lock);
    4785             : 
    4786     3087954 :         if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
    4787     1869964 :                 goto try_group_pa;
    4788             :         }
    4789             : 
    4790             :         /*
    4791             :          * Step 1: Find a pa with logical start immediately adjacent to the
    4792             :          * original logical start. This could be on the left or right.
    4793             :          *
    4794             :          * (tmp_pa->pa_lstart never changes so we can skip locking for it).
    4795             :          */
    4796     7056785 :         for (iter = ei->i_prealloc_node.rb_node; iter;
    4797     5838795 :              iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
    4798             :                                             tmp_pa->pa_lstart, iter)) {
    4799     5838795 :                 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4800             :                                   pa_node.inode_node);
    4801             :         }
    4802             : 
    4803             :         /*
    4804             :          * Step 2: The adjacent pa might be to the right of logical start, find
    4805             :          * the left adjacent pa. After this step we'd have a valid tmp_pa whose
    4806             :          * logical start is towards the left of original request's logical start
    4807             :          */
    4808     1217990 :         if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
    4809      268775 :                 struct rb_node *tmp;
    4810      268775 :                 tmp = rb_prev(&tmp_pa->pa_node.inode_node);
    4811             : 
    4812      268772 :                 if (tmp) {
    4813             :                         tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
    4814             :                                             pa_node.inode_node);
    4815             :                 } else {
    4816             :                         /*
    4817             :                          * If there is no adjacent pa to the left then finding
    4818             :                          * an overlapping pa is not possible hence stop searching
    4819             :                          * inode pa tree
    4820             :                          */
    4821      120096 :                         goto try_group_pa;
    4822             :                 }
    4823             :         }
    4824             : 
    4825     1097891 :         BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
    4826             : 
    4827             :         /*
    4828             :          * Step 3: If the left adjacent pa is deleted, keep moving left to find
    4829             :          * the first non deleted adjacent pa. After this step we should have a
    4830             :          * valid tmp_pa which is guaranteed to be non deleted.
    4831             :          */
    4832     1097990 :         for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
    4833     1097990 :                 if (!iter) {
    4834             :                         /*
    4835             :                          * no non deleted left adjacent pa, so stop searching
    4836             :                          * inode pa tree
    4837             :                          */
    4838          82 :                         goto try_group_pa;
    4839             :                 }
    4840     1097908 :                 tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
    4841             :                                   pa_node.inode_node);
    4842     1097908 :                 spin_lock(&tmp_pa->pa_lock);
    4843     1098128 :                 if (tmp_pa->pa_deleted == 0) {
    4844             :                         /*
    4845             :                          * We will keep holding the pa_lock from
    4846             :                          * this point on because we don't want group discard
    4847             :                          * to delete this pa underneath us. Since group
    4848             :                          * discard is anyways an ENOSPC operation it
    4849             :                          * should be okay for it to wait a few more cycles.
    4850             :                          */
    4851             :                         break;
    4852             :                 } else {
    4853          99 :                         spin_unlock(&tmp_pa->pa_lock);
    4854             :                 }
    4855             :         }
    4856             : 
    4857     1098029 :         BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
    4858     1098029 :         BUG_ON(tmp_pa->pa_deleted == 1);
    4859             : 
    4860             :         /*
    4861             :          * Step 4: We now have the non deleted left adjacent pa. Only this
    4862             :          * pa can possibly satisfy the request hence check if it overlaps
    4863             :          * original logical start and stop searching if it doesn't.
    4864             :          */
    4865     1098029 :         tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
    4866             : 
    4867     1098029 :         if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
    4868      422803 :                 spin_unlock(&tmp_pa->pa_lock);
    4869      422900 :                 goto try_group_pa;
    4870             :         }
    4871             : 
    4872             :         /* non-extent files can't have physical blocks past 2^32 */
    4873      675226 :         if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
    4874        3953 :             (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
    4875             :              EXT4_MAX_BLOCK_FILE_PHYS)) {
    4876             :                 /*
    4877             :                  * Since PAs don't overlap, we won't find any other PA to
    4878             :                  * satisfy this.
    4879             :                  */
    4880           0 :                 spin_unlock(&tmp_pa->pa_lock);
    4881           0 :                 goto try_group_pa;
    4882             :         }
    4883             : 
    4884      675226 :         if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
    4885      674687 :                 atomic_inc(&tmp_pa->pa_count);
    4886      675414 :                 ext4_mb_use_inode_pa(ac, tmp_pa);
    4887      674714 :                 spin_unlock(&tmp_pa->pa_lock);
    4888      675212 :                 read_unlock(&ei->i_prealloc_lock);
    4889      675212 :                 return true;
    4890             :         } else {
    4891             :                 /*
    4892             :                  * We found a valid overlapping pa but couldn't use it: either
    4893             :                  * it failed the EXT4_MB_HINT_GOAL_ONLY check or it had no free
    4894             :                  * blocks. The latter should ideally never happen because:
    4895             :                  *
    4896             :                  * 1. When a new inode pa is added to rbtree it must have
    4897             :                  *    pa_free > 0 since otherwise we won't actually need
    4898             :                  *    preallocation.
    4899             :                  *
    4900             :                  * 2. An inode pa that is in the rbtree can only have its
    4901             :                  *    pa_free become zero when another thread calls:
    4902             :                  *      ext4_mb_new_blocks
    4903             :                  *       ext4_mb_use_preallocated
    4904             :                  *        ext4_mb_use_inode_pa
    4905             :                  *
    4906             :                  * 3. Further, after the above calls make pa_free == 0, we will
    4907             :                  *    immediately remove it from the rbtree in:
    4908             :                  *      ext4_mb_new_blocks
    4909             :                  *       ext4_mb_release_context
    4910             :                  *        ext4_mb_put_pa
    4911             :                  *
    4912             :                  * 4. Since pa_free becoming 0 and the pa getting removed
    4913             :                  * from tree both happen in ext4_mb_new_blocks, which is always
    4914             :                  * called with i_data_sem held for data allocations, we can be
    4915             :                  * sure that another process will never see a pa in rbtree with
    4916             :                  * pa_free == 0.
    4917             :                  */
    4918         144 :                 WARN_ON_ONCE(tmp_pa->pa_free == 0);
    4919             :         }
    4920         144 :         spin_unlock(&tmp_pa->pa_lock);
    4921     2413042 : try_group_pa:
    4922     2413042 :         read_unlock(&ei->i_prealloc_lock);
    4923             : 
    4924             :         /* can we use group allocation? */
    4925     2412963 :         if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
    4926             :                 return false;
    4927             : 
    4928             :         /* inode may have no locality group for some reason */
    4929      295863 :         lg = ac->ac_lg;
    4930      295863 :         if (lg == NULL)
    4931             :                 return false;
    4932      295863 :         order  = fls(ac->ac_o_ex.fe_len) - 1;
    4933      295863 :         if (order > PREALLOC_TB_SIZE - 1)
    4934             :                 /* The max size of hash table is PREALLOC_TB_SIZE */
    4935             :                 order = PREALLOC_TB_SIZE - 1;
    4936             : 
    4937      295863 :         goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
    4938             :         /*
    4939             :          * search for the prealloc space that has the
    4940             :          * minimal distance from the goal block.
    4941             :          */
    4942     3389368 :         for (i = order; i < PREALLOC_TB_SIZE; i++) {
    4943     2797626 :                 rcu_read_lock();
    4944     3257214 :                 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
    4945             :                                         pa_node.lg_list) {
    4946      459521 :                         spin_lock(&tmp_pa->pa_lock);
    4947      459521 :                         if (tmp_pa->pa_deleted == 0 &&
    4948      459521 :                                         tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
    4949             : 
    4950      272497 :                                 cpa = ext4_mb_check_group_pa(goal_block,
    4951             :                                                                 tmp_pa, cpa);
    4952             :                         }
    4953      459521 :                         spin_unlock(&tmp_pa->pa_lock);
    4954             :                 }
    4955     2797683 :                 rcu_read_unlock();
    4956             :         }
    4957      295879 :         if (cpa) {
    4958      264806 :                 ext4_mb_use_group_pa(ac, cpa);
    4959      264806 :                 return true;
    4960             :         }
    4961             :         return false;
    4962             : }
    4963             : 
    4964             : /*
    4965             :  * the function goes through all blocks freed in the group
    4966             :  * but not yet committed and marks them used in the in-core bitmap.
    4967             :  * buddy must be generated from this bitmap
    4968             :  * Needs to be called with the ext4 group lock held
                     :  *
                     :  * Does nothing if ext4_get_group_info() returns NULL for @group.
    4969             :  */
    4970      160468 : static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
    4971             :                                                 ext4_group_t group)
    4972             : {
    4973      160468 :         struct rb_node *n;
    4974      160468 :         struct ext4_group_info *grp;
    4975      160468 :         struct ext4_free_data *entry;
    4976             : 
    4977      160468 :         grp = ext4_get_group_info(sb, group);
    4978      160468 :         if (!grp)
    4979             :                 return;
    4980      160468 :         n = rb_first(&(grp->bb_free_root));
    4981             : 
    4982      160468 :         while (n) {
    4983           0 :                 entry = rb_entry(n, struct ext4_free_data, efd_node);
    4984           0 :                 mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
    4985           0 :                 n = rb_next(n);
    4986             :         }
    4987             :         return;
    4988             : }
    4989             : 
    4990             : /*
    4991             :  * the function goes through all preallocations in this group and marks them
    4992             :  * used in the in-core bitmap. buddy must be generated from this bitmap
    4993             :  * Needs to be called with the ext4 group lock held
                     :  *
                     :  * Does nothing if ext4_get_group_info() returns NULL for @group.
    4994             :  */
    4995             : static noinline_for_stack
    4996      160468 : void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
    4997             :                                         ext4_group_t group)
    4998             : {
    4999      160468 :         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    5000      160468 :         struct ext4_prealloc_space *pa;
    5001      160468 :         struct list_head *cur;
    5002      160468 :         ext4_group_t groupnr;
    5003      160468 :         ext4_grpblk_t start;
    5004      160468 :         int preallocated = 0;
    5005      160468 :         int len;
    5006             : 
    5007      160468 :         if (!grp)
    5008           0 :                 return;
    5009             : 
    5010             :         /* all forms of preallocation discard first load the group,
    5011             :          * so the only competing code is preallocation use.
    5012             :          * we don't need any locking here
    5013             :          * notice we do NOT ignore preallocations with pa_deleted
    5014             :          * otherwise we could leave used blocks available for
    5015             :          * allocation in buddy when concurrent ext4_mb_put_pa()
    5016             :          * is dropping preallocation
                     :          *
                     :          * pa_lock is held only while pa_pstart and pa_len are sampled;
                     :          * the bitmap is updated after it is released.
    5017             :          */
    5018      160495 :         list_for_each(cur, &grp->bb_prealloc_list) {
    5019          27 :                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
    5020          27 :                 spin_lock(&pa->pa_lock);
    5021          27 :                 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
    5022             :                                              &groupnr, &start);
    5023          27 :                 len = pa->pa_len;
    5024          27 :                 spin_unlock(&pa->pa_lock);
    5025          27 :                 if (unlikely(len == 0))
    5026           0 :                         continue;
    5027          27 :                 BUG_ON(groupnr != group);
    5028          27 :                 mb_set_bits(bitmap, start, len);
    5029          27 :                 preallocated += len;
    5030             :         }
    5031      160468 :         mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
    5032             : }
    5033             : 
    5034      716150 : static void ext4_mb_mark_pa_deleted(struct super_block *sb,
    5035             :                                     struct ext4_prealloc_space *pa)
    5036             : { /* Set pa->pa_deleted; for an inode pa also drop i_prealloc_active. */
    5037      716150 :         struct ext4_inode_info *ei;
    5038             : 
    5039      716150 :         if (pa->pa_deleted) { /* already marked: warn and change nothing */
    5040           0 :                 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
    5041             :                              pa->pa_type, pa->pa_pstart, pa->pa_lstart,
    5042             :                              pa->pa_len);
    5043           0 :                 return;
    5044             :         }
    5045             : 
    5046      716150 :         pa->pa_deleted = 1;
    5047             : 
    5048      716150 :         if (pa->pa_type == MB_INODE_PA) { /* group pas have no per-inode counter to adjust here */
    5049      715214 :                 ei = EXT4_I(pa->pa_inode);
    5050      715214 :                 atomic_dec(&ei->i_prealloc_active);
    5051             :         }
    5052             : }
    5053             : 
    5054     2679110 : static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
    5055             : { /* Return @pa to ext4_pspace_cachep after checking its state. */
    5056     2679110 :         BUG_ON(!pa);
    5057     2679110 :         BUG_ON(atomic_read(&pa->pa_count)); /* no references may remain */
    5058     2679110 :         BUG_ON(pa->pa_deleted == 0); /* must already be marked deleted */
    5059     2679110 :         kmem_cache_free(ext4_pspace_cachep, pa);
    5060     2679069 : }
    5061             : 
    5062         922 : static void ext4_mb_pa_callback(struct rcu_head *head)
    5063             : { /* call_rcu() callback: free the pa that embeds @head. */
    5064         922 :         struct ext4_prealloc_space *pa;
    5065             : 
    5066         922 :         pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); /* @head is pa->u.pa_rcu */
    5067         922 :         ext4_mb_pa_free(pa);
    5068         922 : }
    5069             : 
    5070             : /*
    5071             :  * drops a reference to preallocated space descriptor;
    5072             :  * if this was the last reference and the space is consumed
                     :  * (pa_free == 0) and the pa is not already marked deleted, the pa is
                     :  * marked deleted, removed from its group's prealloc list and from the
                     :  * inode rbtree (inode pa) or locality-group list (group pa), and then
                     :  * freed: directly for an inode pa, via call_rcu() for a group pa.
    5073             :  */
    5074     1657264 : static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
    5075             :                         struct super_block *sb, struct ext4_prealloc_space *pa)
    5076             : {
    5077     1657264 :         ext4_group_t grp;
    5078     1657264 :         ext4_fsblk_t grp_blk;
    5079     1657264 :         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
    5080             : 
    5081             :         /* in this short window concurrent discard can set pa_deleted */
    5082     1657264 :         spin_lock(&pa->pa_lock);
    5083     1657328 :         if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
    5084     1645197 :                 spin_unlock(&pa->pa_lock);
    5085     1645197 :                 return;
    5086             :         }
    5087             : 
    5088       12139 :         if (pa->pa_deleted == 1) { /* already marked deleted; nothing more to do here */
    5089           0 :                 spin_unlock(&pa->pa_lock);
    5090           0 :                 return;
    5091             :         }
    5092             : 
    5093       12139 :         ext4_mb_mark_pa_deleted(sb, pa);
    5094       12139 :         spin_unlock(&pa->pa_lock);
    5095             : 
    5096       12139 :         grp_blk = pa->pa_pstart;
    5097             :         /*
    5098             :          * If doing group-based preallocation, pa_pstart may be in the
    5099             :          * next group when pa is used up
    5100             :          */
    5101       12139 :         if (pa->pa_type == MB_GROUP_PA)
    5102         746 :                 grp_blk--;
    5103             : 
    5104       12139 :         grp = ext4_get_group_number(sb, grp_blk);
    5105             : 
    5106             :         /*
    5107             :          * possible race:
    5108             :          *
    5109             :          *  P1 (buddy init)                     P2 (regular allocation)
    5110             :          *                                      find block B in PA
    5111             :          *  copy on-disk bitmap to buddy
    5112             :          *                                      mark B in on-disk bitmap
    5113             :          *                                      drop PA from group
    5114             :          *  mark all PAs in buddy
    5115             :          *
    5116             :          * thus, P1 initializes buddy with B available. to prevent this
    5117             :          * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
    5118             :          * against that pair
    5119             :          */
    5120       12139 :         ext4_lock_group(sb, grp);
    5121       12139 :         list_del(&pa->pa_group_list);
    5122       12139 :         ext4_unlock_group(sb, grp);
    5123             : 
    5124       12139 :         if (pa->pa_type == MB_INODE_PA) {
    5125       11393 :                 write_lock(pa->pa_node_lock.inode_lock);
    5126       11393 :                 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
    5127       11393 :                 write_unlock(pa->pa_node_lock.inode_lock);
    5128       11393 :                 ext4_mb_pa_free(pa);
    5129             :         } else {
    5130         746 :                 spin_lock(pa->pa_node_lock.lg_lock);
    5131         746 :                 list_del_rcu(&pa->pa_node.lg_list);
    5132         746 :                 spin_unlock(pa->pa_node_lock.lg_lock);
    5133         746 :                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); /* freed later by the RCU callback */
    5134             :         }
    5135             : }
    5136             : 
    5137      715259 : static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
    5138             : { /* Link @new into @root, keyed by the pa_lstart of the pa embedding it. */
    5139      715259 :         struct rb_node **iter = &root->rb_node, *parent = NULL;
    5140      715259 :         struct ext4_prealloc_space *iter_pa, *new_pa;
    5141      715259 :         ext4_lblk_t iter_start, new_start;
    5142             : 
    5143     3270021 :         while (*iter) { /* descend until an empty child slot is reached */
    5144     2554762 :                 iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
    5145             :                                    pa_node.inode_node);
    5146     2554762 :                 new_pa = rb_entry(new, struct ext4_prealloc_space,
    5147             :                                    pa_node.inode_node);
    5148     2554762 :                 iter_start = iter_pa->pa_lstart;
    5149     2554762 :                 new_start = new_pa->pa_lstart;
    5150             : 
    5151     2554762 :                 parent = *iter;
    5152     2554762 :                 if (new_start < iter_start)
    5153      697719 :                         iter = &((*iter)->rb_left);
    5154             :                 else /* an equal pa_lstart goes to the right */
    5155     1857043 :                         iter = &((*iter)->rb_right);
    5156             :         }
    5157             : 
    5158      715259 :         rb_link_node(new, parent, iter);
    5159      715259 :         rb_insert_color(new, root);
    5160      715251 : }
    5161             : 
    5162             : /*
    5163             :  * creates new preallocated space for given inode
    5164             :  */
    5165             : static noinline_for_stack void
    5166      715263 : ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
    5167             : {
    5168      715263 :         struct super_block *sb = ac->ac_sb;
    5169      715263 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    5170      715263 :         struct ext4_prealloc_space *pa;
    5171      715263 :         struct ext4_group_info *grp;
    5172      715263 :         struct ext4_inode_info *ei;
    5173             : 
    5174             :         /* preallocate only when found space is larger then requested */
    5175      715263 :         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
    5176      715263 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    5177      715263 :         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
    5178      715263 :         BUG_ON(ac->ac_pa == NULL);
    5179             : 
    5180      715263 :         pa = ac->ac_pa;
    5181             : 
    5182      715263 :         if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
    5183      244401 :                 int new_bex_start;
    5184      244401 :                 int new_bex_end;
    5185             : 
    5186             :                 /* we can't allocate as much as normalizer wants.
    5187             :                  * so, found space must get proper lstart
    5188             :                  * to cover original request */
    5189      244401 :                 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
    5190      244401 :                 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
    5191             : 
    5192             :                 /*
    5193             :                  * Use the below logic for adjusting best extent as it keeps
    5194             :                  * fragmentation in check while ensuring logical range of best
    5195             :                  * extent doesn't overflow out of goal extent:
    5196             :                  *
    5197             :                  * 1. Check if best ex can be kept at end of goal (before
    5198             :                  *    cr_best_avail trimmed it) and still cover original start
    5199             :                  * 2. Else, check if best ex can be kept at start of goal and
    5200             :                  *    still cover original start
    5201             :                  * 3. Else, keep the best ex at start of original request.
    5202             :                  */
    5203      488802 :                 new_bex_end = ac->ac_g_ex.fe_logical +
    5204      244401 :                         EXT4_C2B(sbi, ac->ac_orig_goal_len);
    5205      244401 :                 new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    5206      244401 :                 if (ac->ac_o_ex.fe_logical >= new_bex_start)
    5207       22419 :                         goto adjust_bex;
    5208             : 
    5209      221982 :                 new_bex_start = ac->ac_g_ex.fe_logical;
    5210      443964 :                 new_bex_end =
    5211      221982 :                         new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    5212      221982 :                 if (ac->ac_o_ex.fe_logical < new_bex_end)
    5213       83619 :                         goto adjust_bex;
    5214             : 
    5215      138363 :                 new_bex_start = ac->ac_o_ex.fe_logical;
    5216      276726 :                 new_bex_end =
    5217      138363 :                         new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    5218             : 
    5219      244401 : adjust_bex:
    5220      244401 :                 ac->ac_b_ex.fe_logical = new_bex_start;
    5221             : 
    5222      244401 :                 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
    5223      244401 :                 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
    5224      244401 :                 BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
    5225             :                                       EXT4_C2B(sbi, ac->ac_orig_goal_len)));
    5226             :         }
    5227             : 
    5228      715263 :         pa->pa_lstart = ac->ac_b_ex.fe_logical;
    5229      715263 :         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    5230      715254 :         pa->pa_len = ac->ac_b_ex.fe_len;
    5231      715254 :         pa->pa_free = pa->pa_len;
    5232      715254 :         spin_lock_init(&pa->pa_lock);
    5233      715254 :         INIT_LIST_HEAD(&pa->pa_group_list);
    5234      715254 :         pa->pa_deleted = 0;
    5235      715254 :         pa->pa_type = MB_INODE_PA;
    5236             : 
    5237      715254 :         mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
    5238             :                  pa->pa_len, pa->pa_lstart);
    5239      715254 :         trace_ext4_mb_new_inode_pa(ac, pa);
    5240             : 
    5241      715245 :         atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
    5242      715255 :         ext4_mb_use_inode_pa(ac, pa);
    5243             : 
    5244      715250 :         ei = EXT4_I(ac->ac_inode);
    5245      715250 :         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
    5246      715254 :         if (!grp)
    5247             :                 return;
    5248             : 
    5249      715254 :         pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
    5250      715254 :         pa->pa_inode = ac->ac_inode;
    5251             : 
    5252      715254 :         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
    5253             : 
    5254      715254 :         write_lock(pa->pa_node_lock.inode_lock);
    5255      715260 :         ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
    5256      715250 :         write_unlock(pa->pa_node_lock.inode_lock);
    5257      715246 :         atomic_inc(&ei->i_prealloc_active);
    5258             : }
    5259             : 
    5260             : /*
    5261             :  * Creates a new locality-group preallocation (MB_GROUP_PA) out of the best
                     :  * found extent ac->ac_b_ex, using the PA object already stored in ac->ac_pa.
                     :  * pa_lstart is set equal to pa_pstart, the whole PA starts out free, and the
                     :  * PA is linked on the block group's bb_prealloc_list.
    5262             :  */
    5263             : static noinline_for_stack void
    5264        1695 : ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
    5265             : {
    5266        1695 :         struct super_block *sb = ac->ac_sb;
    5267        1695 :         struct ext4_locality_group *lg;
    5268        1695 :         struct ext4_prealloc_space *pa;
    5269        1695 :         struct ext4_group_info *grp;
    5270             : 
    5271             :         /* preallocate only when found space is larger than requested */
    5272        1695 :         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
    5273        1695 :         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
    5274        1695 :         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
    5275        1695 :         BUG_ON(ac->ac_pa == NULL);
    5276             : 
    5277        1695 :         pa = ac->ac_pa;
    5278             : 
    5279        1695 :         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
    5280        1695 :         pa->pa_lstart = pa->pa_pstart;
    5281        1695 :         pa->pa_len = ac->ac_b_ex.fe_len;
    5282        1695 :         pa->pa_free = pa->pa_len;
    5283        1695 :         spin_lock_init(&pa->pa_lock);
    5284        1695 :         INIT_LIST_HEAD(&pa->pa_node.lg_list);
    5285        1695 :         INIT_LIST_HEAD(&pa->pa_group_list);
    5286        1695 :         pa->pa_deleted = 0;
    5287        1695 :         pa->pa_type = MB_GROUP_PA;
    5288             : 
    5289        1695 :         mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
    5290             :                  pa->pa_len, pa->pa_lstart);
    5291        1695 :         trace_ext4_mb_new_group_pa(ac, pa);
    5292             : 
    5293        1695 :         ext4_mb_use_group_pa(ac, pa);
    5294        1695 :         atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
    5295             :         /*
                     :          * NOTE(review): if the group info lookup below fails we return with
                     :          * the PA already used and accounted above, but not linked on
                     :          * bb_prealloc_list - confirm this is intended.
                     :          */
    5296        1695 :         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
    5297        1695 :         if (!grp)
    5298             :                 return;
    5299        1695 :         lg = ac->ac_lg;
    5300        1695 :         BUG_ON(lg == NULL);
    5301             : 
    5302        1695 :         pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
    5303        1695 :         pa->pa_inode = NULL;
    5304             : 
    5305        1695 :         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
    5306             : 
    5307             :         /*
    5308             :          * We will later add the new pa to the right bucket
    5309             :          * after updating the pa_free in ext4_mb_release_context
    5310             :          */
    5311             : }
    5312             : /*
                     :  * Creates the preallocation for @ac: a locality-group PA when
                     :  * EXT4_MB_HINT_GROUP_ALLOC is set in ac->ac_flags, an inode PA otherwise.
                     :  */
    5313      716957 : static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
    5314             : {
    5315      716957 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
    5316        1695 :                 ext4_mb_new_group_pa(ac);
    5317             :         else
    5318      715262 :                 ext4_mb_new_inode_pa(ac);
    5319      716948 : }
    5320             : 
    5321             : /*
    5322             :  * Finds all unused blocks of an inode PA in the on-disk bitmap and frees
    5323             :  * them in the in-core bitmap and buddy.
    5324             :  * @pa must be unlinked from inode and group lists, so that
    5325             :  * nobody else can find/use it.
    5326             :  * The caller MUST hold group/inode locks.
                     :  * The number of blocks freed is added to s_mb_discarded; if it differs from
                     :  * pa->pa_free the mismatch is reported as a group error and the value taken
                     :  * from the bitmap is used. Always returns 0.
    5327             :  * TODO: optimize the case when there are no in-core structures yet
    5328             :  */
    5329             : static noinline_for_stack int
    5330      703823 : ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
    5331             :                         struct ext4_prealloc_space *pa)
    5332             : {
    5333      703823 :         struct super_block *sb = e4b->bd_sb;
    5334      703823 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    5335      703823 :         unsigned int end;
    5336      703823 :         unsigned int next;
    5337      703823 :         ext4_group_t group;
    5338      703823 :         ext4_grpblk_t bit;
    5339      703823 :         unsigned long long grp_blk_start;
    5340      703823 :         int free = 0;
    5341             : 
    5342      703823 :         BUG_ON(pa->pa_deleted == 0);
    5343      703823 :         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
    5344      703824 :         grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
    5345      703824 :         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
    5346      703824 :         end = bit + pa->pa_len;
    5347             :         /*
                     :          * Walk the PA's range [bit, end) of the on-disk bitmap: each run of
                     :          * zero bits [bit, next) is freed, then the scan resumes at next + 1.
                     :          */
    5348     1860041 :         while (bit < end) {
    5349     1240333 :                 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
    5350     1240318 :                 if (bit >= end)
    5351             :                         break;
    5352     1156180 :                 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
    5353     1156180 :                 mb_debug(sb, "free preallocated %u/%u in group %u\n",
    5354             :                          (unsigned) ext4_group_first_block_no(sb, group) + bit,
    5355             :                          (unsigned) next - bit, (unsigned) group);
    5356     1156180 :                 free += next - bit;
    5357             : 
    5358     1156180 :                 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
    5359     2312330 :                 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
    5360     1156165 :                                                     EXT4_C2B(sbi, bit)),
    5361             :                                                next - bit);
    5362     1156163 :                 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
    5363     1156217 :                 bit = next + 1;
    5364             :         }
    5365      703846 :         if (free != pa->pa_free) {
    5366           0 :                 ext4_msg(e4b->bd_sb, KERN_CRIT,
    5367             :                          "pa %p: logic %lu, phys. %lu, len %d",
    5368             :                          pa, (unsigned long) pa->pa_lstart,
    5369             :                          (unsigned long) pa->pa_pstart,
    5370             :                          pa->pa_len);
    5371           0 :                 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
    5372             :                                         free, pa->pa_free);
    5373             :                 /*
    5374             :                  * pa is already deleted so we use the value obtained
    5375             :                  * from the bitmap and continue.
    5376             :                  */
    5377             :         }
    5378      703846 :         atomic_add(free, &sbi->s_mb_discarded);
    5379             : 
    5380      703850 :         return 0;
    5381             : }
    5382             : /*
                     :  * Releases a locality-group PA that is already marked deleted: frees
                     :  * pa->pa_len blocks starting at pa->pa_pstart through mb_free_blocks() and
                     :  * adds pa_len to s_mb_discarded. If the PA does not lie in e4b->bd_group
                     :  * (and pa_len is non-zero) only a warning is emitted and nothing is freed.
                     :  * Always returns 0.
                     :  */
    5383             : static noinline_for_stack int
    5384         176 : ext4_mb_release_group_pa(struct ext4_buddy *e4b,
    5385             :                                 struct ext4_prealloc_space *pa)
    5386             : {
    5387         176 :         struct super_block *sb = e4b->bd_sb;
    5388         176 :         ext4_group_t group;
    5389         176 :         ext4_grpblk_t bit;
    5390             : 
    5391         176 :         trace_ext4_mb_release_group_pa(sb, pa);
    5392         176 :         BUG_ON(pa->pa_deleted == 0);
    5393         176 :         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
    5394         176 :         if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
    5395           0 :                 ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
    5396             :                              e4b->bd_group, group, pa->pa_pstart);
    5397           0 :                 return 0;
    5398             :         }
    5399         176 :         mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
    5400         176 :         atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
    5401         176 :         trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
    5402             : 
    5403         176 :         return 0;
    5404             : }
    5405             : 
    5406             : /*
    5407             :  * Releases all unused preallocations in the given group.
                     :  *
                     :  * PAs that are still referenced (pa_count != 0) are skipped and reported by
                     :  * setting *@busy to 1; PAs already marked deleted are skipped as well.
                     :  * Returns the sum of pa_free of the PAs that were released, or 0 when there
                     :  * is no group info, the group has no PAs, or the bitmap/buddy could not be
                     :  * loaded.
    5408             :  *
    5409             :  * first, we need to decide discard policy:
    5410             :  * - when do we discard
    5411             :  *   1) ENOSPC
    5412             :  * - how many do we discard
    5413             :  *   1) how many requested
    5414             :  */
    5415             : static noinline_for_stack int
    5416      670720 : ext4_mb_discard_group_preallocations(struct super_block *sb,
    5417             :                                      ext4_group_t group, int *busy)
    5418             : {
    5419      670720 :         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
    5420      669912 :         struct buffer_head *bitmap_bh = NULL;
    5421      669912 :         struct ext4_prealloc_space *pa, *tmp;
    5422      669912 :         struct list_head list;
    5423      669912 :         struct ext4_buddy e4b;
    5424      669912 :         struct ext4_inode_info *ei;
    5425      669912 :         int err;
    5426      669912 :         int free = 0;
    5427             : 
    5428      669912 :         if (!grp)
    5429             :                 return 0;
    5430      669912 :         mb_debug(sb, "discard preallocation for group %u\n", group);
    5431      669912 :         if (list_empty(&grp->bb_prealloc_list))
    5432      648004 :                 goto out_dbg;
    5433             : 
    5434       21908 :         bitmap_bh = ext4_read_block_bitmap(sb, group);
    5435       21905 :         if (IS_ERR(bitmap_bh)) {
    5436           0 :                 err = PTR_ERR(bitmap_bh);
    5437           0 :                 ext4_error_err(sb, -err,
    5438             :                                "Error %d reading block bitmap for %u",
    5439             :                                err, group);
    5440           0 :                 goto out_dbg;
    5441             :         }
    5442             : 
    5443       21905 :         err = ext4_mb_load_buddy(sb, group, &e4b);
    5444       21907 :         if (err) {
    5445           0 :                 ext4_warning(sb, "Error %d loading buddy information for %u",
    5446             :                              err, group);
    5447           0 :                 put_bh(bitmap_bh);
    5448           0 :                 goto out_dbg;
    5449             :         }
    5450             : 
    5451       21907 :         INIT_LIST_HEAD(&list);
    5452       21907 :         ext4_lock_group(sb, group);
    5453      257607 :         list_for_each_entry_safe(pa, tmp,
    5454             :                                 &grp->bb_prealloc_list, pa_group_list) {
    5455      235681 :                 spin_lock(&pa->pa_lock);
    5456      235680 :                 if (atomic_read(&pa->pa_count)) {
    5457        5263 :                         spin_unlock(&pa->pa_lock);
    5458        5264 :                         *busy = 1;
    5459        5264 :                         continue;
    5460             :                 }
    5461      230417 :                 if (pa->pa_deleted) {
    5462           0 :                         spin_unlock(&pa->pa_lock);
    5463           0 :                         continue;
    5464             :                 }
    5465             : 
    5466             :                 /* seems this one can be freed ... */
    5467      230417 :                 ext4_mb_mark_pa_deleted(sb, pa);
    5468             : 
    5469      230416 :                 if (!free)
    5470       21214 :                         this_cpu_inc(discard_pa_seq);
    5471             : 
    5472             :                 /* we can trust pa_free ... */
    5473      230416 :                 free += pa->pa_free;
    5474             : 
    5475      230416 :                 spin_unlock(&pa->pa_lock);
    5476             : 
    5477      230417 :                 list_del(&pa->pa_group_list);
    5478      230416 :                 list_add(&pa->u.pa_tmp_list, &list);
    5479             :         }
    5480             : 
    5481             :         /* now free all selected PAs */
    5482      252331 :         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
    5483             : 
    5484             :                 /* remove from object (inode or locality group) */
    5485      230403 :                 if (pa->pa_type == MB_GROUP_PA) {
    5486          28 :                         spin_lock(pa->pa_node_lock.lg_lock);
    5487          28 :                         list_del_rcu(&pa->pa_node.lg_list);
    5488          28 :                         spin_unlock(pa->pa_node_lock.lg_lock);
    5489             :                 } else {
    5490      230375 :                         write_lock(pa->pa_node_lock.inode_lock);
    5491      230382 :                         ei = EXT4_I(pa->pa_inode);
    5492      230382 :                         rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
    5493      230374 :                         write_unlock(pa->pa_node_lock.inode_lock);
    5494             :                 }
    5495             : 
    5496      230401 :                 list_del(&pa->u.pa_tmp_list);
    5497             : 
    5498      230400 :                 if (pa->pa_type == MB_GROUP_PA) {
    5499          28 :                         ext4_mb_release_group_pa(&e4b, pa);
    5500          28 :                         call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    5501             :                 } else {
    5502      230372 :                         ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
    5503      230382 :                         ext4_mb_pa_free(pa);
    5504             :                 }
    5505             :         }
    5506             : 
    5507       21928 :         ext4_unlock_group(sb, group);
    5508       21928 :         ext4_mb_unload_buddy(&e4b);
    5509       21926 :         put_bh(bitmap_bh);
    5510             : out_dbg:
    5511             :         mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
    5512             :                  free, group, grp->bb_free);
    5513             :         return free;
    5514             : }
    5515             : 
    5516             : /*
    5517             :  * Releases the unused preallocated blocks of the given inode.
                     :  *
                     :  * @needed is the maximum number of PAs to discard; 0 means all of them.
                     :  * Nothing is done for non-regular files or while EXT4_FC_REPLAY is set in
                     :  * s_mount_state. If a PA is found in use or being deleted by someone else,
                     :  * the function sleeps for HZ and rescans the inode's PA tree.
    5518             :  *
    5519             :  * It's important to discard preallocations under i_data_sem
    5520             :  * We don't want another block to be served from the prealloc
    5521             :  * space when we are discarding the inode prealloc space.
    5522             :  *
    5523             :  * FIXME!! Make sure it is valid at all the call sites
    5524             :  */
    5525     9743441 : void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
    5526             : {
    5527     9743441 :         struct ext4_inode_info *ei = EXT4_I(inode);
    5528     9743441 :         struct super_block *sb = inode->i_sb;
    5529     9743441 :         struct buffer_head *bitmap_bh = NULL;
    5530     9743441 :         struct ext4_prealloc_space *pa, *tmp;
    5531     9743441 :         ext4_group_t group = 0;
    5532     9743441 :         struct list_head list;
    5533     9743441 :         struct ext4_buddy e4b;
    5534     9743441 :         struct rb_node *iter;
    5535     9743441 :         int err;
    5536             : 
    5537     9743441 :         if (!S_ISREG(inode->i_mode)) {
    5538      986938 :                 return;
    5539             :         }
    5540             : 
    5541     8756599 :         if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
    5542             :                 return;
    5543             : 
    5544     8756503 :         mb_debug(sb, "discard preallocation for inode %lu\n",
    5545             :                  inode->i_ino);
    5546     8756503 :         trace_ext4_discard_preallocations(inode,
    5547             :                         atomic_read(&ei->i_prealloc_active), needed);
    5548             : 
    5549     8754345 :         INIT_LIST_HEAD(&list);
    5550             : 
    5551     8754345 :         if (needed == 0)
    5552     8754345 :                 needed = UINT_MAX;
    5553             : 
    5554           0 : repeat:
    5555             :         /*
                     :          * first, collect all pa's in the inode
                     :          *
                     :          * NOTE(review): the loop advances with rb_next(iter) after the node
                     :          * was rb_erase()d in the body - confirm this traversal is safe.
                     :          */
    5556     8754345 :         write_lock(&ei->i_prealloc_lock);
    5557     9230370 :         for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
    5558      473444 :              iter = rb_next(iter)) {
    5559      473452 :                 pa = rb_entry(iter, struct ext4_prealloc_space,
    5560             :                               pa_node.inode_node);
    5561      473452 :                 BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
    5562             : 
    5563      473452 :                 spin_lock(&pa->pa_lock);
    5564      473456 :                 if (atomic_read(&pa->pa_count)) {
    5565             :                         /* this shouldn't happen often - nobody should
    5566             :                          * use preallocation while we're discarding it */
    5567           0 :                         spin_unlock(&pa->pa_lock);
    5568           0 :                         write_unlock(&ei->i_prealloc_lock);
    5569           0 :                         ext4_msg(sb, KERN_ERR,
    5570             :                                  "uh-oh! used pa while discarding");
    5571           0 :                         WARN_ON(1);
    5572           0 :                         schedule_timeout_uninterruptible(HZ);
    5573           0 :                         goto repeat;
    5574             : 
    5575             :                 }
    5576      473456 :                 if (pa->pa_deleted == 0) {
    5577      473456 :                         ext4_mb_mark_pa_deleted(sb, pa);
    5578      473471 :                         spin_unlock(&pa->pa_lock);
    5579      473466 :                         rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
    5580      473457 :                         list_add(&pa->u.pa_tmp_list, &list);
    5581      473444 :                         needed--;
    5582      473444 :                         continue;
    5583             :                 }
    5584             : 
    5585             :                 /* someone is deleting pa right now */
    5586           0 :                 spin_unlock(&pa->pa_lock);
    5587           0 :                 write_unlock(&ei->i_prealloc_lock);
    5588             : 
    5589             :                 /* we have to wait here because pa_deleted
    5590             :                  * doesn't mean pa is already unlinked from
    5591             :                  * the list. as we might be called from
    5592             :                  * ->clear_inode() the inode will get freed
    5593             :                  * and concurrent thread which is unlinking
    5594             :                  * pa from inode's list may access already
    5595             :                  * freed memory, bad-bad-bad */
    5596             : 
    5597             :                 /* XXX: if this happens too often, we can
    5598             :                  * add a flag to force wait only in case
    5599             :                  * of ->clear_inode(), but not in case of
    5600             :                  * regular truncate */
    5601           0 :                 schedule_timeout_uninterruptible(HZ);
    5602           0 :                 goto repeat;
    5603             :         }
    5604     8756403 :         write_unlock(&ei->i_prealloc_lock);
    5605             : 
    5606     9229011 :         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
    5607      473409 :                 BUG_ON(pa->pa_type != MB_INODE_PA);
    5608      473409 :                 group = ext4_get_group_number(sb, pa->pa_pstart);
    5609             : 
    5610      473374 :                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
    5611             :                                              GFP_NOFS|__GFP_NOFAIL);
    5612      473431 :                 if (err) {
    5613           0 :                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
    5614             :                                        err, group);
    5615           0 :                         continue;
    5616             :                 }
    5617             : 
    5618      473431 :                 bitmap_bh = ext4_read_block_bitmap(sb, group);
    5619      473414 :                 if (IS_ERR(bitmap_bh)) {
    5620           0 :                         err = PTR_ERR(bitmap_bh);
    5621           0 :                         ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
    5622             :                                        err, group);
    5623           0 :                         ext4_mb_unload_buddy(&e4b);
    5624           0 :                         continue;
    5625             :                 }
    5626             : 
    5627      473414 :                 ext4_lock_group(sb, group);
    5628      473461 :                 list_del(&pa->pa_group_list);
    5629      473459 :                 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
    5630      473468 :                 ext4_unlock_group(sb, group);
    5631             : 
    5632      473481 :                 ext4_mb_unload_buddy(&e4b);
    5633      473472 :                 put_bh(bitmap_bh);
    5634             : 
    5635      473481 :                 list_del(&pa->u.pa_tmp_list);
    5636      473477 :                 ext4_mb_pa_free(pa);
    5637             :         }
    5638             : }
    5639             : /*
                     :  * Allocates a zeroed PA from ext4_pspace_cachep (GFP_NOFS), sets its
                     :  * pa_count to 1 and stores it in ac->ac_pa.
                     :  * Returns 0 on success, -ENOMEM if the allocation fails.
                     :  */
    5640     2679247 : static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
    5641             : {
    5642     2679247 :         struct ext4_prealloc_space *pa;
    5643             : 
    5644     2679247 :         BUG_ON(ext4_pspace_cachep == NULL);
    5645     2679247 :         pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
    5646     2679572 :         if (!pa)
    5647             :                 return -ENOMEM;
    5648     2679572 :         atomic_set(&pa->pa_count, 1);
    5649     2679572 :         ac->ac_pa = pa;
    5650     2679572 :         return 0;
    5651             : }
    5652             : /*
                     :  * Detaches the PA from @ac (ac->ac_pa becomes NULL), drops its reference,
                     :  * marks it deleted and frees it. Warns if the dropped reference was not
                     :  * the last one.
                     :  */
    5653     1962887 : static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
    5654             : {
    5655     1962887 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    5656             : 
    5657     1962887 :         BUG_ON(!pa);
    5658     1962887 :         ac->ac_pa = NULL;
    5659     1962887 :         WARN_ON(!atomic_dec_and_test(&pa->pa_count));
    5660             :         /*
    5661             :          * current function is only called due to an error or due to
    5662             :          * len of found blocks < len of requested blocks hence the PA has not
    5663             :          * been added to grp->bb_prealloc_list. So we don't need to lock it
    5664             :          */
    5665     1962991 :         pa->pa_deleted = 1;
    5666     1962991 :         ext4_mb_pa_free(pa);
    5667     1962904 : }
    5668             : 
    5669             : #ifdef CONFIG_EXT4_DEBUG
    5670      432227 : static inline void ext4_mb_show_pa(struct super_block *sb)
    5671             : {
    5672      432227 :         ext4_group_t i, ngroups;
    5673             :         /* debug dump of all group PAs; skipped once the fs is flagged aborted */
    5674      432227 :         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
    5675             :                 return;
    5676             :         /* per group: log every PA as group:start:len, then bb_free/bb_fragments */
    5677      432227 :         ngroups = ext4_get_groups_count(sb);
    5678      432227 :         mb_debug(sb, "groups: ");
    5679    48263224 :         for (i = 0; i < ngroups; i++) {
    5680    47398671 :                 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
    5681    46030990 :                 struct ext4_prealloc_space *pa;
    5682    46030990 :                 ext4_grpblk_t start;
    5683    46030990 :                 struct list_head *cur;
    5684             : 
    5685    46030990 :                 if (!grp)
    5686           0 :                         continue;
    5687    46030990 :                 ext4_lock_group(sb, i);
    5688   104038480 :                 list_for_each(cur, &grp->bb_prealloc_list) {
    5689    58462198 :                         pa = list_entry(cur, struct ext4_prealloc_space,
    5690             :                                         pa_group_list);
    5691    58462198 :                         spin_lock(&pa->pa_lock);
    5692    58498412 :                         ext4_get_group_no_and_offset(sb, pa->pa_pstart,
    5693             :                                                      NULL, &start);
    5694    57574834 :                         spin_unlock(&pa->pa_lock);
    5695    58745072 :                         mb_debug(sb, "PA:%u:%d:%d\n", i, start,
    5696             :                                  pa->pa_len);
    5697             :                 }
    5698    45576282 :                 ext4_unlock_group(sb, i);
    5699    47398781 :                 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
    5700             :                          grp->bb_fragments);
    5701             :         }
    5702             : }
    5703             : /*
                     :  * Debug dump of an allocation context that could not be satisfied: logs
                     :  * its status, flags, the original/goal/best extents, the number of extents
                     :  * found and whether a PA is attached, then dumps the PAs of every group.
                     :  * Nothing is logged once the filesystem is flagged aborted.
                     :  */
    5704         234 : static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
    5705             : {
    5706         234 :         struct super_block *sb = ac->ac_sb;
    5707             : 
    5708         234 :         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
    5709             :                 return;
    5710             : 
    5711         234 :         mb_debug(sb, "Can't allocate:"
    5712             :                         " Allocation context details:");
    5713         234 :         mb_debug(sb, "status %u flags 0x%x",
    5714             :                         ac->ac_status, ac->ac_flags);
    5715         234 :         mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
    5716             :                         "goal %lu/%lu/%lu@%lu, "
    5717             :                         "best %lu/%lu/%lu@%lu cr %d",
    5718             :                         (unsigned long)ac->ac_o_ex.fe_group,
    5719             :                         (unsigned long)ac->ac_o_ex.fe_start,
    5720             :                         (unsigned long)ac->ac_o_ex.fe_len,
    5721             :                         (unsigned long)ac->ac_o_ex.fe_logical,
    5722             :                         (unsigned long)ac->ac_g_ex.fe_group,
    5723             :                         (unsigned long)ac->ac_g_ex.fe_start,
    5724             :                         (unsigned long)ac->ac_g_ex.fe_len,
    5725             :                         (unsigned long)ac->ac_g_ex.fe_logical,
    5726             :                         (unsigned long)ac->ac_b_ex.fe_group,
    5727             :                         (unsigned long)ac->ac_b_ex.fe_start,
    5728             :                         (unsigned long)ac->ac_b_ex.fe_len,
    5729             :                         (unsigned long)ac->ac_b_ex.fe_logical,
    5730             :                         (int)ac->ac_criteria);
    5731         234 :         mb_debug(sb, "%u found", ac->ac_found);
    5732         234 :         mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
    5733         234 :         if (ac->ac_pa)
    5734             :                 mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
    5735             :                          "group pa" : "inode pa");
    5736         234 :         ext4_mb_show_pa(sb);
    5737             : }
    5738             : #else /* !CONFIG_EXT4_DEBUG: the dump helpers compile to no-ops */
    5739             : static inline void ext4_mb_show_pa(struct super_block *sb)
    5740             : {
    5741             :         return;
    5742             : }
    5743             : static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
    5744             : {
    5745             :         ext4_mb_show_pa(ac->ac_sb);
    5746             :         return;
    5747             : }
    5748             : #endif /* CONFIG_EXT4_DEBUG */
    5749             : 
    5750             : /*
    5751             :  * We use locality group preallocation for small files. The size of the
    5752             :  * file is determined by the current size or the resulting size after
    5753             :  * allocation, whichever is larger.
    5754             :  *
    5755             :  * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
    5756             :  */
    5757     3617994 : static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
    5758             : {
    5759     3617994 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    5760     3617994 :         int bsbits = ac->ac_sb->s_blocksize_bits;
    5761     3617994 :         loff_t size, isize;
    5762     3617994 :         bool inode_pa_eligible, group_pa_eligible;
    5763             : 
    5764     3617994 :         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
    5765             :                 return;
    5766             : 
    5767     3086768 :         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
    5768             :                 return;
    5769             : 
    5770     3086768 :         group_pa_eligible = sbi->s_mb_group_prealloc > 0;
    5771     3086768 :         inode_pa_eligible = true;
    5772     3086768 :         size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
    5773     3086768 :         isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
    5774           0 :                 >> bsbits;
    5775             : 
    5776             :         /* No point in using inode preallocation for closed files */
    5777     3086768 :         if ((size == isize) && !ext4_fs_is_busy(sbi) &&
    5778      449975 :             !inode_is_open_for_write(ac->ac_inode))
    5779      326054 :                 inode_pa_eligible = false;
    5780             : 
    5781     3086768 :         size = max(size, isize);
    5782             :         /* Don't use group allocation for large files */
    5783     3086768 :         if (size > sbi->s_mb_stream_request)
    5784             :                 group_pa_eligible = false;
    5785             : 
    5786      295858 :         if (!group_pa_eligible) {
    5787     2790910 :                 if (inode_pa_eligible)
    5788     2707881 :                         ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
    5789             :                 else
    5790       83029 :                         ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
    5791     2790910 :                 return;
    5792             :         }
    5793             : 
    5794      295858 :         BUG_ON(ac->ac_lg != NULL);
    5795             :         /*
    5796             :          * locality group prealloc space are per cpu. The reason for having
    5797             :          * per cpu locality group is to reduce the contention between block
    5798             :          * request from multiple CPUs.
    5799             :          */
    5800      295858 :         ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
    5801             : 
    5802             :         /* we're going to use group allocation */
    5803      295858 :         ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
    5804             : 
    5805             :         /* serialize all allocations in the group */
    5806      295858 :         mutex_lock(&ac->ac_lg->lg_mutex);
    5807             : }
    5808             : 
    5809             : static noinline_for_stack void
    5810     3619011 : ext4_mb_initialize_context(struct ext4_allocation_context *ac,
    5811             :                                 struct ext4_allocation_request *ar)
    5812             : {
    5813     3619011 :         struct super_block *sb = ar->inode->i_sb;
    5814     3619011 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    5815     3619011 :         struct ext4_super_block *es = sbi->s_es;
    5816     3619011 :         ext4_group_t group;
    5817     3619011 :         unsigned int len;
    5818     3619011 :         ext4_fsblk_t goal;
    5819     3619011 :         ext4_grpblk_t block;
    5820             : 
    5821             :         /* we can't allocate > group size */
    5822     3619011 :         len = ar->len;
    5823             : 
    5824             :         /* just a dirty hack to filter too big requests  */
    5825     3619011 :         if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
    5826           1 :                 len = EXT4_CLUSTERS_PER_GROUP(sb);
    5827             : 
    5828             :         /* start searching from the goal */
    5829     3619011 :         goal = ar->goal;
    5830     7237951 :         if (goal < le32_to_cpu(es->s_first_data_block) ||
    5831             :                         goal >= ext4_blocks_count(es))
    5832             :                 goal = le32_to_cpu(es->s_first_data_block);
    5833     3619011 :         ext4_get_group_no_and_offset(sb, goal, &group, &block);
    5834             : 
    5835             :         /* set up allocation goals */
    5836     3617690 :         ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
    5837     3617690 :         ac->ac_status = AC_STATUS_CONTINUE;
    5838     3617690 :         ac->ac_sb = sb;
    5839     3617690 :         ac->ac_inode = ar->inode;
    5840     3617690 :         ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
    5841     3617690 :         ac->ac_o_ex.fe_group = group;
    5842     3617690 :         ac->ac_o_ex.fe_start = block;
    5843     3617690 :         ac->ac_o_ex.fe_len = len;
    5844     3617690 :         ac->ac_g_ex = ac->ac_o_ex;
    5845     3617690 :         ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
    5846     3617690 :         ac->ac_flags = ar->flags;
    5847             : 
    5848             :         /* we have to define context: we'll work with a file or
    5849             :          * locality group. this is a policy, actually */
    5850     3617690 :         ext4_mb_group_or_file(ac);
    5851             : 
    5852     3617949 :         mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
    5853             :                         "left: %u/%u, right %u/%u to %swritable\n",
    5854             :                         (unsigned) ar->len, (unsigned) ar->logical,
    5855             :                         (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
    5856             :                         (unsigned) ar->lleft, (unsigned) ar->pleft,
    5857             :                         (unsigned) ar->lright, (unsigned) ar->pright,
    5858             :                         inode_is_open_for_write(ar->inode) ? "" : "non-");
    5859     3617949 : }
    5860             : 
    5861             : static noinline_for_stack void
    5862          37 : ext4_mb_discard_lg_preallocations(struct super_block *sb,
    5863             :                                         struct ext4_locality_group *lg,
    5864             :                                         int order, int total_entries)
    5865             : {
    5866          37 :         ext4_group_t group = 0;
    5867          37 :         struct ext4_buddy e4b;
    5868          37 :         struct list_head discard_list;
    5869          37 :         struct ext4_prealloc_space *pa, *tmp;
    5870             : 
    5871          37 :         mb_debug(sb, "discard locality group preallocation\n");
    5872             : 
    5873          37 :         INIT_LIST_HEAD(&discard_list);
    5874             : 
    5875          37 :         spin_lock(&lg->lg_prealloc_lock);
    5876         148 :         list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
    5877             :                                 pa_node.lg_list,
    5878             :                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
    5879         148 :                 spin_lock(&pa->pa_lock);
    5880         148 :                 if (atomic_read(&pa->pa_count)) {
    5881             :                         /*
    5882             :                          * This is the pa that we just used
    5883             :                          * for block allocation. So don't
    5884             :                          * free that
    5885             :                          */
    5886           0 :                         spin_unlock(&pa->pa_lock);
    5887           0 :                         continue;
    5888             :                 }
    5889         148 :                 if (pa->pa_deleted) {
    5890           0 :                         spin_unlock(&pa->pa_lock);
    5891           0 :                         continue;
    5892             :                 }
    5893             :                 /* only lg prealloc space */
    5894         148 :                 BUG_ON(pa->pa_type != MB_GROUP_PA);
    5895             : 
    5896             :                 /* seems this one can be freed ... */
    5897         148 :                 ext4_mb_mark_pa_deleted(sb, pa);
    5898         148 :                 spin_unlock(&pa->pa_lock);
    5899             : 
    5900         148 :                 list_del_rcu(&pa->pa_node.lg_list);
    5901         148 :                 list_add(&pa->u.pa_tmp_list, &discard_list);
    5902             : 
    5903         148 :                 total_entries--;
    5904         148 :                 if (total_entries <= 5) {
    5905             :                         /*
    5906             :                          * we want to keep only 5 entries
    5907             :                          * allowing it to grow to 8. This
    5908             :                          * mak sure we don't call discard
    5909             :                          * soon for this list.
    5910             :                          */
    5911             :                         break;
    5912             :                 }
    5913             :         }
    5914          37 :         spin_unlock(&lg->lg_prealloc_lock);
    5915             : 
    5916         185 :         list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
    5917         148 :                 int err;
    5918             : 
    5919         148 :                 group = ext4_get_group_number(sb, pa->pa_pstart);
    5920         148 :                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
    5921             :                                              GFP_NOFS|__GFP_NOFAIL);
    5922         148 :                 if (err) {
    5923           0 :                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
    5924             :                                        err, group);
    5925           0 :                         continue;
    5926             :                 }
    5927         148 :                 ext4_lock_group(sb, group);
    5928         148 :                 list_del(&pa->pa_group_list);
    5929         148 :                 ext4_mb_release_group_pa(&e4b, pa);
    5930         148 :                 ext4_unlock_group(sb, group);
    5931             : 
    5932         148 :                 ext4_mb_unload_buddy(&e4b);
    5933         148 :                 list_del(&pa->u.pa_tmp_list);
    5934         148 :                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
    5935             :         }
    5936          37 : }
    5937             : 
    5938             : /*
    5939             :  * We have incremented pa_count. So it cannot be freed at this
    5940             :  * point. Also we hold lg_mutex. So no parallel allocation is
    5941             :  * possible from this lg. That means pa_free cannot be updated.
    5942             :  *
    5943             :  * A parallel ext4_mb_discard_group_preallocations is possible.
    5944             :  * which can cause the lg_prealloc_list to be updated.
    5945             :  */
    5946             : 
    5947      265755 : static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
    5948             : {
    5949      265755 :         int order, added = 0, lg_prealloc_count = 1;
    5950      265755 :         struct super_block *sb = ac->ac_sb;
    5951      265755 :         struct ext4_locality_group *lg = ac->ac_lg;
    5952      265755 :         struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
    5953             : 
    5954      265755 :         order = fls(pa->pa_free) - 1;
    5955      265755 :         if (order > PREALLOC_TB_SIZE - 1)
    5956             :                 /* The max size of hash table is PREALLOC_TB_SIZE */
    5957             :                 order = PREALLOC_TB_SIZE - 1;
    5958             :         /* Add the prealloc space to lg */
    5959      265755 :         spin_lock(&lg->lg_prealloc_lock);
    5960      266904 :         list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
    5961             :                                 pa_node.lg_list,
    5962             :                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
    5963        1149 :                 spin_lock(&tmp_pa->pa_lock);
    5964        1149 :                 if (tmp_pa->pa_deleted) {
    5965           0 :                         spin_unlock(&tmp_pa->pa_lock);
    5966           0 :                         continue;
    5967             :                 }
    5968        1149 :                 if (!added && pa->pa_free < tmp_pa->pa_free) {
    5969             :                         /* Add to the tail of the previous entry */
    5970          12 :                         list_add_tail_rcu(&pa->pa_node.lg_list,
    5971             :                                                 &tmp_pa->pa_node.lg_list);
    5972          12 :                         added = 1;
    5973             :                         /*
    5974             :                          * we want to count the total
    5975             :                          * number of entries in the list
    5976             :                          */
    5977             :                 }
    5978        1149 :                 spin_unlock(&tmp_pa->pa_lock);
    5979        1149 :                 lg_prealloc_count++;
    5980             :         }
    5981      265755 :         if (!added)
    5982      265743 :                 list_add_tail_rcu(&pa->pa_node.lg_list,
    5983      265743 :                                         &lg->lg_prealloc_list[order]);
    5984      265755 :         spin_unlock(&lg->lg_prealloc_lock);
    5985             : 
    5986             :         /* Now trim the list to be not more than 8 elements */
    5987      265755 :         if (lg_prealloc_count > 8) {
    5988          37 :                 ext4_mb_discard_lg_preallocations(sb, lg,
    5989             :                                                   order, lg_prealloc_count);
    5990          37 :                 return;
    5991             :         }
    5992             :         return ;
    5993             : }
    5994             : 
    5995             : /*
    5996             :  * release all resource we used in allocation
    5997             :  */
    5998     3620230 : static int ext4_mb_release_context(struct ext4_allocation_context *ac)
    5999             : {
    6000     3620230 :         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    6001     3620230 :         struct ext4_prealloc_space *pa = ac->ac_pa;
    6002     3620230 :         if (pa) {
    6003     1657292 :                 if (pa->pa_type == MB_GROUP_PA) {
    6004             :                         /* see comment in ext4_mb_use_group_pa() */
    6005      266501 :                         spin_lock(&pa->pa_lock);
    6006      266501 :                         pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    6007      266501 :                         pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
    6008      266501 :                         pa->pa_free -= ac->ac_b_ex.fe_len;
    6009      266501 :                         pa->pa_len -= ac->ac_b_ex.fe_len;
    6010      266501 :                         spin_unlock(&pa->pa_lock);
    6011             : 
    6012             :                         /*
    6013             :                          * We want to add the pa to the right bucket.
    6014             :                          * Remove it from the list and while adding
    6015             :                          * make sure the list to which we are adding
    6016             :                          * doesn't grow big.
    6017             :                          */
    6018      266501 :                         if (likely(pa->pa_free)) {
    6019      265755 :                                 spin_lock(pa->pa_node_lock.lg_lock);
    6020      265755 :                                 list_del_rcu(&pa->pa_node.lg_list);
    6021      265755 :                                 spin_unlock(pa->pa_node_lock.lg_lock);
    6022      265755 :                                 ext4_mb_add_n_trim(ac);
    6023             :                         }
    6024             :                 }
    6025             : 
    6026     1657292 :                 ext4_mb_put_pa(ac, ac->ac_sb, pa);
    6027             :         }
    6028     3620163 :         if (ac->ac_bitmap_page)
    6029     2679567 :                 put_page(ac->ac_bitmap_page);
    6030     3620187 :         if (ac->ac_buddy_page)
    6031     2679596 :                 put_page(ac->ac_buddy_page);
    6032     3620200 :         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
    6033      295889 :                 mutex_unlock(&ac->ac_lg->lg_mutex);
    6034     3620200 :         ext4_mb_collect_stats(ac);
    6035     3619732 :         return 0;
    6036             : }
    6037             : 
    6038        5941 : static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
    6039             : {
    6040        5941 :         ext4_group_t i, ngroups = ext4_get_groups_count(sb);
    6041        5939 :         int ret;
    6042        5939 :         int freed = 0, busy = 0;
    6043        5939 :         int retry = 0;
    6044             : 
    6045        5939 :         trace_ext4_mb_discard_preallocations(sb, needed);
    6046             : 
    6047        5941 :         if (needed == 0)
    6048           0 :                 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
    6049        5941 :  repeat:
    6050      675465 :         for (i = 0; i < ngroups && needed > 0; i++) {
    6051      669506 :                 ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
    6052      669447 :                 freed += ret;
    6053      669447 :                 needed -= ret;
    6054      669447 :                 cond_resched();
    6055             :         }
    6056             : 
    6057        5959 :         if (needed > 0 && busy && ++retry < 3) {
    6058           1 :                 busy = 0;
    6059           1 :                 goto repeat;
    6060             :         }
    6061             : 
    6062        5958 :         return freed;
    6063             : }
    6064             : 
    6065        5942 : static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
    6066             :                         struct ext4_allocation_context *ac, u64 *seq)
    6067             : {
    6068        5942 :         int freed;
    6069        5942 :         u64 seq_retry = 0;
    6070        5942 :         bool ret = false;
    6071             : 
    6072        5942 :         freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
    6073        5958 :         if (freed) {
    6074        5512 :                 ret = true;
    6075        5512 :                 goto out_dbg;
    6076             :         }
    6077         446 :         seq_retry = ext4_get_discard_pa_seq_sum();
    6078         445 :         if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
    6079         240 :                 ac->ac_flags |= EXT4_MB_STRICT_CHECK;
    6080         240 :                 *seq = seq_retry;
    6081         240 :                 ret = true;
    6082             :         }
    6083             : 
    6084         205 : out_dbg:
    6085        5957 :         mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
    6086        5957 :         return ret;
    6087             : }
    6088             : 
    6089             : /*
    6090             :  * Simple allocator for Ext4 fast commit replay path. It searches for blocks
    6091             :  * linearly starting at the goal block and also excludes the blocks which
    6092             :  * are going to be in use after fast commit replay.
    6093             :  */
    6094             : static ext4_fsblk_t
    6095           0 : ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
    6096             : {
    6097           0 :         struct buffer_head *bitmap_bh;
    6098           0 :         struct super_block *sb = ar->inode->i_sb;
    6099           0 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    6100           0 :         ext4_group_t group, nr;
    6101           0 :         ext4_grpblk_t blkoff;
    6102           0 :         ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
    6103           0 :         ext4_grpblk_t i = 0;
    6104           0 :         ext4_fsblk_t goal, block;
    6105           0 :         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
    6106             : 
    6107           0 :         goal = ar->goal;
    6108           0 :         if (goal < le32_to_cpu(es->s_first_data_block) ||
    6109             :                         goal >= ext4_blocks_count(es))
    6110             :                 goal = le32_to_cpu(es->s_first_data_block);
    6111             : 
    6112           0 :         ar->len = 0;
    6113           0 :         ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
    6114           0 :         for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
    6115           0 :                 bitmap_bh = ext4_read_block_bitmap(sb, group);
    6116           0 :                 if (IS_ERR(bitmap_bh)) {
    6117           0 :                         *errp = PTR_ERR(bitmap_bh);
    6118           0 :                         pr_warn("Failed to read block bitmap\n");
    6119           0 :                         return 0;
    6120             :                 }
    6121             : 
    6122           0 :                 while (1) {
    6123           0 :                         i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
    6124             :                                                 blkoff);
    6125           0 :                         if (i >= max)
    6126             :                                 break;
    6127           0 :                         if (ext4_fc_replay_check_excluded(sb,
    6128           0 :                                 ext4_group_first_block_no(sb, group) +
    6129           0 :                                 EXT4_C2B(sbi, i))) {
    6130           0 :                                 blkoff = i + 1;
    6131             :                         } else
    6132             :                                 break;
    6133             :                 }
    6134           0 :                 brelse(bitmap_bh);
    6135           0 :                 if (i < max)
    6136             :                         break;
    6137             : 
    6138           0 :                 if (++group >= ext4_get_groups_count(sb))
    6139           0 :                         group = 0;
    6140             : 
    6141           0 :                 blkoff = 0;
    6142             :         }
    6143             : 
    6144           0 :         if (i >= max) {
    6145           0 :                 *errp = -ENOSPC;
    6146           0 :                 return 0;
    6147             :         }
    6148             : 
    6149           0 :         block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
    6150           0 :         ext4_mb_mark_bb(sb, block, 1, 1);
    6151           0 :         ar->len = 1;
    6152             : 
    6153           0 :         return block;
    6154             : }
    6155             : 
/*
 * Main entry point into mballoc to allocate blocks.
 * It tries to use preallocation first, then falls back
 * to the regular allocator.
 *
 * @handle: passed through to ext4_mb_mark_diskspace_used()
 * @ar:     allocation request. ->len is shrunk while claiming free
 *          clusters and quota and, on return, holds the length actually
 *          allocated (0 on failure). ->flags may gain
 *          EXT4_MB_USE_ROOT_BLOCKS and EXT4_MB_HINT_NOPREALLOC.
 * @errp:   receives the error code; values assigned in this function are
 *          -ENOSPC, -EDQUOT and -ENOMEM, plus whatever the helpers
 *          ext4_mb_pa_alloc(), ext4_mb_regular_allocator() and
 *          ext4_mb_mark_diskspace_used() return.
 *
 * Returns the first allocated block, or 0 when nothing was allocated.
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                                struct ext4_allocation_request *ar, int *errp)
{
        struct ext4_allocation_context *ac = NULL;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block = 0;
        /* length for which quota was charged; 0 means nothing to refund */
        unsigned int inquota = 0;
        /* length claimed from the free-cluster accounting in this call */
        unsigned int reserv_clstrs = 0;
        int retries = 0;
        u64 seq;

        might_sleep();
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);

        trace_ext4_request_blocks(ar);
        /* During fast commit replay the simple linear allocator is used */
        if (sbi->s_mount_state & EXT4_FC_REPLAY)
                return ext4_mb_new_blocks_simple(ar, errp);

        /* Allow to use superuser reservation for quota file */
        if (ext4_is_quota_file(ar->inode))
                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;

        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
                /* Without delayed allocation we need to verify
                 * there is enough free blocks to do block allocation
                 * and verify allocation doesn't exceed the quota limits.
                 */
                /* Halve the request until the claim succeeds or hits 0 */
                while (ar->len &&
                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

                        /* let others to free the space */
                        cond_resched();
                        ar->len = ar->len >> 1;
                }
                if (!ar->len) {
                        ext4_mb_show_pa(sb);
                        *errp = -ENOSPC;
                        return 0;
                }
                reserv_clstrs = ar->len;
                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
                        dquot_alloc_block_nofail(ar->inode,
                                                 EXT4_C2B(sbi, ar->len));
                } else {
                        /*
                         * Shrink the request one unit at a time until the
                         * quota charge succeeds; once quota has pushed
                         * back, preallocation is turned off for this
                         * request.
                         */
                        while (ar->len &&
                                dquot_alloc_block(ar->inode,
                                                  EXT4_C2B(sbi, ar->len))) {

                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                                ar->len--;
                        }
                }
                inquota = ar->len;
                if (ar->len == 0) {
                        *errp = -EDQUOT;
                        goto out;
                }
        }

        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
        if (!ac) {
                ar->len = 0;
                *errp = -ENOMEM;
                goto out;
        }

        ext4_mb_initialize_context(ac, ar);

        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
        /* Snapshot taken before allocating; compared again on retry */
        seq = this_cpu_read(discard_pa_seq);
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);

                *errp = ext4_mb_pa_alloc(ac);
                if (*errp)
                        goto errout;
repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
                /*
                 * pa allocated above is added to grp->bb_prealloc_list only
                 * when we were able to allocate some block i.e. when
                 * ac->ac_status == AC_STATUS_FOUND.
                 * And error from above mean ac->ac_status != AC_STATUS_FOUND
                 * So we have to free this pa here itself.
                 */
                if (*errp) {
                        ext4_mb_pa_put_free(ac);
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                }
                /*
                 * Found, and the original request length is at least the
                 * found extent's length: drop the pa allocated above.
                 */
                if (ac->ac_status == AC_STATUS_FOUND &&
                        ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
                        ext4_mb_pa_put_free(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
                if (*errp) {
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                } else {
                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                        ar->len = ac->ac_b_ex.fe_len;
                }
        } else {
                /*
                 * Nothing found: discard preallocations and jump back
                 * into the regular allocator; the retry counter caps
                 * how often this happens.
                 */
                if (++retries < 3 &&
                    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
                        goto repeat;
                /*
                 * If block allocation fails then the pa allocated above
                 * needs to be freed here itself.
                 */
                ext4_mb_pa_put_free(ac);
                *errp = -ENOSPC;
        }

        if (*errp) {
                /* errout is also entered by goto from the failures above */
errout:
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }
        ext4_mb_release_context(ac);
        kmem_cache_free(ext4_ac_cachep, ac);
out:
        /* Refund quota charged for the part that was not allocated */
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
        if (!ar->len) {
                if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
                        /* release all the reserved blocks if non delalloc */
                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                                reserv_clstrs);
        }

        trace_ext4_allocate_blocks(ar, (unsigned long long)block);

        return block;
}
    6302             : 
    6303             : /*
    6304             :  * We can merge two free data extents only if the physical blocks
    6305             :  * are contiguous, AND the extents were freed by the same transaction,
    6306             :  * AND the blocks are associated with the same group.
    6307             :  */
    6308     4210765 : static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
    6309             :                                         struct ext4_free_data *entry,
    6310             :                                         struct ext4_free_data *new_entry,
    6311             :                                         struct rb_root *entry_rb_root)
    6312             : {
    6313     4210765 :         if ((entry->efd_tid != new_entry->efd_tid) ||
    6314     4188293 :             (entry->efd_group != new_entry->efd_group))
    6315             :                 return;
    6316     4188295 :         if (entry->efd_start_cluster + entry->efd_count ==
    6317     4188295 :             new_entry->efd_start_cluster) {
    6318      408840 :                 new_entry->efd_start_cluster = entry->efd_start_cluster;
    6319      408840 :                 new_entry->efd_count += entry->efd_count;
    6320     3779455 :         } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
    6321             :                    entry->efd_start_cluster) {
    6322      551353 :                 new_entry->efd_count += entry->efd_count;
    6323             :         } else
    6324             :                 return;
    6325      960193 :         spin_lock(&sbi->s_md_lock);
    6326      960194 :         list_del(&entry->efd_list);
    6327      960194 :         spin_unlock(&sbi->s_md_lock);
    6328      960194 :         rb_erase(&entry->efd_node, entry_rb_root);
    6329      960194 :         kmem_cache_free(ext4_free_data_cachep, entry);
    6330             : }
    6331             : 
    6332             : static noinline_for_stack void
    6333     2624605 : ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
    6334             :                       struct ext4_free_data *new_entry)
    6335             : {
    6336     2624605 :         ext4_group_t group = e4b->bd_group;
    6337     2624605 :         ext4_grpblk_t cluster;
    6338     2624605 :         ext4_grpblk_t clusters = new_entry->efd_count;
    6339     2624605 :         struct ext4_free_data *entry;
    6340     2624605 :         struct ext4_group_info *db = e4b->bd_info;
    6341     2624605 :         struct super_block *sb = e4b->bd_sb;
    6342     2624605 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    6343     2624605 :         struct rb_node **n = &db->bb_free_root.rb_node, *node;
    6344     2624605 :         struct rb_node *parent = NULL, *new_node;
    6345             : 
    6346     2624605 :         BUG_ON(!ext4_handle_valid(handle));
    6347     2624605 :         BUG_ON(e4b->bd_bitmap_page == NULL);
    6348     2624605 :         BUG_ON(e4b->bd_buddy_page == NULL);
    6349             : 
    6350     2624605 :         new_node = &new_entry->efd_node;
    6351     2624605 :         cluster = new_entry->efd_start_cluster;
    6352             : 
    6353     2624605 :         if (!*n) {
    6354             :                 /* first free block extent. We need to
    6355             :                    protect buddy cache from being freed,
    6356             :                  * otherwise we'll refresh it from
    6357             :                  * on-disk bitmap and lose not-yet-available
    6358             :                  * blocks */
    6359      212330 :                 get_page(e4b->bd_buddy_page);
    6360      212336 :                 get_page(e4b->bd_bitmap_page);
    6361             :         }
    6362    14323932 :         while (*n) {
    6363    11699321 :                 parent = *n;
    6364    11699321 :                 entry = rb_entry(parent, struct ext4_free_data, efd_node);
    6365    11699321 :                 if (cluster < entry->efd_start_cluster)
    6366     5560455 :                         n = &(*n)->rb_left;
    6367     6138866 :                 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
    6368     6138866 :                         n = &(*n)->rb_right;
    6369             :                 else {
    6370           0 :                         ext4_grp_locked_error(sb, group, 0,
    6371             :                                 ext4_group_first_block_no(sb, group) +
    6372             :                                 EXT4_C2B(sbi, cluster),
    6373             :                                 "Block already on to-be-freed list");
    6374           0 :                         kmem_cache_free(ext4_free_data_cachep, new_entry);
    6375           0 :                         return;
    6376             :                 }
    6377             :         }
    6378             : 
    6379     2624611 :         rb_link_node(new_node, parent, n);
    6380     2624611 :         rb_insert_color(new_node, &db->bb_free_root);
    6381             : 
    6382             :         /* Now see whether the extent can be merged with its left and right neighbours */
    6383     2624606 :         node = rb_prev(new_node);
    6384     2624596 :         if (node) {
    6385     2071302 :                 entry = rb_entry(node, struct ext4_free_data, efd_node);
    6386     2071302 :                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
    6387             :                                             &(db->bb_free_root));
    6388             :         }
    6389             : 
    6390     2624591 :         node = rb_next(new_node);
    6391     2624592 :         if (node) {
    6392     2139484 :                 entry = rb_entry(node, struct ext4_free_data, efd_node);
    6393     2139484 :                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
    6394             :                                             &(db->bb_free_root));
    6395             :         }
    6396             : 
    6397     2624587 :         spin_lock(&sbi->s_md_lock);
    6398     2624621 :         list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
    6399     2624621 :         sbi->s_mb_free_pending += clusters;
    6400     2624621 :         spin_unlock(&sbi->s_md_lock);
    6401             : }
    6402             : 
    6403           0 : static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
    6404             :                                         unsigned long count)
    6405             : {
    6406           0 :         struct buffer_head *bitmap_bh;
    6407           0 :         struct super_block *sb = inode->i_sb;
    6408           0 :         struct ext4_group_desc *gdp;
    6409           0 :         struct buffer_head *gdp_bh;
    6410           0 :         ext4_group_t group;
    6411           0 :         ext4_grpblk_t blkoff;
    6412           0 :         int already_freed = 0, err, i;
    6413             : 
    6414           0 :         ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
    6415           0 :         bitmap_bh = ext4_read_block_bitmap(sb, group);
    6416           0 :         if (IS_ERR(bitmap_bh)) {
    6417           0 :                 pr_warn("Failed to read block bitmap\n");
    6418           0 :                 return;
    6419             :         }
    6420           0 :         gdp = ext4_get_group_desc(sb, group, &gdp_bh);
    6421           0 :         if (!gdp)
    6422           0 :                 goto err_out;
    6423             : 
    6424           0 :         for (i = 0; i < count; i++) {
    6425           0 :                 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
    6426           0 :                         already_freed++;
    6427             :         }
    6428           0 :         mb_clear_bits(bitmap_bh->b_data, blkoff, count);
    6429           0 :         err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
    6430           0 :         if (err)
    6431           0 :                 goto err_out;
    6432           0 :         ext4_free_group_clusters_set(
    6433           0 :                 sb, gdp, ext4_free_group_clusters(sb, gdp) +
    6434             :                 count - already_freed);
    6435           0 :         ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
    6436           0 :         ext4_group_desc_csum_set(sb, group, gdp);
    6437           0 :         ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
    6438           0 :         sync_dirty_buffer(bitmap_bh);
    6439           0 :         sync_dirty_buffer(gdp_bh);
    6440             : 
    6441           0 : err_out:
    6442           0 :         brelse(bitmap_bh);
    6443             : }
    6444             : 
    6445             : /**
    6446             :  * ext4_mb_clear_bb() -- helper function for freeing blocks.
    6447             :  *                      Used by ext4_free_blocks()
    6448             :  * @handle:             handle for this transaction
    6449             :  * @inode:              inode
    6450             :  * @block:              starting physical block to be freed
    6451             :  * @count:              number of blocks to be freed
    6452             :  * @flags:              flags used by ext4_free_blocks
    6453             :  */
    6454     2622166 : static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
    6455             :                                ext4_fsblk_t block, unsigned long count,
    6456             :                                int flags)
    6457             : {
    6458     2622166 :         struct buffer_head *bitmap_bh = NULL;
    6459     2622166 :         struct super_block *sb = inode->i_sb;
    6460     2622166 :         struct ext4_group_desc *gdp;
    6461     2622166 :         struct ext4_group_info *grp;
    6462     2622166 :         unsigned int overflow;
    6463     2622166 :         ext4_grpblk_t bit;
    6464     2622166 :         struct buffer_head *gd_bh;
    6465     2622166 :         ext4_group_t block_group;
    6466     2622166 :         struct ext4_sb_info *sbi;
    6467     2622166 :         struct ext4_buddy e4b;
    6468     2622166 :         unsigned int count_clusters;
    6469     2622166 :         int err = 0;
    6470     2622166 :         int ret;
    6471             : 
    6472     2622166 :         sbi = EXT4_SB(sb);
    6473             : 
    6474     2622168 :         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
    6475           2 :             !ext4_inode_block_valid(inode, block, count)) {
    6476           0 :                 ext4_error(sb, "Freeing blocks in system zone - "
    6477             :                            "Block = %llu, count = %lu", block, count);
    6478             :                 /* err = 0. ext4_std_error should be a no op */
    6479           0 :                 goto error_return;
    6480             :         }
    6481     2622166 :         flags |= EXT4_FREE_BLOCKS_VALIDATED;
    6482             : 
    6483     2624607 : do_more:
    6484     2624607 :         overflow = 0;
    6485     2624607 :         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
    6486             : 
    6487     2624596 :         grp = ext4_get_group_info(sb, block_group);
    6488     2624592 :         if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
    6489             :                 return;
    6490             : 
    6491             :         /*
    6492             :          * Check to see if we are freeing blocks across a group
    6493             :          * boundary.
    6494             :          */
    6495     2624592 :         if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
    6496        2441 :                 overflow = EXT4_C2B(sbi, bit) + count -
    6497        2441 :                         EXT4_BLOCKS_PER_GROUP(sb);
    6498        2441 :                 count -= overflow;
    6499             :                 /* The range changed so it's no longer validated */
    6500        2441 :                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
    6501             :         }
    6502     2624592 :         count_clusters = EXT4_NUM_B2C(sbi, count);
    6503     2624592 :         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
    6504     2624586 :         if (IS_ERR(bitmap_bh)) {
    6505           0 :                 err = PTR_ERR(bitmap_bh);
    6506           0 :                 bitmap_bh = NULL;
    6507           0 :                 goto error_return;
    6508             :         }
    6509     2624586 :         gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
    6510     2624566 :         if (!gdp) {
    6511           0 :                 err = -EIO;
    6512           0 :                 goto error_return;
    6513             :         }
    6514             : 
    6515     2629448 :         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
    6516        4882 :             !ext4_inode_block_valid(inode, block, count)) {
    6517           0 :                 ext4_error(sb, "Freeing blocks in system zone - "
    6518             :                            "Block = %llu, count = %lu", block, count);
    6519             :                 /* err = 0. ext4_std_error should be a no op */
    6520           0 :                 goto error_return;
    6521             :         }
    6522             : 
    6523     2624566 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    6524     2624566 :         err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
    6525             :                                             EXT4_JTR_NONE);
    6526     2624610 :         if (err)
    6527           0 :                 goto error_return;
    6528             : 
    6529             :         /*
    6530             :          * We are about to modify some metadata.  Call the journal APIs
    6531             :          * to unshare ->b_data if a currently-committing transaction is
    6532             :          * using it
    6533             :          */
    6534     2624610 :         BUFFER_TRACE(gd_bh, "get_write_access");
    6535     2624610 :         err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
    6536     2624611 :         if (err)
    6537           0 :                 goto error_return;
    6538             : #ifdef AGGRESSIVE_CHECK
    6539             :         {
    6540             :                 int i;
    6541             :                 for (i = 0; i < count_clusters; i++)
    6542             :                         BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
    6543             :         }
    6544             : #endif
    6545     2624611 :         trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
    6546             : 
    6547             :         /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
    6548     2624606 :         err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
    6549             :                                      GFP_NOFS|__GFP_NOFAIL);
    6550     2624594 :         if (err)
    6551           0 :                 goto error_return;
    6552             : 
    6553             :         /*
    6554             :          * We need to make sure we don't reuse the freed block until after the
    6555             :          * transaction is committed. We make an exception if the inode is to be
    6556             :          * written in writeback mode since writeback mode has weak data
    6557             :          * consistency guarantees.
    6558             :          */
    6559     2624594 :         if (ext4_handle_valid(handle) &&
    6560     4964928 :             ((flags & EXT4_FREE_BLOCKS_METADATA) ||
    6561     2624619 :              !ext4_should_writeback_data(inode))) {
    6562     2624593 :                 struct ext4_free_data *new_entry;
    6563             :                 /*
    6564             :                  * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
    6565             :                  * to fail.
    6566             :                  */
    6567     2624593 :                 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
    6568             :                                 GFP_NOFS|__GFP_NOFAIL);
    6569     2624590 :                 new_entry->efd_start_cluster = bit;
    6570     2624590 :                 new_entry->efd_group = block_group;
    6571     2624590 :                 new_entry->efd_count = count_clusters;
    6572     2624590 :                 new_entry->efd_tid = handle->h_transaction->t_tid;
    6573             : 
    6574     2624590 :                 ext4_lock_group(sb, block_group);
    6575     2624590 :                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
    6576     2624602 :                 ext4_mb_free_metadata(handle, &e4b, new_entry);
    6577             :         } else {
    6578             :                 /* need to update group_info->bb_free and bitmap
    6579             :                  * with group lock held. generate_buddy looks at
    6580             :                  * them with the group lock held
    6581             :                  */
    6582           1 :                 if (test_opt(sb, DISCARD)) {
    6583           0 :                         err = ext4_issue_discard(sb, block_group, bit,
    6584             :                                                  count_clusters, NULL);
    6585           0 :                         if (err && err != -EOPNOTSUPP)
    6586           0 :                                 ext4_msg(sb, KERN_WARNING, "discard request in"
    6587             :                                          " group:%u block:%d count:%lu failed"
    6588             :                                          " with %d", block_group, bit, count,
    6589             :                                          err);
    6590             :                 } else
    6591           1 :                         EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
    6592             : 
    6593           1 :                 ext4_lock_group(sb, block_group);
    6594           1 :                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
    6595           1 :                 mb_free_blocks(inode, &e4b, bit, count_clusters);
    6596             :         }
    6597             : 
    6598     2624620 :         ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
    6599     2624619 :         ext4_free_group_clusters_set(sb, gdp, ret);
    6600     2624616 :         ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
    6601     2624584 :         ext4_group_desc_csum_set(sb, block_group, gdp);
    6602     2624594 :         ext4_unlock_group(sb, block_group);
    6603             : 
    6604     2624553 :         if (sbi->s_log_groups_per_flex) {
    6605     2624400 :                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
    6606     5248834 :                 atomic64_add(count_clusters,
    6607     2624403 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    6608             :                                                   flex_group)->free_clusters);
    6609             :         }
    6610             : 
    6611             :         /*
    6612             :          * on a bigalloc file system, defer the s_freeclusters_counter
    6613             :          * update to the caller (ext4_remove_space and friends) so they
    6614             :          * can determine if a cluster freed here should be rereserved
    6615             :          */
    6616     2624602 :         if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
    6617     2624602 :                 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
    6618     2624600 :                         dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
    6619     2624613 :                 percpu_counter_add(&sbi->s_freeclusters_counter,
    6620             :                                    count_clusters);
    6621             :         }
    6622             : 
    6623     2624609 :         ext4_mb_unload_buddy(&e4b);
    6624             : 
    6625             :         /* We dirtied the bitmap block */
    6626     2624607 :         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
    6627     2624607 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    6628             : 
    6629             :         /* And the group descriptor block */
    6630     2624612 :         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
    6631     2624612 :         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
    6632     2624593 :         if (!err)
    6633     2624607 :                 err = ret;
    6634             : 
    6635     2624593 :         if (overflow && !err) {
    6636        2441 :                 block += count;
    6637        2441 :                 count = overflow;
    6638        2441 :                 put_bh(bitmap_bh);
    6639             :                 /* The range changed so it's no longer validated */
    6640        2441 :                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
    6641        2441 :                 goto do_more;
    6642             :         }
    6643     2622152 : error_return:
    6644     2622152 :         brelse(bitmap_bh);
    6645     2622172 :         ext4_std_error(sb, err);
    6646             :         return;
    6647             : }
    6648             : 
    6649             : /**
    6650             :  * ext4_free_blocks() -- Free given blocks and update quota
    6651             :  * @handle:             handle for this transaction
    6652             :  * @inode:              inode
    6653             :  * @bh:                 optional buffer of the block to be freed
    6654             :  * @block:              starting physical block to be freed
    6655             :  * @count:              number of blocks to be freed
    6656             :  * @flags:              flags used by ext4_free_blocks
    6657             :  */
    6658     2622126 : void ext4_free_blocks(handle_t *handle, struct inode *inode,
    6659             :                       struct buffer_head *bh, ext4_fsblk_t block,
    6660             :                       unsigned long count, int flags)
    6661             : {
    6662     2622126 :         struct super_block *sb = inode->i_sb;
    6663     2622126 :         unsigned int overflow;
    6664     2622126 :         struct ext4_sb_info *sbi;
    6665             : 
    6666     2622126 :         sbi = EXT4_SB(sb);
    6667             : 
    6668     2622126 :         if (bh) {
    6669       26284 :                 if (block)
    6670           0 :                         BUG_ON(block != bh->b_blocknr);
    6671             :                 else
    6672       26284 :                         block = bh->b_blocknr;
    6673             :         }
    6674             : 
    6675     2622126 :         if (sbi->s_mount_state & EXT4_FC_REPLAY) {
    6676           0 :                 ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
    6677           0 :                 return;
    6678             :         }
    6679             : 
    6680     2622126 :         might_sleep();
    6681             : 
    6682     5244192 :         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
    6683     2622045 :             !ext4_inode_block_valid(inode, block, count)) {
    6684           0 :                 ext4_error(sb, "Freeing blocks not in datazone - "
    6685             :                            "block = %llu, count = %lu", block, count);
    6686           0 :                 return;
    6687             :         }
    6688     2622147 :         flags |= EXT4_FREE_BLOCKS_VALIDATED;
    6689             : 
    6690     2622147 :         ext4_debug("freeing block %llu\n", block);
    6691     2622147 :         trace_ext4_free_blocks(inode, block, count, flags);
    6692             : 
    6693     2622120 :         if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
    6694       26282 :                 BUG_ON(count > 1);
    6695             : 
    6696       26282 :                 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
    6697             :                             inode, bh, block);
    6698             :         }
    6699             : 
    6700             :         /*
    6701             :          * If the extent to be freed does not begin on a cluster
    6702             :          * boundary, we need to deal with partial clusters at the
    6703             :          * beginning and end of the extent.  Normally we will free
    6704             :          * blocks at the beginning or the end unless we are explicitly
    6705             :          * requested to avoid doing so.
    6706             :          */
    6707     2622122 :         overflow = EXT4_PBLK_COFF(sbi, block);
    6708     2622122 :         if (overflow) {
    6709           0 :                 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
    6710           0 :                         overflow = sbi->s_cluster_ratio - overflow;
    6711           0 :                         block += overflow;
    6712           0 :                         if (count > overflow)
    6713           0 :                                 count -= overflow;
    6714             :                         else
    6715             :                                 return;
    6716             :                 } else {
    6717           0 :                         block -= overflow;
    6718           0 :                         count += overflow;
    6719             :                 }
    6720             :                 /* The range changed so it's no longer validated */
    6721           0 :                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
    6722             :         }
    6723     2622122 :         overflow = EXT4_LBLK_COFF(sbi, count);
    6724     2622122 :         if (overflow) {
    6725           2 :                 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
    6726           0 :                         if (count > overflow)
    6727           0 :                                 count -= overflow;
    6728             :                         else
    6729             :                                 return;
    6730             :                 } else
    6731           2 :                         count += sbi->s_cluster_ratio - overflow;
    6732             :                 /* The range changed so it's no longer validated */
    6733           2 :                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
    6734             :         }
    6735             : 
    6736     2622122 :         if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
    6737      257955 :                 int i;
    6738      257955 :                 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
    6739             : 
    6740      520879 :                 for (i = 0; i < count; i++) {
    6741      262884 :                         cond_resched();
    6742      262865 :                         if (is_metadata)
    6743      262863 :                                 bh = sb_find_get_block(inode->i_sb, block + i);
    6744      262912 :                         ext4_forget(handle, is_metadata, inode, bh, block + i);
    6745             :                 }
    6746             :         }
    6747             : 
    6748     2622162 :         ext4_mb_clear_bb(handle, inode, block, count, flags);
    6749     2622162 :         return;
    6750             : }
    6751             : 
    6752             : /**
    6753             :  * ext4_group_add_blocks() -- Add given blocks to an existing group
    6754             :  * @handle:                     handle to this transaction
    6755             :  * @sb:                         super block
    6756             :  * @block:                      start physical block to add to the block group
    6757             :  * @count:                      number of blocks to free
    6758             :  *
    6759             :  * This marks the blocks as free in the bitmap and buddy.
    6760             :  */
    6761          29 : int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
    6762             :                          ext4_fsblk_t block, unsigned long count)
    6763             : {
    6764          29 :         struct buffer_head *bitmap_bh = NULL;
    6765          29 :         struct buffer_head *gd_bh;
    6766          29 :         ext4_group_t block_group;
    6767          29 :         ext4_grpblk_t bit;
    6768          29 :         unsigned int i;
    6769          29 :         struct ext4_group_desc *desc;
    6770          29 :         struct ext4_sb_info *sbi = EXT4_SB(sb);
    6771          29 :         struct ext4_buddy e4b;
    6772          29 :         int err = 0, ret, free_clusters_count;
    6773          29 :         ext4_grpblk_t clusters_freed;
    6774          29 :         ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
    6775          29 :         ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
    6776          29 :         unsigned long cluster_count = last_cluster - first_cluster + 1;
    6777             : 
    6778          29 :         ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
    6779             : 
    6780          29 :         if (count == 0)
    6781             :                 return 0;
    6782             : 
    6783          29 :         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
    6784             :         /*
    6785             :          * Check to see if we are freeing blocks across a group
    6786             :          * boundary.
    6787             :          */
    6788          29 :         if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
    6789           0 :                 ext4_warning(sb, "too many blocks added to group %u",
    6790             :                              block_group);
    6791           0 :                 err = -EINVAL;
    6792           0 :                 goto error_return;
    6793             :         }
    6794             : 
    6795          29 :         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
    6796          29 :         if (IS_ERR(bitmap_bh)) {
    6797           0 :                 err = PTR_ERR(bitmap_bh);
    6798           0 :                 bitmap_bh = NULL;
    6799           0 :                 goto error_return;
    6800             :         }
    6801             : 
    6802          29 :         desc = ext4_get_group_desc(sb, block_group, &gd_bh);
    6803          29 :         if (!desc) {
    6804           0 :                 err = -EIO;
    6805           0 :                 goto error_return;
    6806             :         }
    6807             : 
    6808          29 :         if (!ext4_sb_block_valid(sb, NULL, block, count)) {
    6809           0 :                 ext4_error(sb, "Adding blocks in system zones - "
    6810             :                            "Block = %llu, count = %lu",
    6811             :                            block, count);
    6812           0 :                 err = -EINVAL;
    6813           0 :                 goto error_return;
    6814             :         }
    6815             : 
    6816          29 :         BUFFER_TRACE(bitmap_bh, "getting write access");
    6817          29 :         err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
    6818             :                                             EXT4_JTR_NONE);
    6819          29 :         if (err)
    6820           0 :                 goto error_return;
    6821             : 
    6822             :         /*
    6823             :          * We are about to modify some metadata.  Call the journal APIs
    6824             :          * to unshare ->b_data if a currently-committing transaction is
    6825             :          * using it
    6826             :          */
    6827          29 :         BUFFER_TRACE(gd_bh, "get_write_access");
    6828          29 :         err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
    6829          29 :         if (err)
    6830           0 :                 goto error_return;
    6831             : 
    6832      225307 :         for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
    6833      225278 :                 BUFFER_TRACE(bitmap_bh, "clear bit");
    6834      225278 :                 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
    6835           0 :                         ext4_error(sb, "bit already cleared for block %llu",
    6836             :                                    (ext4_fsblk_t)(block + i));
    6837           0 :                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
    6838             :                 } else {
    6839      225278 :                         clusters_freed++;
    6840             :                 }
    6841             :         }
    6842             : 
    6843          29 :         err = ext4_mb_load_buddy(sb, block_group, &e4b);
    6844          29 :         if (err)
    6845           0 :                 goto error_return;
    6846             : 
    6847             :         /*
    6848             :          * need to update group_info->bb_free and bitmap
    6849             :          * with group lock held. generate_buddy look at
    6850             :          * them with group lock_held
    6851             :          */
    6852          29 :         ext4_lock_group(sb, block_group);
    6853          29 :         mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
    6854          29 :         mb_free_blocks(NULL, &e4b, bit, cluster_count);
    6855          58 :         free_clusters_count = clusters_freed +
    6856          29 :                 ext4_free_group_clusters(sb, desc);
    6857          29 :         ext4_free_group_clusters_set(sb, desc, free_clusters_count);
    6858          29 :         ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
    6859          29 :         ext4_group_desc_csum_set(sb, block_group, desc);
    6860          29 :         ext4_unlock_group(sb, block_group);
    6861          29 :         percpu_counter_add(&sbi->s_freeclusters_counter,
    6862             :                            clusters_freed);
    6863             : 
    6864          29 :         if (sbi->s_log_groups_per_flex) {
    6865          29 :                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
    6866          58 :                 atomic64_add(clusters_freed,
    6867          29 :                              &sbi_array_rcu_deref(sbi, s_flex_groups,
    6868             :                                                   flex_group)->free_clusters);
    6869             :         }
    6870             : 
    6871          29 :         ext4_mb_unload_buddy(&e4b);
    6872             : 
    6873             :         /* We dirtied the bitmap block */
    6874          29 :         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
    6875          29 :         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    6876             : 
    6877             :         /* And the group descriptor block */
    6878          29 :         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
    6879          29 :         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
    6880          29 :         if (!err)
    6881          29 :                 err = ret;
    6882             : 
    6883           0 : error_return:
    6884          29 :         brelse(bitmap_bh);
    6885          29 :         ext4_std_error(sb, err);
    6886             :         return err;
    6887             : }
    6888             : 
    6889             : /**
    6890             :  * ext4_trim_extent -- function to TRIM one single free extent in the group
    6891             :  * @sb:         super block for the file system
    6892             :  * @start:      starting block of the free extent in the alloc. group
    6893             :  * @count:      number of blocks to TRIM
    6894             :  * @e4b:        ext4 buddy for the group
    6895             :  *
    6896             :  * Trim "count" blocks starting at "start" in the "group". To ensure that no
    6897             :  * one allocates those blocks meanwhile, mark them as used in the buddy bitmap.
    6898             :  * Must be called with the group lock held. Returns ext4_issue_discard()'s result.
    6899             :  */
    6900       17545 : static int ext4_trim_extent(struct super_block *sb,
    6901             :                 int start, int count, struct ext4_buddy *e4b)
    6902             : __releases(bitlock)
    6903             : __acquires(bitlock)
    6904             : {
    6905       17545 :         struct ext4_free_extent ex;
    6906       17545 :         ext4_group_t group = e4b->bd_group;
    6907       17545 :         int ret = 0;
    6908             : 
    6909       17545 :         trace_ext4_trim_extent(sb, group, start, count);
    6910             :         /* The caller must already hold the group lock. */
    6911       17545 :         assert_spin_locked(ext4_group_lock_ptr(sb, group));
    6912             : 
    6913       17545 :         ex.fe_start = start;
    6914       17545 :         ex.fe_group = group;
    6915       17545 :         ex.fe_len = count;
    6916             : 
    6917             :         /*
    6918             :          * Mark blocks used, so no one can reuse them while being trimmed;
    6919             :          * the group lock is dropped only around the discard itself.
    6920             :          */
    6921       17545 :         mb_mark_used(e4b, &ex);
    6922       17545 :         ext4_unlock_group(sb, group);
    6923       17545 :         ret = ext4_issue_discard(sb, group, start, count, NULL);
    6924       17545 :         ext4_lock_group(sb, group);
    6925       17545 :         mb_free_blocks(NULL, e4b, start, ex.fe_len); /* done whatever the discard returned */
    6926       17545 :         return ret;
    6927             : }
    6928             : /* Trim free extents of at least @minblocks within bits [start, max] of the group bitmap. */
    6929       14348 : static int ext4_try_to_trim_range(struct super_block *sb,
    6930             :                 struct ext4_buddy *e4b, ext4_grpblk_t start,
    6931             :                 ext4_grpblk_t max, ext4_grpblk_t minblocks)
    6932             : __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
    6933             : __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
    6934             : {
    6935       14348 :         ext4_grpblk_t next, count, free_count;
    6936       14348 :         void *bitmap;
    6937             : 
    6938       14348 :         bitmap = e4b->bd_bitmap;
    6939       14348 :         start = (e4b->bd_info->bb_first_free > start) ? /* skip ahead to bb_first_free if it lies beyond start */
    6940             :                 e4b->bd_info->bb_first_free : start;
    6941       14348 :         count = 0;
    6942       14348 :         free_count = 0;
    6943             :         /* Each pass handles one run of zero bits [start, next) in the bitmap. */
    6944       18370 :         while (start <= max) {
    6945       18367 :                 start = mb_find_next_zero_bit(bitmap, max + 1, start);
    6946       18367 :                 if (start > max)
    6947             :                         break;
    6948       18367 :                 next = mb_find_next_bit(bitmap, max + 1, start);
    6949             :                 /* Runs shorter than minblocks are not trimmed but still count as free. */
    6950       18367 :                 if ((next - start) >= minblocks) {
    6951       17545 :                         int ret = ext4_trim_extent(sb, start, next - start, e4b);
    6952             :                         /* -EOPNOTSUPP is tolerated; any other error ends the scan with the count so far. */
    6953       17545 :                         if (ret && ret != -EOPNOTSUPP)
    6954             :                                 break;
    6955       17545 :                         count += next - start;
    6956             :                 }
    6957       18367 :                 free_count += next - start;
    6958       18367 :                 start = next + 1; /* resume the scan past the end of this run */
    6959             : 
    6960       18367 :                 if (fatal_signal_pending(current)) {
    6961             :                         count = -ERESTARTSYS;
    6962             :                         break;
    6963             :                 }
    6964             :                 /* Drop the group lock while giving up the CPU. */
    6965       18366 :                 if (need_resched()) {
    6966          11 :                         ext4_unlock_group(sb, e4b->bd_group);
    6967          11 :                         cond_resched();
    6968          11 :                         ext4_lock_group(sb, e4b->bd_group);
    6969             :                 }
    6970             :                 /* Stop early once the free space not yet visited is below minblocks. */
    6971       18366 :                 if ((e4b->bd_info->bb_free - free_count) < minblocks)
    6972             :                         break;
    6973             :         }
    6974             : 
    6975       14348 :         return count; /* total length trimmed, or -ERESTARTSYS on a fatal signal */
    6976             : }
    6977             : 
    6978             : /**
    6979             :  * ext4_trim_all_free -- function to trim all free space in alloc. group
    6980             :  * @sb:                 super block for file system
    6981             :  * @group:              group to be trimmed
    6982             :  * @start:              first group block to examine
    6983             :  * @max:                last group block to examine
    6984             :  * @minblocks:          minimum extent block count
    6985             :  * @set_trimmed:        set the trimmed flag unless the trim pass returned an error
    6986             :  *
    6987             :  * Walks the group's block bitmap searching for free extents; each one found is
    6988             :  * marked used in the buddy bitmap, TRIMmed, then freed in the buddy bitmap again.
    6989             :  * A group already flagged as trimmed is skipped (returning 0) unless @minblocks is
    6990             :  * below s_last_trim_minblks. Returns the trimmed count or an error value.
    6991             :  */
    6992             : static ext4_grpblk_t
    6993     4679755 : ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
    6994             :                    ext4_grpblk_t start, ext4_grpblk_t max,
    6995             :                    ext4_grpblk_t minblocks, bool set_trimmed)
    6996             : {
    6997     4679755 :         struct ext4_buddy e4b;
    6998     4679755 :         int ret;
    6999             : 
    7000     4679755 :         trace_ext4_trim_all_free(sb, group, start, max);
    7001             : 
    7002     4678655 :         ret = ext4_mb_load_buddy(sb, group, &e4b);
    7003     4691937 :         if (ret) {
    7004           0 :                 ext4_warning(sb, "Error %d loading buddy information for %u",
    7005             :                              ret, group);
    7006           0 :                 return ret;
    7007             :         }
    7008             : 
    7009     4691937 :         ext4_lock_group(sb, group);
    7010             :         /* Trim only if the group is not flagged trimmed, or this pass accepts smaller extents. */
    7011     4696258 :         if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
    7012     4693504 :             minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
    7013       14348 :                 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
    7014       14348 :                 if (ret >= 0 && set_trimmed) /* a zero count still sets the flag */
    7015       14344 :                         EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
    7016             :         } else {
    7017             :                 ret = 0;
    7018             :         }
    7019             : 
    7020     4696258 :         ext4_unlock_group(sb, group);
    7021     4696957 :         ext4_mb_unload_buddy(&e4b);
    7022             : 
    7023     4696957 :         ext4_debug("trimmed %d blocks in the group %d\n",
    7024             :                 ret, group);
    7025             : 
    7026     4696957 :         return ret;
    7027             : }
    7028             : 
    7029             : /**
    7030             :  * ext4_trim_fs() -- trim ioctl handle function
    7031             :  * @sb:                 superblock for filesystem
    7032             :  * @range:              fstrim_range structure; its len field is overwritten
    7033             :  *
    7034             :  * start:       First Byte to trim
    7035             :  * len:         number of Bytes to trim from start; replaced by the Bytes trimmed
    7036             :  * minlen:      minimum extent length in Bytes
    7037             :  * ext4_trim_fs goes through all allocation groups containing Bytes from
    7038             :  * start to start+len. For each such group the ext4_trim_all_free function
    7039             :  * is invoked to trim all free space. -EINVAL returns leave range->len untouched.
    7040             :  */
    7041       18121 : int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
    7042             : {
    7043       18121 :         unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
    7044       18121 :         struct ext4_group_info *grp;
    7045       18121 :         ext4_group_t group, first_group, last_group;
    7046       18121 :         ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
    7047       18121 :         uint64_t start, end, minlen, trimmed = 0;
    7048       18121 :         ext4_fsblk_t first_data_blk =
    7049       18121 :                         le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
    7050       18121 :         ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
    7051       18121 :         bool whole_group, eof = false;
    7052       18121 :         int ret = 0;
    7053             :         /* Shift the byte values down by the block-size bits; end is inclusive. */
    7054       18121 :         start = range->start >> sb->s_blocksize_bits;
    7055       18121 :         end = start + (range->len >> sb->s_blocksize_bits) - 1;
    7056       18121 :         minlen = EXT4_NUM_B2C(EXT4_SB(sb),
    7057             :                               range->minlen >> sb->s_blocksize_bits);
    7058             :         /* Reject requests that cannot match anything before touching range->len. */
    7059       18121 :         if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
    7060       18116 :             start >= max_blks ||
    7061       18116 :             range->len < sb->s_blocksize)
    7062             :                 return -EINVAL;
    7063             :         /* No point to try to trim less than discard granularity */
    7064       18114 :         if (range->minlen < discard_granularity) {
    7065       18040 :                 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
    7066             :                                 discard_granularity >> sb->s_blocksize_bits);
    7067       18040 :                 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
    7068           0 :                         goto out; /* returns 0 and reports zero Bytes trimmed */
    7069             :         }
    7070       18114 :         if (end >= max_blks - 1) { /* clamp to the last block and note the range reaches it */
    7071       18101 :                 end = max_blks - 1;
    7072       18101 :                 eof = true;
    7073             :         }
    7074       18114 :         if (end <= first_data_blk)
    7075           0 :                 goto out;
    7076       18114 :         if (start < first_data_blk)
    7077             :                 start = first_data_blk;
    7078             : 
    7079             :         /* Determine first and last group to examine based on start and end */
    7080       18114 :         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
    7081             :                                      &first_group, &first_cluster);
    7082       18142 :         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
    7083             :                                      &last_group, &last_cluster);
    7084             : 
    7085             :         /* end now represents the last cluster to discard in this group */
    7086       18149 :         end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
    7087       18149 :         whole_group = true;
    7088             : 
    7089     4857958 :         for (group = first_group; group <= last_group; group++) {
    7090     4839789 :                 grp = ext4_get_group_info(sb, group);
    7091     4824593 :                 if (!grp)
    7092           0 :                         continue;
    7093             :                 /* We only do this if the grp has never been initialized */
    7094     4824593 :                 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    7095        2379 :                         ret = ext4_mb_init_group(sb, group, GFP_NOFS);
    7096        2379 :                         if (ret)
    7097             :                                 break;
    7098             :                 }
    7099             : 
    7100             :                 /*
    7101             :                  * Every group but the last ends at EXT4_CLUSTERS_PER_GROUP(sb)-1,
    7102             :                  * so only the last group needs last_cluster (computed earlier by
    7103             :                  * ext4_get_group_no_and_offset()). It counts as a whole group only
    7104             :                  * if the range hit the end of the fs or ends on the last cluster.
    7105             :                  */
    7106     4824593 :                 if (group == last_group) {
    7107       18123 :                         end = last_cluster;
    7108       18123 :                         whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
    7109             :                 }
    7110     4824593 :                 if (grp->bb_free >= minlen) { /* groups with less free space than minlen are skipped */
    7111     4679780 :                         cnt = ext4_trim_all_free(sb, group, first_cluster,
    7112             :                                                  end, minlen, whole_group);
    7113     4694997 :                         if (cnt < 0) {
    7114             :                                 ret = cnt;
    7115             :                                 break;
    7116             :                         }
    7117     4694996 :                         trimmed += cnt;
    7118             :                 }
    7119             : 
    7120             :                 /*
    7121             :                  * For every group except the first one, we are sure
    7122             :                  * that the first cluster to discard will be cluster #0.
    7123             :                  */
    7124     4839809 :                 first_cluster = 0;
    7125             :         }
    7126             :         /* Record minlen only when the loop finished without an error. */
    7127       18170 :         if (!ret)
    7128       18169 :                 EXT4_SB(sb)->s_last_trim_minblks = minlen;
    7129             : 
    7130           1 : out:
    7131       18170 :         range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; /* Bytes trimmed, even on error */
    7132       18170 :         return ret;
    7133             : }
    7134             : 
    7135             : /* Iterate the free extents in bits [start, end] of the group, handing each to formatter. */
    7136             : int
    7137        2475 : ext4_mballoc_query_range(
    7138             :         struct super_block              *sb,
    7139             :         ext4_group_t                    group,
    7140             :         ext4_grpblk_t                   start, /* first bit to examine */
    7141             :         ext4_grpblk_t                   end, /* last bit to examine; clamped to the group size */
    7142             :         ext4_mballoc_query_range_fn     formatter, /* called once per free extent, group lock dropped */
    7143             :         void                            *priv) /* passed through to formatter unchanged */
    7144             : {
    7145        2475 :         void                            *bitmap;
    7146        2475 :         ext4_grpblk_t                   next;
    7147        2475 :         struct ext4_buddy               e4b;
    7148        2475 :         int                             error;
    7149             : 
    7150        2475 :         error = ext4_mb_load_buddy(sb, group, &e4b);
    7151        2475 :         if (error)
    7152             :                 return error;
    7153        2475 :         bitmap = e4b.bd_bitmap;
    7154             : 
    7155        2475 :         ext4_lock_group(sb, group);
    7156             :         /* Skip ahead to bb_first_free when it lies beyond start. */
    7157        2475 :         start = (e4b.bd_info->bb_first_free > start) ?
    7158             :                 e4b.bd_info->bb_first_free : start;
    7159        2475 :         if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
    7160        2469 :                 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
    7161             :         /* Each pass reports one run of zero bits [start, next) in the bitmap. */
    7162        4897 :         while (start <= end) {
    7163        3231 :                 start = mb_find_next_zero_bit(bitmap, end + 1, start);
    7164        3231 :                 if (start > end)
    7165             :                         break;
    7166        3229 :                 next = mb_find_next_bit(bitmap, end + 1, start);
    7167             :                 /* The formatter runs unlocked; on error the lock is not retaken before unloading. */
    7168        3229 :                 ext4_unlock_group(sb, group);
    7169        3229 :                 error = formatter(sb, group, start, next - start, priv);
    7170        3229 :                 if (error)
    7171         807 :                         goto out_unload;
    7172        2422 :                 ext4_lock_group(sb, group);
    7173             : 
    7174        2422 :                 start = next + 1;
    7175             :         }
    7176             : 
    7177        1668 :         ext4_unlock_group(sb, group);
    7178        2475 : out_unload:
    7179        2475 :         ext4_mb_unload_buddy(&e4b);
    7180             : 
    7181        2475 :         return error; /* 0, or the first non-zero value returned by formatter */
    7182             : }

Generated by: LCOV version 1.14