LCOV - code coverage report
Current view: top level - fs/btrfs - defrag.c (source / functions)
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023
Date: 2023-07-31 20:08:34
Coverage: Lines: 487 / 637 (76.5 %)    Functions: 19 / 20 (95.0 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2007 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/sched.h>
       7             : #include "ctree.h"
       8             : #include "disk-io.h"
       9             : #include "print-tree.h"
      10             : #include "transaction.h"
      11             : #include "locking.h"
      12             : #include "accessors.h"
      13             : #include "messages.h"
      14             : #include "delalloc-space.h"
      15             : #include "subpage.h"
      16             : #include "defrag.h"
      17             : #include "file-item.h"
      18             : #include "super.h"
      19             : 
      20             : static struct kmem_cache *btrfs_inode_defrag_cachep;
      21             : 
      22             : /*
      23             :  * When auto defrag is enabled we queue up these defrag structs to remember
      24             :  * which inodes need defragging passes.
      25             :  */
      26             : struct inode_defrag {
      27             :         struct rb_node rb_node;
      28             :         /* Inode number */
      29             :         u64 ino;
      30             :         /*
      31             :          * Transid when the defrag was added; we search for extents newer than
      32             :          * this.
      33             :          */
      34             :         u64 transid;
      35             : 
      36             :         /* Root objectid */
      37             :         u64 root;
      38             : 
      39             :         /*
      40             :          * The extent size threshold for autodefrag.
      41             :          *
      42             :          * This value is different for compressed/non-compressed extents, thus
      43             :          * it needs to be passed in from the higher layer
      44             :          * (i.e. inode_should_defrag()).
      45             :          */
      46             :         u32 extent_thresh;
      47             : };
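
For context, these records are only queued when the filesystem is mounted with the autodefrag option; without it, __need_auto_defrag() below bails out immediately. A minimal userspace sketch of enabling the option via mount(2) follows; the device and mount point are placeholders:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* Enable btrfs autodefrag at mount time; paths are placeholders. */
            if (mount("/dev/sdb", "/mnt", "btrfs", 0, "autodefrag") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
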
      48             : 
      49             : static int __compare_inode_defrag(struct inode_defrag *defrag1,
      50             :                                   struct inode_defrag *defrag2)
      51             : {
      52           2 :         if (defrag1->root > defrag2->root)
      53             :                 return 1;
      54           4 :         else if (defrag1->root < defrag2->root)
      55             :                 return -1;
      56           0 :         else if (defrag1->ino > defrag2->ino)
      57             :                 return 1;
      58           0 :         else if (defrag1->ino < defrag2->ino)
      59             :                 return -1;
      60             :         else
      61             :                 return 0;
      62             : }
      63             : 
      64             : /*
      65             :  * Insert a record for an inode into the defrag tree.  The lock must be held
      66             :  * already.
      67             :  *
      68             :  * If you're inserting a record for an older transid than an existing record,
      69             :  * the transid already in the tree is lowered.
      70             :  *
      71             :  * If an existing record is found, the defrag item you pass in is freed.
      72             :  */
      73           2 : static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
      74             :                                     struct inode_defrag *defrag)
      75             : {
      76           2 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
      77           2 :         struct inode_defrag *entry;
      78           2 :         struct rb_node **p;
      79           2 :         struct rb_node *parent = NULL;
      80           2 :         int ret;
      81             : 
      82           2 :         p = &fs_info->defrag_inodes.rb_node;
      83           2 :         while (*p) {
      84           0 :                 parent = *p;
      85           0 :                 entry = rb_entry(parent, struct inode_defrag, rb_node);
      86             : 
      87           0 :                 ret = __compare_inode_defrag(defrag, entry);
      88             :                 if (ret < 0)
      89           0 :                         p = &parent->rb_left;
      90           0 :                 else if (ret > 0)
      91           0 :                         p = &parent->rb_right;
      92             :                 else {
      93             :                         /*
      94             :                          * If we're reinserting an entry for an old defrag run,
      95             :                          * make sure to lower the transid of our existing
      96             :                          * record.
      97             :                          */
      98           0 :                         if (defrag->transid < entry->transid)
      99           0 :                                 entry->transid = defrag->transid;
     100           0 :                         entry->extent_thresh = min(defrag->extent_thresh,
     101             :                                                    entry->extent_thresh);
     102           0 :                         return -EEXIST;
     103             :                 }
     104             :         }
     105           2 :         set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
     106           2 :         rb_link_node(&defrag->rb_node, parent, p);
     107           2 :         rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
     108           2 :         return 0;
     109             : }
     110             : 
     111     1550441 : static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
     112             : {
     113     1550441 :         if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
     114             :                 return 0;
     115             : 
     116          35 :         if (btrfs_fs_closing(fs_info))
     117           0 :                 return 0;
     118             : 
     119             :         return 1;
     120             : }
     121             : 
     122             : /*
     123             :  * Insert a defrag record for this inode if auto defrag is enabled.
     124             :  */
     125     1506502 : int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
     126             :                            struct btrfs_inode *inode, u32 extent_thresh)
     127             : {
     128     1506502 :         struct btrfs_root *root = inode->root;
     129     1506502 :         struct btrfs_fs_info *fs_info = root->fs_info;
     130     1506502 :         struct inode_defrag *defrag;
     131     1506502 :         u64 transid;
     132     1506502 :         int ret;
     133             : 
     134     1506502 :         if (!__need_auto_defrag(fs_info))
     135             :                 return 0;
     136             : 
     137          34 :         if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
     138             :                 return 0;
     139             : 
     140           2 :         if (trans)
     141           0 :                 transid = trans->transid;
     142             :         else
     143           2 :                 transid = inode->root->last_trans;
     144             : 
     145           2 :         defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
     146           2 :         if (!defrag)
     147             :                 return -ENOMEM;
     148             : 
     149           2 :         defrag->ino = btrfs_ino(inode);
     150           2 :         defrag->transid = transid;
     151           2 :         defrag->root = root->root_key.objectid;
     152           2 :         defrag->extent_thresh = extent_thresh;
     153             : 
     154           2 :         spin_lock(&fs_info->defrag_inodes_lock);
     155           2 :         if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
     156             :                 /*
     157             :                  * If we set the IN_DEFRAG flag and evict the inode from memory,
     158             :                  * and then re-read this inode, the new in-memory inode won't have
     159             :                  * IN_DEFRAG set. In that case, we may find an existing defrag record.
     160             :                  */
     161           2 :                 ret = __btrfs_add_inode_defrag(inode, defrag);
     162           2 :                 if (ret)
     163           0 :                         kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
     164             :         } else {
     165           0 :                 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
     166             :         }
     167           2 :         spin_unlock(&fs_info->defrag_inodes_lock);
     168           2 :         return 0;
     169             : }
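
As a hedged illustration of the "higher layer" mentioned for extent_thresh, a write path that has just created a small extent could queue the inode roughly as below. The helper name, the SZ_64K threshold and the variables are hypothetical; this is a sketch, not the in-tree caller:

    /* Hypothetical sketch: queue an inode for autodefrag after a small write. */
    static void maybe_queue_autodefrag(struct btrfs_inode *inode, u64 new_extent_len)
    {
            const u32 extent_thresh = SZ_64K;       /* illustrative threshold */

            /* Only extents smaller than the threshold are worth another pass. */
            if (new_extent_len < extent_thresh)
                    btrfs_add_inode_defrag(NULL, inode, extent_thresh);
    }
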
     170             : 
     171             : /*
     172             :  * Pick the defraggable inode that we want; if it doesn't exist, we will get
     173             :  * the next one.
     174             :  */
     175          14 : static struct inode_defrag *btrfs_pick_defrag_inode(
     176             :                         struct btrfs_fs_info *fs_info, u64 root, u64 ino)
     177             : {
     178          14 :         struct inode_defrag *entry = NULL;
     179          14 :         struct inode_defrag tmp;
     180          14 :         struct rb_node *p;
     181          14 :         struct rb_node *parent = NULL;
     182          14 :         int ret;
     183             : 
     184          14 :         tmp.ino = ino;
     185          14 :         tmp.root = root;
     186             : 
     187          14 :         spin_lock(&fs_info->defrag_inodes_lock);
     188          14 :         p = fs_info->defrag_inodes.rb_node;
     189          16 :         while (p) {
     190           2 :                 parent = p;
     191           2 :                 entry = rb_entry(parent, struct inode_defrag, rb_node);
     192             : 
     193           2 :                 ret = __compare_inode_defrag(&tmp, entry);
     194             :                 if (ret < 0)
     195           2 :                         p = parent->rb_left;
     196           0 :                 else if (ret > 0)
     197           0 :                         p = parent->rb_right;
     198             :                 else
     199           0 :                         goto out;
     200             :         }
     201             : 
     202          14 :         if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
     203           0 :                 parent = rb_next(parent);
     204           0 :                 if (parent)
     205             :                         entry = rb_entry(parent, struct inode_defrag, rb_node);
     206             :                 else
     207             :                         entry = NULL;
     208             :         }
     209          12 : out:
     210          14 :         if (entry)
     211           2 :                 rb_erase(parent, &fs_info->defrag_inodes);
     212          14 :         spin_unlock(&fs_info->defrag_inodes_lock);
     213          14 :         return entry;
     214             : }
     215             : 
     216        3219 : void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
     217             : {
     218        3219 :         struct inode_defrag *defrag;
     219        3219 :         struct rb_node *node;
     220             : 
     221        3219 :         spin_lock(&fs_info->defrag_inodes_lock);
     222        3219 :         node = rb_first(&fs_info->defrag_inodes);
     223        3219 :         while (node) {
     224           0 :                 rb_erase(node, &fs_info->defrag_inodes);
     225           0 :                 defrag = rb_entry(node, struct inode_defrag, rb_node);
     226           0 :                 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
     227             : 
     228           0 :                 cond_resched_lock(&fs_info->defrag_inodes_lock);
     229             : 
     230           0 :                 node = rb_first(&fs_info->defrag_inodes);
     231             :         }
     232        3219 :         spin_unlock(&fs_info->defrag_inodes_lock);
     233        3219 : }
     234             : 
     235             : #define BTRFS_DEFRAG_BATCH      1024
     236             : 
     237           2 : static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
     238             :                                     struct inode_defrag *defrag)
     239             : {
     240           2 :         struct btrfs_root *inode_root;
     241           2 :         struct inode *inode;
     242           2 :         struct btrfs_ioctl_defrag_range_args range;
     243           2 :         int ret = 0;
     244           2 :         u64 cur = 0;
     245             : 
     246           4 : again:
     247           8 :         if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
     248           0 :                 goto cleanup;
     249           4 :         if (!__need_auto_defrag(fs_info))
     250           0 :                 goto cleanup;
     251             : 
     252             :         /* Get the inode */
     253           4 :         inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
     254           4 :         if (IS_ERR(inode_root)) {
     255           0 :                 ret = PTR_ERR(inode_root);
     256           0 :                 goto cleanup;
     257             :         }
     258             : 
     259           4 :         inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
     260           4 :         btrfs_put_root(inode_root);
     261           4 :         if (IS_ERR(inode)) {
     262           0 :                 ret = PTR_ERR(inode);
     263           0 :                 goto cleanup;
     264             :         }
     265             : 
     266           4 :         if (cur >= i_size_read(inode)) {
     267           2 :                 iput(inode);
     268           2 :                 goto cleanup;
     269             :         }
     270             : 
     271             :         /* Do a chunk of defrag */
     272           2 :         clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
     273           2 :         memset(&range, 0, sizeof(range));
     274           2 :         range.len = (u64)-1;
     275           2 :         range.start = cur;
     276           2 :         range.extent_thresh = defrag->extent_thresh;
     277             : 
     278           2 :         sb_start_write(fs_info->sb);
     279           2 :         ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
     280             :                                        BTRFS_DEFRAG_BATCH);
     281           2 :         sb_end_write(fs_info->sb);
     282           2 :         iput(inode);
     283             : 
     284           2 :         if (ret < 0)
     285           0 :                 goto cleanup;
     286             : 
     287           2 :         cur = max(cur + fs_info->sectorsize, range.start);
     288           2 :         goto again;
     289             : 
     290           2 : cleanup:
     291           2 :         kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
     292           2 :         return ret;
     293             : }
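
The btrfs_ioctl_defrag_range_args filled in above is the same structure userspace passes when it drives a manual defrag through the BTRFS_IOC_DEFRAG_RANGE ioctl. A hedged userspace sketch, with an illustrative path and threshold:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/btrfs.h>

    /* Defragment a whole file, mirroring the kernel-side range setup above. */
    static int defrag_file(const char *path)
    {
            struct btrfs_ioctl_defrag_range_args range;
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return -1;
            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = (__u64)-1;                  /* whole file */
            range.extent_thresh = 256 * 1024;       /* illustrative threshold */
            if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
                    perror("BTRFS_IOC_DEFRAG_RANGE");
            close(fd);
            return 0;
    }
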
     294             : 
     295             : /*
     296             :  * Run through the list of inodes in the FS that need defragging.
     297             :  */
     298       44012 : int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
     299             : {
     300       44012 :         struct inode_defrag *defrag;
     301       44012 :         u64 first_ino = 0;
     302       44012 :         u64 root_objectid = 0;
     303             : 
     304       44012 :         atomic_inc(&fs_info->defrag_running);
     305       44017 :         while (1) {
     306             :                 /* Pause the auto defragger. */
     307       88034 :                 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
     308             :                         break;
     309             : 
     310       43962 :                 if (!__need_auto_defrag(fs_info))
     311             :                         break;
     312             : 
     313             :                 /* find an inode to defrag */
     314          14 :                 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
     315          14 :                 if (!defrag) {
     316          12 :                         if (root_objectid || first_ino) {
     317           2 :                                 root_objectid = 0;
     318           2 :                                 first_ino = 0;
     319           2 :                                 continue;
     320             :                         } else {
     321             :                                 break;
     322             :                         }
     323             :                 }
     324             : 
     325           2 :                 first_ino = defrag->ino + 1;
     326           2 :                 root_objectid = defrag->root;
     327             : 
     328           2 :                 __btrfs_run_defrag_inode(fs_info, defrag);
     329             :         }
     330       44013 :         atomic_dec(&fs_info->defrag_running);
     331             : 
     332             :         /*
     333             :          * During unmount, we use the transaction_wait queue to wait for the
     334             :          * defragger to stop.
     335             :          */
     336       44013 :         wake_up(&fs_info->transaction_wait);
     337       44013 :         return 0;
     338             : }
     339             : 
     340             : /*
     341             :  * Defrag all the leaves in a given btree.
     342             :  * Read all the leaves and try to get the key order to
     343             :  * better reflect the on-disk order.
     344             :  */
     345             : 
     346           2 : int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
     347             :                         struct btrfs_root *root)
     348             : {
     349           2 :         struct btrfs_path *path = NULL;
     350           2 :         struct btrfs_key key;
     351           2 :         int ret = 0;
     352           2 :         int wret;
     353           2 :         int level;
     354           2 :         int next_key_ret = 0;
     355           2 :         u64 last_ret = 0;
     356             : 
     357           2 :         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
     358           0 :                 goto out;
     359             : 
     360           2 :         path = btrfs_alloc_path();
     361           2 :         if (!path) {
     362           0 :                 ret = -ENOMEM;
     363           0 :                 goto out;
     364             :         }
     365             : 
     366           2 :         level = btrfs_header_level(root->node);
     367             : 
     368           2 :         if (level == 0)
     369           2 :                 goto out;
     370             : 
     371           0 :         if (root->defrag_progress.objectid == 0) {
     372           0 :                 struct extent_buffer *root_node;
     373           0 :                 u32 nritems;
     374             : 
     375           0 :                 root_node = btrfs_lock_root_node(root);
     376           0 :                 nritems = btrfs_header_nritems(root_node);
     377           0 :                 root->defrag_max.objectid = 0;
     378             :                 /* from above we know this is not a leaf */
     379           0 :                 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
     380           0 :                                       nritems - 1);
     381           0 :                 btrfs_tree_unlock(root_node);
     382           0 :                 free_extent_buffer(root_node);
     383           0 :                 memset(&key, 0, sizeof(key));
     384             :         } else {
     385           0 :                 memcpy(&key, &root->defrag_progress, sizeof(key));
     386             :         }
     387             : 
     388           0 :         path->keep_locks = 1;
     389             : 
     390           0 :         ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
     391           0 :         if (ret < 0)
     392           0 :                 goto out;
     393           0 :         if (ret > 0) {
     394           0 :                 ret = 0;
     395           0 :                 goto out;
     396             :         }
     397           0 :         btrfs_release_path(path);
     398             :         /*
     399             :          * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
     400             :          * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
     401             :          * deadlock later (attempting to write lock an already write locked leaf).
     402             :          */
     403           0 :         path->lowest_level = 1;
     404           0 :         wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
     405             : 
     406           0 :         if (wret < 0) {
     407           0 :                 ret = wret;
     408           0 :                 goto out;
     409             :         }
     410           0 :         if (!path->nodes[1]) {
     411           0 :                 ret = 0;
     412           0 :                 goto out;
     413             :         }
     414             :         /*
     415             :          * The node at level 1 must always be locked when our path has
     416             :          * keep_locks set and lowest_level is 1, regardless of the value of
     417             :          * path->slots[1].
     418             :          */
     419           0 :         BUG_ON(path->locks[1] == 0);
     420           0 :         ret = btrfs_realloc_node(trans, root,
     421             :                                  path->nodes[1], 0,
     422             :                                  &last_ret,
     423             :                                  &root->defrag_progress);
     424           0 :         if (ret) {
     425           0 :                 WARN_ON(ret == -EAGAIN);
     426           0 :                 goto out;
     427             :         }
     428             :         /*
     429             :          * Now that we reallocated the node we can find the next key. Note that
     430             :          * btrfs_find_next_key() can release our path and do another search
     431             :          * without COWing; this is because even with path->keep_locks = 1,
     432             :          * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
     433             :          * node when path->slots[node_level - 1] does not point to the last
     434             :          * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
     435             :          * we search for the next key after reallocating our node.
     436             :          */
     437           0 :         path->slots[1] = btrfs_header_nritems(path->nodes[1]);
     438           0 :         next_key_ret = btrfs_find_next_key(root, path, &key, 1,
     439             :                                            BTRFS_OLDEST_GENERATION);
     440           0 :         if (next_key_ret == 0) {
     441           0 :                 memcpy(&root->defrag_progress, &key, sizeof(key));
     442           0 :                 ret = -EAGAIN;
     443             :         }
     444           0 : out:
     445           2 :         btrfs_free_path(path);
     446           2 :         if (ret == -EAGAIN) {
     447           0 :                 if (root->defrag_max.objectid > root->defrag_progress.objectid)
     448           0 :                         goto done;
     449           0 :                 if (root->defrag_max.type > root->defrag_progress.type)
     450           0 :                         goto done;
     451           0 :                 if (root->defrag_max.offset > root->defrag_progress.offset)
     452           0 :                         goto done;
     453             :                 ret = 0;
     454             :         }
     455           2 : done:
     456           2 :         if (ret != -EAGAIN)
     457           4 :                 memset(&root->defrag_progress, 0,
     458             :                        sizeof(root->defrag_progress));
     459             : 
     460           2 :         return ret;
     461             : }
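
Since -EAGAIN here means "progress was made, call again", the caller is expected to loop, starting a fresh transaction each round, until the whole root has been walked. A hedged sketch of such a loop, with error handling trimmed; it is not the in-tree caller verbatim:

    /* Hedged sketch: keep defragging the leaves of @root until it reports completion. */
    static int defrag_root_sketch(struct btrfs_root *root)
    {
            int ret;

            while (1) {
                    struct btrfs_trans_handle *trans;

                    trans = btrfs_start_transaction(root, 0);
                    if (IS_ERR(trans))
                            return PTR_ERR(trans);
                    ret = btrfs_defrag_leaves(trans, root);
                    btrfs_end_transaction(trans);
                    if (ret != -EAGAIN)
                            break;          /* 0 means done, < 0 means error */
                    cond_resched();
            }
            return ret;
    }
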
     462             : 
     463             : /*
     464             :  * Defrag specific helper to get an extent map.
     465             :  *
     466             :  * Differences between this and btrfs_get_extent() are:
     467             :  *
     468             :  * - No extent_map will be added to inode->extent_tree,
     469             :  *   to reduce memory usage in the long run.
     470             :  *
     471             :  * - Extra optimization to skip file extents older than @newer_than
     472             :  *   By using btrfs_search_forward() we can skip entire file ranges that
     473             :  *   have extents created in past transactions, because btrfs_search_forward()
     474             :  *   will not visit leaves and nodes with a generation smaller than given
     475             :  *   minimal generation threshold (@newer_than).
     476             :  *
     477             :  * Return valid em if we find a file extent matching the requirement.
     478             :  * Return NULL if we cannot find a file extent matching the requirement.
     479             :  *
     480             :  * Return ERR_PTR() for error.
     481             :  */
     482        2111 : static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
     483             :                                             u64 start, u64 newer_than)
     484             : {
     485        2111 :         struct btrfs_root *root = inode->root;
     486        2111 :         struct btrfs_file_extent_item *fi;
     487        2111 :         struct btrfs_path path = { 0 };
     488        2111 :         struct extent_map *em;
     489        2111 :         struct btrfs_key key;
     490        2111 :         u64 ino = btrfs_ino(inode);
     491        2111 :         int ret;
     492             : 
     493        2111 :         em = alloc_extent_map();
     494        2111 :         if (!em) {
     495           0 :                 ret = -ENOMEM;
     496           0 :                 goto err;
     497             :         }
     498             : 
     499        2111 :         key.objectid = ino;
     500        2111 :         key.type = BTRFS_EXTENT_DATA_KEY;
     501        2111 :         key.offset = start;
     502             : 
     503        2111 :         if (newer_than) {
     504          62 :                 ret = btrfs_search_forward(root, &key, &path, newer_than);
     505          62 :                 if (ret < 0)
     506           0 :                         goto err;
     507             :                 /* Can't find anything newer */
     508          62 :                 if (ret > 0)
     509           0 :                         goto not_found;
     510             :         } else {
     511        2049 :                 ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
     512        2049 :                 if (ret < 0)
     513           0 :                         goto err;
     514             :         }
     515        2111 :         if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
     516             :                 /*
     517             :                  * If btrfs_search_slot() makes the path point beyond nritems,
     518             :                  * we should not have an empty leaf, as this inode must at
     519             :                  * least have its INODE_ITEM.
     520             :                  */
     521           1 :                 ASSERT(btrfs_header_nritems(path.nodes[0]));
     522           1 :                 path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
     523             :         }
     524        2111 :         btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
     525             :         /* Perfect match, no need to go one slot back */
     526        2111 :         if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
     527        2102 :             key.offset == start)
     528        2051 :                 goto iterate;
     529             : 
     530             :         /* We didn't find a perfect match, need to go one slot back */
     531          60 :         if (path.slots[0] > 0) {
     532          60 :                 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
     533          60 :                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
     534          51 :                         path.slots[0]--;
     535             :         }
     536             : 
     537           9 : iterate:
     538             :         /* Iterate through the path to find a file extent covering @start */
     539        2226 :         while (true) {
     540        2226 :                 u64 extent_end;
     541             : 
     542        2226 :                 if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
     543           0 :                         goto next;
     544             : 
     545        2226 :                 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
     546             : 
     547             :                 /*
     548             :                  * We may go one slot back to an INODE_REF/XATTR item, then
     549             :                  * need to go forward until we reach an EXTENT_DATA.
     550             :                  * But we should still have the correct ino as key.objectid.
     551             :                  */
     552        2226 :                 if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
     553         124 :                         goto next;
     554             : 
     555             :                 /* It's beyond our target range, definitely not extent found */
     556             :                 /* It's beyond our target range, definitely no matching extent */
     557           0 :                         goto not_found;
     558             : 
     559             :                 /*
     560             :                  *      |       |<- File extent ->|
     561             :                  *      \- start
     562             :                  *
     563             :                  * This means there is a hole between start and key.offset.
     564             :                  */
     565        2102 :                 if (key.offset > start) {
     566          51 :                         em->start = start;
     567          51 :                         em->orig_start = start;
     568          51 :                         em->block_start = EXTENT_MAP_HOLE;
     569          51 :                         em->len = key.offset - start;
     570          51 :                         break;
     571             :                 }
     572             : 
     573        2051 :                 fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
     574             :                                     struct btrfs_file_extent_item);
     575        2051 :                 extent_end = btrfs_file_extent_end(&path);
     576             : 
     577             :                 /*
     578             :                  *      |<- file extent ->|       |
     579             :                  *                              \- start
     580             :                  *
     581             :                  * We haven't reached start, search next slot.
     582             :                  */
     583        2051 :                 if (extent_end <= start)
     584           0 :                         goto next;
     585             : 
     586             :                 /* Now this extent covers @start, convert it to em */
     587        2051 :                 btrfs_extent_item_to_extent_map(inode, &path, fi, em);
     588        2051 :                 break;
     589         124 : next:
     590         124 :                 ret = btrfs_next_item(root, &path);
     591         124 :                 if (ret < 0)
     592           0 :                         goto err;
     593         124 :                 if (ret > 0)
     594           9 :                         goto not_found;
     595             :         }
     596        2102 :         btrfs_release_path(&path);
     597        2102 :         return em;
     598             : 
     599           9 : not_found:
     600           9 :         btrfs_release_path(&path);
     601           9 :         free_extent_map(em);
     602           9 :         return NULL;
     603             : 
     604           0 : err:
     605           0 :         btrfs_release_path(&path);
     606           0 :         free_extent_map(em);
     607           0 :         return ERR_PTR(ret);
     608             : }
     609             : 
     610       13504 : static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
     611             :                                                u64 newer_than, bool locked)
     612             : {
     613       13504 :         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
     614       13504 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
     615       13504 :         struct extent_map *em;
     616       13504 :         const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
     617             : 
     618             :         /*
     619             :          * Hopefully we have this extent in the tree already, try without the
     620             :          * full extent lock.
     621             :          */
     622       13504 :         read_lock(&em_tree->lock);
     623       13504 :         em = lookup_extent_mapping(em_tree, start, sectorsize);
     624       13504 :         read_unlock(&em_tree->lock);
     625             : 
     626             :         /*
     627             :          * We can get a merged extent; in that case, we need to re-search the
     628             :          * tree to get the original em for defrag.
     629             :          *
     630             :          * If @newer_than is 0 or em::generation < newer_than, we can trust
     631             :          * this em, as either we don't care about the generation, or the
     632             :          * merged extent map will be rejected anyway.
     633             :          */
     634       24959 :         if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
     635          62 :             newer_than && em->generation >= newer_than) {
     636          62 :                 free_extent_map(em);
     637          62 :                 em = NULL;
     638             :         }
     639             : 
     640       13504 :         if (!em) {
     641        2111 :                 struct extent_state *cached = NULL;
     642        2111 :                 u64 end = start + sectorsize - 1;
     643             : 
     644             :                 /* Get the big lock and read metadata off disk. */
     645        2111 :                 if (!locked)
     646        2080 :                         lock_extent(io_tree, start, end, &cached);
     647        2111 :                 em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
     648        2111 :                 if (!locked)
     649        2080 :                         unlock_extent(io_tree, start, end, &cached);
     650             : 
     651        2111 :                 if (IS_ERR(em))
     652           0 :                         return NULL;
     653             :         }
     654             : 
     655             :         return em;
     656             : }
     657             : 
     658        6270 : static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
     659             :                                    const struct extent_map *em)
     660             : {
     661       12540 :         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
     662             :                 return BTRFS_MAX_COMPRESSED;
     663        6142 :         return fs_info->max_extent_size;
     664             : }
     665             : 
     666        3783 : static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
     667             :                                      u32 extent_thresh, u64 newer_than, bool locked)
     668             : {
     669        3783 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     670        3783 :         struct extent_map *next;
     671        3783 :         bool ret = false;
     672             : 
     673             :         /* This is the last extent */
     674        3783 :         if (em->start + em->len >= i_size_read(inode))
     675             :                 return false;
     676             : 
     677             :         /*
     678             :          * Here we need to pass @newer_than when checking the next extent, or
     679             :          * we will hit a case where we mark the current extent for defrag, but
     680             :          * the next one will not be a target.
     681             :          * This will just cause extra IO without really reducing the fragments.
     682             :          */
     683        3662 :         next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
     684             :         /* No more em or hole */
     685        3662 :         if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
     686          32 :                 goto out;
     687        7260 :         if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
     688        1271 :                 goto out;
     689             :         /*
     690             :          * If the next extent is at its max capacity, defragging current extent
     691             :          * makes no sense, as the total number of extents won't change.
     692             :          */
     693        2359 :         if (next->len >= get_extent_max_capacity(fs_info, em))
     694           0 :                 goto out;
     695             :         /* Skip older extent */
     696        2359 :         if (next->generation < newer_than)
     697           1 :                 goto out;
     698             :         /* Also check extent size */
     699        2358 :         if (next->len >= extent_thresh)
     700           0 :                 goto out;
     701             : 
     702             :         ret = true;
     703        3662 : out:
     704        3662 :         free_extent_map(next);
     705        3662 :         return ret;
     706             : }
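
A worked example with illustrative numbers: take extent_thresh = 256K, newer_than = 0 and a current 64K extent. If the next extent is a 32K regular extent from a recent write, this returns true and the current extent remains a defrag target, because rewriting both can merge them. If instead the next extent is a hole, is preallocated, is older than newer_than, is at least extent_thresh bytes long, or is already at the current extent's max capacity (BTRFS_MAX_COMPRESSED, 128K, when the current extent is compressed), it returns false, since defragging the current extent would not reduce the extent count.
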
     707             : 
     708             : /*
     709             :  * Prepare one page to be defragged.
     710             :  *
     711             :  * This will ensure:
     712             :  *
     713             :  * - Returned page is locked and has been set up properly.
     714             :  * - No ordered extent exists in the page.
     715             :  * - The page is uptodate.
     716             :  *
     717             :  * NOTE: Caller should also wait for page writeback after the cluster is
     718             :  * prepared, here we don't do writeback wait for each page.
     719             :  */
     720        1723 : static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
     721             : {
     722        1723 :         struct address_space *mapping = inode->vfs_inode.i_mapping;
     723        1723 :         gfp_t mask = btrfs_alloc_write_mask(mapping);
     724        1723 :         u64 page_start = (u64)index << PAGE_SHIFT;
     725        1723 :         u64 page_end = page_start + PAGE_SIZE - 1;
     726        1723 :         struct extent_state *cached_state = NULL;
     727        1723 :         struct page *page;
     728        1723 :         int ret;
     729             : 
     730             : again:
     731        1723 :         page = find_or_create_page(mapping, index, mask);
     732        1723 :         if (!page)
     733             :                 return ERR_PTR(-ENOMEM);
     734             : 
     735             :         /*
     736             :          * Since we can defragment files opened read-only, we can encounter
     737             :          * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
     738             :          * can't do I/O using huge pages yet, so return an error for now.
     739             :          * Filesystem transparent huge pages are typically only used for
     740             :          * executables that explicitly enable them, so this isn't very
     741             :          * restrictive.
     742             :          */
     743        1723 :         if (PageCompound(page)) {
     744           0 :                 unlock_page(page);
     745           0 :                 put_page(page);
     746           0 :                 return ERR_PTR(-ETXTBSY);
     747             :         }
     748             : 
     749        1723 :         ret = set_page_extent_mapped(page);
     750        1723 :         if (ret < 0) {
     751           0 :                 unlock_page(page);
     752           0 :                 put_page(page);
     753           0 :                 return ERR_PTR(ret);
     754             :         }
     755             : 
     756             :         /* Wait for any existing ordered extent in the range */
     757        1723 :         while (1) {
     758        1723 :                 struct btrfs_ordered_extent *ordered;
     759             : 
     760        1723 :                 lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
     761        1723 :                 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
     762        1723 :                 unlock_extent(&inode->io_tree, page_start, page_end,
     763             :                               &cached_state);
     764        1723 :                 if (!ordered)
     765             :                         break;
     766             : 
     767           0 :                 unlock_page(page);
     768           0 :                 btrfs_start_ordered_extent(ordered);
     769           0 :                 btrfs_put_ordered_extent(ordered);
     770           0 :                 lock_page(page);
     771             :                 /*
     772             :                  * We unlocked the page above, so we need to check if it was
     773             :                  * released or not.
     774             :                  */
     775           0 :                 if (page->mapping != mapping || !PagePrivate(page)) {
     776           0 :                         unlock_page(page);
     777           0 :                         put_page(page);
     778           0 :                         goto again;
     779             :                 }
     780             :         }
     781             : 
     782             :         /*
     783             :          * Now the page range has no ordered extent any more.  Read the page to
     784             :          * make it uptodate.
     785             :          */
     786        1723 :         if (!PageUptodate(page)) {
     787           0 :                 btrfs_read_folio(NULL, page_folio(page));
     788           0 :                 lock_page(page);
     789           0 :                 if (page->mapping != mapping || !PagePrivate(page)) {
     790           0 :                         unlock_page(page);
     791           0 :                         put_page(page);
     792           0 :                         goto again;
     793             :                 }
     794           0 :                 if (!PageUptodate(page)) {
     795           0 :                         unlock_page(page);
     796           0 :                         put_page(page);
     797           0 :                         return ERR_PTR(-EIO);
     798             :                 }
     799             :         }
     800             :         return page;
     801             : }
     802             : 
     803             : struct defrag_target_range {
     804             :         struct list_head list;
     805             :         u64 start;
     806             :         u64 len;
     807             : };
     808             : 
     809             : /*
     810             :  * Collect all valid target extents.
     811             :  *
     812             :  * @start:         file offset to lookup
     813             :  * @len:           length to lookup
     814             :  * @extent_thresh: file extent size threshold, any extent size >= this value
     815             :  *                 will be ignored
     816             :  * @newer_than:    only defrag extents newer than this value
     817             :  * @do_compress:   whether the defrag is doing compression;
     818             :  *                 if true, @extent_thresh will be ignored and all regular
     819             :  *                 file extents meeting @newer_than will be targets.
     820             :  * @locked:        if the range has already held extent lock
     821             :  * @target_list:   list of targets file extents
     822             :  */
     823        2198 : static int defrag_collect_targets(struct btrfs_inode *inode,
     824             :                                   u64 start, u64 len, u32 extent_thresh,
     825             :                                   u64 newer_than, bool do_compress,
     826             :                                   bool locked, struct list_head *target_list,
     827             :                                   u64 *last_scanned_ret)
     828             : {
     829        2198 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     830        2198 :         bool last_is_target = false;
     831        2198 :         u64 cur = start;
     832        2198 :         int ret = 0;
     833             : 
     834       12031 :         while (cur < start + len) {
     835        9842 :                 struct extent_map *em;
     836        9842 :                 struct defrag_target_range *new;
     837        9842 :                 bool next_mergeable = true;
     838        9842 :                 u64 range_len;
     839             : 
     840        9842 :                 last_is_target = false;
     841        9842 :                 em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
     842        9842 :                 if (!em)
     843             :                         break;
     844             : 
     845             :                 /*
     846             :                  * If the file extent is an inline one, we may still want to
     847             :                  * defrag it (fall through) if doing so produces a regular extent.
     848             :                  * This is for users who want to convert inline extents to
     849             :                  * regular ones through the max_inline= mount option.
     850             :                  */
     851        9833 :                 if (em->block_start == EXTENT_MAP_INLINE &&
     852           0 :                     em->len <= inode->root->fs_info->max_inline)
     853           0 :                         goto next;
     854             : 
     855             :                 /* Skip hole/delalloc/preallocated extents */
     856        9833 :                 if (em->block_start == EXTENT_MAP_HOLE ||
     857        8177 :                     em->block_start == EXTENT_MAP_DELALLOC ||
     858        8177 :                     test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
     859        5652 :                         goto next;
     860             : 
     861             :                 /* Skip older extent */
     862        4181 :                 if (em->generation < newer_than)
     863           2 :                         goto next;
     864             : 
     865             :                 /* This em is under writeback, no need to defrag */
     866        4179 :                 if (em->generation == (u64)-1)
     867           0 :                         goto next;
     868             : 
     869             :                 /*
     870             :                  * Our start offset might be in the middle of an existing extent
     871             :                  * map, so take that into account.
     872             :                  */
     873        4179 :                 range_len = em->len - (cur - em->start);
     874             :                 /*
     875             :                  * If this range of the extent map is already flagged for delalloc,
     876             :                  * skip it, because:
     877             :                  *
     878             :                  * 1) We could deadlock later, when trying to reserve space for
     879             :                  *    delalloc, because in case we can't immediately reserve space
     880             :                  *    the flusher can start delalloc and wait for the respective
     881             :                  *    ordered extents to complete. The deadlock would happen
     882             :                  *    because we do the space reservation while holding the range
     883             :                  *    locked, and starting writeback, or finishing an ordered
     884             :                  *    extent, requires locking the range;
     885             :                  *
     886             :                  * 2) If there's delalloc there, it means there's dirty pages for
     887             :                  *    which writeback has not started yet (we clean the delalloc
     888             :                  *    flag when starting writeback and after creating an ordered
     889             :                  *    extent). If we mark pages in an adjacent range for defrag,
     890             :                  *    then we will have a larger contiguous range for delalloc,
     891             :                  *    very likely resulting in a larger extent after writeback is
     892             :                  *    triggered (except in a case of free space fragmentation).
     893             :                  */
     894        4179 :                 if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
     895             :                                    EXTENT_DELALLOC, 0, NULL))
     896           6 :                         goto next;
     897             : 
     898             :                 /*
     899             :                  * For the do_compress case, we want to compress all valid file
     900             :                  * extents, thus no @extent_thresh or mergeable check.
     901             :                  */
     902        4173 :                 if (do_compress)
     903         262 :                         goto add;
     904             : 
     905             :                 /* Skip too large extent */
     906        3911 :                 if (range_len >= extent_thresh)
     907           0 :                         goto next;
     908             : 
     909             :                 /*
     910             :                  * Skip extents already at their max capacity; this is mostly
     911             :                  * for compressed extents, whose max capacity is only 128K.
     912             :                  */
     913        3911 :                 if (em->len >= get_extent_max_capacity(fs_info, em))
     914         128 :                         goto next;
     915             : 
     916             :                 /*
     917             :                  * Normally there are no more extents after an inline one, thus
     918             :                  * @next_mergeable will normally be false and not defragged.
     919             :                  * So if an inline extent passed all above checks, just add it
     920             :                  * for defrag, and be converted to regular extents.
     921             :                  */
     922        3783 :                 if (em->block_start == EXTENT_MAP_INLINE)
     923           0 :                         goto add;
     924             : 
     925        3783 :                 next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
     926             :                                                 extent_thresh, newer_than, locked);
     927        3783 :                 if (!next_mergeable) {
     928        1425 :                         struct defrag_target_range *last;
     929             : 
     930             :                         /* Empty target list, no way to merge with last entry */
     931        1425 :                         if (list_empty(target_list))
     932        1290 :                                 goto next;
     933         135 :                         last = list_entry(target_list->prev,
     934             :                                           struct defrag_target_range, list);
     935             :                         /* Not mergeable with last entry */
     936         135 :                         if (last->start + last->len != cur)
     937           3 :                                 goto next;
     938             : 
     939             :                         /* Mergeable, fall through to add it to @target_list. */
     940             :                 }
     941             : 
     942        2490 : add:
     943        2752 :                 last_is_target = true;
     944        2752 :                 range_len = min(extent_map_end(em), start + len) - cur;
     945             :                 /*
     946             :                  * This one is a good target, check if it can be merged into
     947             :                  * last range of the target list.
     948             :                  */
     949        2752 :                 if (!list_empty(target_list)) {
     950        2596 :                         struct defrag_target_range *last;
     951             : 
     952        2596 :                         last = list_entry(target_list->prev,
     953             :                                           struct defrag_target_range, list);
     954        2596 :                         ASSERT(last->start + last->len <= cur);
     955        2596 :                         if (last->start + last->len == cur) {
     956             :                                 /* Mergeable, enlarge the last entry */
     957        2594 :                                 last->len += range_len;
     958        2594 :                                 goto next;
     959             :                         }
     960             :                         /* Fall through to allocate a new entry */
     961             :                 }
     962             : 
     963             :                 /* Allocate new defrag_target_range */
     964         158 :                 new = kmalloc(sizeof(*new), GFP_NOFS);
     965         158 :                 if (!new) {
     966           0 :                         free_extent_map(em);
     967           0 :                         ret = -ENOMEM;
     968           0 :                         break;
     969             :                 }
     970         158 :                 new->start = cur;
     971         158 :                 new->len = range_len;
     972         158 :                 list_add_tail(&new->list, target_list);
     973             : 
     974        9833 : next:
     975        9833 :                 cur = extent_map_end(em);
     976        9833 :                 free_extent_map(em);
     977             :         }
     978           0 :         if (ret < 0) {
     979           0 :                 struct defrag_target_range *entry;
     980           0 :                 struct defrag_target_range *tmp;
     981             : 
     982           0 :                 list_for_each_entry_safe(entry, tmp, target_list, list) {
     983           0 :                         list_del_init(&entry->list);
     984           0 :                         kfree(entry);
     985             :                 }
     986             :         }
     987        2198 :         if (!ret && last_scanned_ret) {
     988             :                 /*
     989             :                  * If the last extent is not a target, the caller can skip to
     990             :                  * the end of that extent.
     991             :                  * Otherwise, we can only go to the end of the specified range.
     992             :                  */
     993          79 :                 if (!last_is_target)
     994           0 :                         *last_scanned_ret = max(cur, *last_scanned_ret);
     995             :                 else
     996          79 :                         *last_scanned_ret = max(start + len, *last_scanned_ret);
     997             :         }
     998        2198 :         return ret;
     999             : }
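
          A side note on the collection loop above: a candidate extent that starts
          exactly where the last collected range ends is merged by enlarging that
          entry; everything else becomes a new list entry. Below is a minimal,
          self-contained userspace sketch of that rule; the struct and helper are
          made up for illustration and are not part of defrag.c.

              #include <stdio.h>
              #include <stddef.h>

              /* Hypothetical flat-array stand-in for the kernel's defrag_target_range list. */
              struct range { unsigned long long start, len; };

              static void collect(struct range *ranges, size_t *nr,
                                  unsigned long long cur, unsigned long long len)
              {
                      /* Mergeable: the new range begins exactly at the end of the last one. */
                      if (*nr && ranges[*nr - 1].start + ranges[*nr - 1].len == cur) {
                              ranges[*nr - 1].len += len;
                              return;
                      }
                      /* Not mergeable (or empty list): append a new entry. */
                      ranges[*nr].start = cur;
                      ranges[*nr].len = len;
                      (*nr)++;
              }

              int main(void)
              {
                      struct range ranges[8];
                      size_t nr = 0;

                      collect(ranges, &nr, 0, 16384);     /* first entry: [0, 16K)       */
                      collect(ranges, &nr, 16384, 4096);  /* contiguous:  merged into it */
                      collect(ranges, &nr, 65536, 4096);  /* gap before it: new entry    */

                      for (size_t i = 0; i < nr; i++)
                              printf("range %zu: start=%llu len=%llu\n",
                                     i, ranges[i].start, ranges[i].len);
                      return 0;  /* prints [0, 20480) and [65536, 69632) as two ranges */
              }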
    1000             : 
    1001             : #define CLUSTER_SIZE    (SZ_256K)
    1002             : static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
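
          CLUSTER_SIZE caps how much is processed per pass; btrfs_defrag_file()
          further down derives each cluster_end so it lands on a page boundary
          256K past the page containing the current offset. A small worked sketch
          of that arithmetic, assuming 4K pages (values are made up):

              #include <stdio.h>

              #define EXAMPLE_PAGE_SHIFT 12          /* assume 4K pages for the example */
              #define EXAMPLE_SZ_256K    (256 * 1024ULL)

              int main(void)
              {
                      unsigned long long cur = 300 * 1024;   /* current offset: 300K */
                      unsigned long long cluster_end;

                      /* Same shape as the cluster_end computation in btrfs_defrag_file(). */
                      cluster_end = (((cur >> EXAMPLE_PAGE_SHIFT) +
                                      (EXAMPLE_SZ_256K >> EXAMPLE_PAGE_SHIFT))
                                     << EXAMPLE_PAGE_SHIFT) - 1;

                      /* 300K rounded down to its page start, plus 256K, minus 1 = 569343. */
                      printf("cur=%llu cluster_end=%llu\n", cur, cluster_end);
                      return 0;
              }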
    1003             : 
    1004             : /*
    1005             :  * Defrag one contiguous target range.
    1006             :  *
    1007             :  * @inode:      target inode
    1008             :  * @target:     target range to defrag
    1009             :  * @pages:      locked pages covering the defrag range
    1010             :  * @nr_pages:   number of locked pages
    1011             :  *
    1012             :  * Caller should ensure:
    1013             :  *
    1014             :  * - Pages are prepared
    1015             :  *   Pages should be locked, no ordered extent in the pages range,
    1016             :  *   no writeback.
    1017             :  *
    1018             :  * - Extent bits are locked
    1019             :  */
    1020          79 : static int defrag_one_locked_target(struct btrfs_inode *inode,
    1021             :                                     struct defrag_target_range *target,
    1022             :                                     struct page **pages, int nr_pages,
    1023             :                                     struct extent_state **cached_state)
    1024             : {
    1025          79 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1026          79 :         struct extent_changeset *data_reserved = NULL;
    1027          79 :         const u64 start = target->start;
    1028          79 :         const u64 len = target->len;
    1029          79 :         unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
    1030          79 :         unsigned long start_index = start >> PAGE_SHIFT;
    1031          79 :         unsigned long first_index = page_index(pages[0]);
    1032          79 :         int ret = 0;
    1033          79 :         int i;
    1034             : 
    1035          79 :         ASSERT(last_index - first_index + 1 <= nr_pages);
    1036             : 
    1037          79 :         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
    1038          79 :         if (ret < 0)
    1039             :                 return ret;
    1040          79 :         clear_extent_bit(&inode->io_tree, start, start + len - 1,
    1041             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
    1042             :                          EXTENT_DEFRAG, cached_state);
    1043          79 :         set_extent_bit(&inode->io_tree, start, start + len - 1,
    1044             :                        EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
    1045             : 
    1046             :         /* Update the page status */
    1047        1802 :         for (i = start_index - first_index; i <= last_index - first_index; i++) {
    1048        1723 :                 ClearPageChecked(pages[i]);
    1049        1723 :                 btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
    1050             :         }
    1051          79 :         btrfs_delalloc_release_extents(inode, len);
    1052          79 :         extent_changeset_free(data_reserved);
    1053             : 
    1054          79 :         return ret;
    1055             : }
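
          One subtlety in defrag_one_locked_target() above: pages[] covers the whole
          locked cluster starting at first_index, while the target range may begin
          later, so the clear/dirty loop indexes pages[start_index - first_index ..
          last_index - first_index]. A toy illustration of that index math, assuming
          4K pages and made-up offsets:

              #include <stdio.h>

              #define EXAMPLE_PAGE_SHIFT 12   /* assume 4K pages */

              int main(void)
              {
                      unsigned long long start = 40960, len = 12288;  /* target: 40K..52K    */
                      unsigned long first_index = 0;                  /* cluster begins at 0 */
                      unsigned long start_index = start >> EXAMPLE_PAGE_SHIFT;            /* 10 */
                      unsigned long last_index = (start + len - 1) >> EXAMPLE_PAGE_SHIFT; /* 12 */

                      /* The 12K target spans three pages of the larger pages[] array. */
                      for (unsigned long i = start_index - first_index;
                           i <= last_index - first_index; i++)
                              printf("touch pages[%lu]\n", i);        /* pages[10], [11], [12] */
                      return 0;
              }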
    1056             : 
    1057          79 : static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
    1058             :                             u32 extent_thresh, u64 newer_than, bool do_compress,
    1059             :                             u64 *last_scanned_ret)
    1060             : {
    1061          79 :         struct extent_state *cached_state = NULL;
    1062          79 :         struct defrag_target_range *entry;
    1063          79 :         struct defrag_target_range *tmp;
    1064          79 :         LIST_HEAD(target_list);
    1065          79 :         struct page **pages;
    1066          79 :         const u32 sectorsize = inode->root->fs_info->sectorsize;
    1067          79 :         u64 last_index = (start + len - 1) >> PAGE_SHIFT;
    1068          79 :         u64 start_index = start >> PAGE_SHIFT;
    1069          79 :         unsigned int nr_pages = last_index - start_index + 1;
    1070          79 :         int ret = 0;
    1071          79 :         int i;
    1072             : 
    1073          79 :         ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
    1074          79 :         ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
    1075             : 
    1076          79 :         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
    1077          79 :         if (!pages)
    1078             :                 return -ENOMEM;
    1079             : 
    1080             :         /* Prepare all pages */
    1081        1802 :         for (i = 0; i < nr_pages; i++) {
    1082        1723 :                 pages[i] = defrag_prepare_one_page(inode, start_index + i);
    1083        1723 :                 if (IS_ERR(pages[i])) {
    1084           0 :                         ret = PTR_ERR(pages[i]);
    1085           0 :                         pages[i] = NULL;
    1086           0 :                         goto free_pages;
    1087             :                 }
    1088             :         }
    1089        1802 :         for (i = 0; i < nr_pages; i++)
    1090        1723 :                 wait_on_page_writeback(pages[i]);
    1091             : 
    1092             :         /* Lock the pages range */
    1093          79 :         lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
    1094          79 :                     (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
    1095             :                     &cached_state);
    1096             :         /*
    1097             :          * Now we have a consistent view of the extent map, re-check
    1098             :          * which range really needs to be defragged.
    1099             :          *
    1100             :          * And this time we already have the extent locked, so pass @locked = true
    1101             :          * so that we won't relock the extent range and cause deadlock.
    1102             :          */
    1103          79 :         ret = defrag_collect_targets(inode, start, len, extent_thresh,
    1104             :                                      newer_than, do_compress, true,
    1105             :                                      &target_list, last_scanned_ret);
    1106          79 :         if (ret < 0)
    1107           0 :                 goto unlock_extent;
    1108             : 
    1109         158 :         list_for_each_entry(entry, &target_list, list) {
    1110          79 :                 ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
    1111             :                                                &cached_state);
    1112          79 :                 if (ret < 0)
    1113             :                         break;
    1114             :         }
    1115             : 
    1116         158 :         list_for_each_entry_safe(entry, tmp, &target_list, list) {
    1117          79 :                 list_del_init(&entry->list);
    1118          79 :                 kfree(entry);
    1119             :         }
    1120          79 : unlock_extent:
    1121          79 :         unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
    1122             :                       (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
    1123             :                       &cached_state);
    1124          79 : free_pages:
    1125        1802 :         for (i = 0; i < nr_pages; i++) {
    1126        1723 :                 if (pages[i]) {
    1127        1723 :                         unlock_page(pages[i]);
    1128        1723 :                         put_page(pages[i]);
    1129             :                 }
    1130             :         }
    1131          79 :         kfree(pages);
    1132          79 :         return ret;
    1133             : }
    1134             : 
    1135        2119 : static int defrag_one_cluster(struct btrfs_inode *inode,
    1136             :                               struct file_ra_state *ra,
    1137             :                               u64 start, u32 len, u32 extent_thresh,
    1138             :                               u64 newer_than, bool do_compress,
    1139             :                               unsigned long *sectors_defragged,
    1140             :                               unsigned long max_sectors,
    1141             :                               u64 *last_scanned_ret)
    1142             : {
    1143        2119 :         const u32 sectorsize = inode->root->fs_info->sectorsize;
    1144        2119 :         struct defrag_target_range *entry;
    1145        2119 :         struct defrag_target_range *tmp;
    1146        2119 :         LIST_HEAD(target_list);
    1147        2119 :         int ret;
    1148             : 
    1149        2119 :         ret = defrag_collect_targets(inode, start, len, extent_thresh,
    1150             :                                      newer_than, do_compress, false,
    1151             :                                      &target_list, NULL);
    1152        2119 :         if (ret < 0)
    1153           0 :                 goto out;
    1154             : 
    1155        2198 :         list_for_each_entry(entry, &target_list, list) {
    1156          79 :                 u32 range_len = entry->len;
    1157             : 
    1158             :                 /* Reached or beyond the limit */
    1159          79 :                 if (max_sectors && *sectors_defragged >= max_sectors) {
    1160             :                         ret = 1;
    1161             :                         break;
    1162             :                 }
    1163             : 
    1164          79 :                 if (max_sectors)
    1165           1 :                         range_len = min_t(u32, range_len,
    1166             :                                 (max_sectors - *sectors_defragged) * sectorsize);
    1167             : 
    1168             :                 /*
    1169             :                  * If defrag_one_range() has updated last_scanned_ret,
    1170             :                  * our range may already be invalid (e.g. hole punched).
    1171             :                  * Skip if our range is before last_scanned_ret, as there is
    1172             :                  * no need to defrag the range anymore.
    1173             :                  */
    1174          79 :                 if (entry->start + range_len <= *last_scanned_ret)
    1175           0 :                         continue;
    1176             : 
    1177          79 :                 if (ra)
    1178          79 :                         page_cache_sync_readahead(inode->vfs_inode.i_mapping,
    1179             :                                 ra, NULL, entry->start >> PAGE_SHIFT,
    1180          79 :                                 ((entry->start + range_len - 1) >> PAGE_SHIFT) -
    1181          79 :                                 (entry->start >> PAGE_SHIFT) + 1);
    1182             :                 /*
    1183             :                  * Here we may not defrag any range if holes are punched before
    1184             :                  * we locked the pages.
    1185             :                  * But that's fine; it only affects the @sectors_defragged
    1186             :                  * accounting.
    1187             :                  */
    1188          79 :                 ret = defrag_one_range(inode, entry->start, range_len,
    1189             :                                        extent_thresh, newer_than, do_compress,
    1190             :                                        last_scanned_ret);
    1191          79 :                 if (ret < 0)
    1192             :                         break;
    1193          79 :                 *sectors_defragged += range_len >>
    1194          79 :                                       inode->root->fs_info->sectorsize_bits;
    1195             :         }
    1196        2119 : out:
    1197        2198 :         list_for_each_entry_safe(entry, tmp, &target_list, list) {
    1198          79 :                 list_del_init(&entry->list);
    1199          79 :                 kfree(entry);
    1200             :         }
    1201        2119 :         if (ret >= 0)
    1202        2119 :                 *last_scanned_ret = max(*last_scanned_ret, start + len);
    1203        2119 :         return ret;
    1204             : }
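
          The max_sectors handling in defrag_one_cluster() above both stops the walk
          once the limit is hit and clamps the final range so the running total never
          overshoots. A small standalone sketch of that accounting, with made-up
          numbers and an assumed 4K sector size:

              #include <stdio.h>

              int main(void)
              {
                      const unsigned int sectorsize = 4096;     /* assumed sector size          */
                      unsigned long max_sectors = 10;           /* cap: 10 sectors (40K) total  */
                      unsigned long sectors_defragged = 7;      /* already accounted: 7 sectors */
                      unsigned int range_len = 32768;           /* next target range: 32K       */

                      if (max_sectors && sectors_defragged >= max_sectors) {
                              printf("limit already reached, stop\n");
                              return 0;
                      }
                      /* Clamp the range so the running total lands exactly on the cap. */
                      if (max_sectors &&
                          range_len > (max_sectors - sectors_defragged) * sectorsize)
                              range_len = (max_sectors - sectors_defragged) * sectorsize;

                      sectors_defragged += range_len / sectorsize;
                      /* Prints: defrag 12288 bytes, total 10 sectors */
                      printf("defrag %u bytes, total %lu sectors\n",
                             range_len, sectors_defragged);
                      return 0;
              }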
    1205             : 
    1206             : /*
    1207             :  * Entry point to file defragmentation.
    1208             :  *
    1209             :  * @inode:         inode to be defragged
    1210             :  * @ra:            readahead state (can be NULL)
    1211             :  * @range:         defrag options including range and flags
    1212             :  * @newer_than:    minimum transid to defrag
    1213             :  * @max_to_defrag: max number of sectors to be defragged; if 0, the whole inode
    1214             :  *                 will be defragged.
    1215             :  *
    1216             :  * Return <0 for error.
    1217             :  * Return >=0 for the number of sectors defragged, and range->start will be updated
    1218             :  * to indicate the file offset where next defrag should be started at.
    1219             :  * to indicate the file offset where the next defrag should start.
    1220             :  * (Mostly for autodefrag, which sets @max_to_defrag, so we may exit early without
    1221             :  *  defragging the whole range).
    1222         189 : int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
    1223             :                       struct btrfs_ioctl_defrag_range_args *range,
    1224             :                       u64 newer_than, unsigned long max_to_defrag)
    1225             : {
    1226         189 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1227         189 :         unsigned long sectors_defragged = 0;
    1228         189 :         u64 isize = i_size_read(inode);
    1229         189 :         u64 cur;
    1230         189 :         u64 last_byte;
    1231         189 :         bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
    1232         189 :         bool ra_allocated = false;
    1233         189 :         int compress_type = BTRFS_COMPRESS_ZLIB;
    1234         189 :         int ret = 0;
    1235         189 :         u32 extent_thresh = range->extent_thresh;
    1236         189 :         pgoff_t start_index;
    1237             : 
    1238         189 :         if (isize == 0)
    1239             :                 return 0;
    1240             : 
    1241          86 :         if (range->start >= isize)
    1242             :                 return -EINVAL;
    1243             : 
    1244          86 :         if (do_compress) {
    1245           4 :                 if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
    1246             :                         return -EINVAL;
    1247           4 :                 if (range->compress_type)
    1248           4 :                         compress_type = range->compress_type;
    1249             :         }
    1250             : 
    1251          86 :         if (extent_thresh == 0)
    1252           0 :                 extent_thresh = SZ_256K;
    1253             : 
    1254          86 :         if (range->start + range->len > range->start) {
    1255             :                 /* Got a specific range */
    1256          86 :                 last_byte = min(isize, range->start + range->len);
    1257             :         } else {
    1258             :                 /* Defrag until file end */
    1259             :                 last_byte = isize;
    1260             :         }
    1261             : 
    1262             :         /* Align the range */
    1263          86 :         cur = round_down(range->start, fs_info->sectorsize);
    1264          86 :         last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
    1265             : 
    1266             :         /*
    1267             :          * If we were not given a ra, allocate a readahead context. As
    1268             :          * readahead is just an optimization, defrag will work without it so
    1269             :          * we don't error out.
    1270             :          */
    1271          86 :         if (!ra) {
    1272           2 :                 ra_allocated = true;
    1273           2 :                 ra = kzalloc(sizeof(*ra), GFP_KERNEL);
    1274           2 :                 if (ra)
    1275           2 :                         file_ra_state_init(ra, inode->i_mapping);
    1276             :         }
    1277             : 
    1278             :         /*
    1279             :          * Make writeback start from the beginning of the range, so that the
    1280             :          * defrag range can be written sequentially.
    1281             :          */
    1282          86 :         start_index = cur >> PAGE_SHIFT;
    1283          86 :         if (start_index < inode->i_mapping->writeback_index)
    1284          22 :                 inode->i_mapping->writeback_index = start_index;
    1285             : 
    1286        2205 :         while (cur < last_byte) {
    1287        2120 :                 const unsigned long prev_sectors_defragged = sectors_defragged;
    1288        2120 :                 u64 last_scanned = cur;
    1289        2120 :                 u64 cluster_end;
    1290             : 
    1291        2120 :                 if (btrfs_defrag_cancelled(fs_info)) {
    1292             :                         ret = -EAGAIN;
    1293           1 :                         break;
    1294             :                 }
    1295             : 
    1296             :                 /* We want the cluster end at a page boundary when possible */
    1297        2120 :                 cluster_end = (((cur >> PAGE_SHIFT) +
    1298        2120 :                                (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
    1299        2120 :                 cluster_end = min(cluster_end, last_byte);
    1300             : 
    1301        2120 :                 btrfs_inode_lock(BTRFS_I(inode), 0);
    1302        2120 :                 if (IS_SWAPFILE(inode)) {
    1303           1 :                         ret = -ETXTBSY;
    1304           1 :                         btrfs_inode_unlock(BTRFS_I(inode), 0);
    1305           1 :                         break;
    1306             :                 }
    1307        2119 :                 if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
    1308           0 :                         btrfs_inode_unlock(BTRFS_I(inode), 0);
    1309           0 :                         break;
    1310             :                 }
    1311        2119 :                 if (do_compress)
    1312           7 :                         BTRFS_I(inode)->defrag_compress = compress_type;
    1313        2119 :                 ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
    1314        2119 :                                 cluster_end + 1 - cur, extent_thresh,
    1315             :                                 newer_than, do_compress, &sectors_defragged,
    1316             :                                 max_to_defrag, &last_scanned);
    1317             : 
    1318        2119 :                 if (sectors_defragged > prev_sectors_defragged)
    1319          77 :                         balance_dirty_pages_ratelimited(inode->i_mapping);
    1320             : 
    1321        2119 :                 btrfs_inode_unlock(BTRFS_I(inode), 0);
    1322        2119 :                 if (ret < 0)
    1323             :                         break;
    1324        2119 :                 cur = max(cluster_end + 1, last_scanned);
    1325        2119 :                 if (ret > 0) {
    1326             :                         ret = 0;
    1327             :                         break;
    1328             :                 }
    1329        2119 :                 cond_resched();
    1330             :         }
    1331             : 
    1332          86 :         if (ra_allocated)
    1333           2 :                 kfree(ra);
    1334             :         /*
    1335             :          * Update range.start for autodefrag; this will indicate where to start
    1336             :          * in the next run.
    1337             :          */
    1338          86 :         range->start = cur;
    1339          86 :         if (sectors_defragged) {
    1340             :                 /*
    1341             :                  * We have defragged some sectors; for the compression case they
    1342             :                  * need to be written back immediately.
    1343             :                  */
    1344          68 :                 if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
    1345          54 :                         filemap_flush(inode->i_mapping);
    1346          54 :                         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
    1347             :                                      &BTRFS_I(inode)->runtime_flags))
    1348           3 :                                 filemap_flush(inode->i_mapping);
    1349             :                 }
    1350          68 :                 if (range->compress_type == BTRFS_COMPRESS_LZO)
    1351           0 :                         btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
    1352          68 :                 else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
    1353           2 :                         btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
    1354          68 :                 ret = sectors_defragged;
    1355             :         }
    1356          86 :         if (do_compress) {
    1357           4 :                 btrfs_inode_lock(BTRFS_I(inode), 0);
    1358           4 :                 BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
    1359           4 :                 btrfs_inode_unlock(BTRFS_I(inode), 0);
    1360             :         }
    1361             :         return ret;
    1362             : }
    1363             : 
    1364           0 : void __cold btrfs_auto_defrag_exit(void)
    1365             : {
    1366           0 :         kmem_cache_destroy(btrfs_inode_defrag_cachep);
    1367           0 : }
    1368             : 
    1369          11 : int __init btrfs_auto_defrag_init(void)
    1370             : {
    1371          11 :         btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
    1372             :                                         sizeof(struct inode_defrag), 0,
    1373             :                                         SLAB_MEM_SPREAD,
    1374             :                                         NULL);
    1375          11 :         if (!btrfs_inode_defrag_cachep)
    1376           0 :                 return -ENOMEM;
    1377             : 
    1378             :         return 0;
    1379             : }

Generated by: LCOV version 1.14