LCOV - code coverage report
Current view: top level - fs/btrfs - tree-log.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023 Lines: 2912 3451 84.4 %
Date: 2023-07-31 20:08:22 Functions: 92 96 95.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2008 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/sched.h>
       7             : #include <linux/slab.h>
       8             : #include <linux/blkdev.h>
       9             : #include <linux/list_sort.h>
      10             : #include <linux/iversion.h>
      11             : #include "misc.h"
      12             : #include "ctree.h"
      13             : #include "tree-log.h"
      14             : #include "disk-io.h"
      15             : #include "locking.h"
      16             : #include "print-tree.h"
      17             : #include "backref.h"
      18             : #include "compression.h"
      19             : #include "qgroup.h"
      20             : #include "block-group.h"
      21             : #include "space-info.h"
      22             : #include "zoned.h"
      23             : #include "inode-item.h"
      24             : #include "fs.h"
      25             : #include "accessors.h"
      26             : #include "extent-tree.h"
      27             : #include "root-tree.h"
      28             : #include "dir-item.h"
      29             : #include "file-item.h"
      30             : #include "file.h"
      31             : #include "orphan.h"
      32             : #include "tree-checker.h"
      33             : 
      34             : #define MAX_CONFLICT_INODES 10
      35             : 
      36             : /* Values for the inode_only argument of btrfs_log_inode():
      37             :  *
      38             :  * LOG_INODE_ALL means to log everything
      39             :  * LOG_INODE_EXISTS means to log just enough to recreate the inode
      40             :  * during log replay
      41             :  */
      42             : enum {
      43             :         LOG_INODE_ALL,
      44             :         LOG_INODE_EXISTS,
      45             : };
      46             : 
      47             : /*
      48             :  * directory trouble cases
      49             :  *
      50             :  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
      51             :  * log, we must force a full commit before doing an fsync of the directory
      52             :  * where the unlink was done.
      53             :  * ---> record transid of last unlink/rename per directory
      54             :  *
      55             :  * mkdir foo/some_dir
      56             :  * normal commit
      57             :  * rename foo/some_dir foo2/some_dir
      58             :  * mkdir foo/some_dir
      59             :  * fsync foo/some_dir/some_file
      60             :  *
      61             :  * The fsync above will unlink the original some_dir without recording
      62             :  * it in its new location (foo2).  After a crash, some_dir will be gone
      63             :  * unless the fsync of some_file forces a full commit
      64             :  *
      65             :  * 2) we must log any new names for any file or dir that is in the fsync
      66             :  * log. ---> check inode while renaming/linking.
      67             :  *
      68             :  * 2a) we must log any new names for any file or dir during rename
      69             :  * when the directory they are being removed from was logged.
      70             :  * ---> check inode and old parent dir during rename
      71             :  *
      72             :  *  2a is actually the more important variant.  Without the extra logging
      73             :  *  a crash might unlink the old name without recreating the new one
      74             :  *
      75             :  * 3) after a crash, we must go through any directories with a link count
      76             :  * of zero and redo the rm -rf
      77             :  *
      78             :  * mkdir f1/foo
      79             :  * normal commit
      80             :  * rm -rf f1/foo
      81             :  * fsync(f1)
      82             :  *
      83             :  * The directory f1/foo was fully removed from the FS, but fsync was never
      84             :  * called on f1/foo, only its parent dir.  After a crash the rm -rf must
      85             :  * be replayed.  This must be able to recurse down the entire
      86             :  * directory tree.  The inode link count fixup code takes care of the
      87             :  * ugly details.
      88             :  */
      89             : 
      90             : /*
      91             :  * Stages for the tree walking, in the order they are declared.  The first
      92             :  * stage (LOG_WALK_PIN_ONLY) is to only pin down the blocks we find,
      93             :  * the second stage (LOG_WALK_REPLAY_INODES) is to make sure that all the
      94             :  * inodes we find in the log are created in the subvolume.
      95             :  *
      96             :  * The remaining stages (LOG_WALK_REPLAY_DIR_INDEX, LOG_WALK_REPLAY_ALL) deal
      97             :  * with directories and links and extents and all the other fun semantics
      98             :  */
      99             : enum {
     100             :         LOG_WALK_PIN_ONLY,
     101             :         LOG_WALK_REPLAY_INODES,
     102             :         LOG_WALK_REPLAY_DIR_INDEX,
     103             :         LOG_WALK_REPLAY_ALL,
     104             : };
     105             : 
     106             : static int btrfs_log_inode(struct btrfs_trans_handle *trans,
     107             :                            struct btrfs_inode *inode,
     108             :                            int inode_only,
     109             :                            struct btrfs_log_ctx *ctx);
     110             : static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
     111             :                              struct btrfs_root *root,
     112             :                              struct btrfs_path *path, u64 objectid);
     113             : static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
     114             :                                        struct btrfs_root *root,
     115             :                                        struct btrfs_root *log,
     116             :                                        struct btrfs_path *path,
     117             :                                        u64 dirid, int del_all);
     118             : static void wait_log_commit(struct btrfs_root *root, int transid);
     119             : 
     120             : /*
     121             :  * tree logging is a special write ahead log used to make sure that
     122             :  * fsyncs and O_SYNCs can happen without doing full tree commits.
     123             :  *
     124             :  * Full tree commits are expensive because they require commonly
     125             :  * modified blocks to be recowed, creating many dirty pages in the
     126             :  * extent tree and a 4x-6x higher write load than ext3.
     127             :  *
     128             :  * Instead of doing a tree commit on every fsync, we use the
     129             :  * key ranges and transaction ids to find items for a given file or directory
     130             :  * that have changed in this transaction.  Those items are copied into
     131             :  * a special tree (one per subvolume root), that tree is written to disk
     132             :  * and then the fsync is considered complete.
     133             :  *
     134             :  * After a crash, items are copied out of the log-tree back into the
     135             :  * subvolume tree.  Any file data extents found are recorded in the extent
     136             :  * allocation tree, and the log-tree freed.
     137             :  *
     138             :  * The log tree is read three times: once to pin down all the extents it is
     139             :  * using in ram, once to create all the inodes logged in the tree,
     140             :  * and once to do all the other items.
     141             :  */
     142             : 
     143             : /*
     144             :  * Start a sub transaction and set up the log tree (and the log root tree) if needed.
     145             :  * This increments the log tree writer count to make the people syncing the tree wait
     146             :  * for us to finish.  Returns 0, BTRFS_LOG_FORCE_COMMIT or the error from tree creation.
     147             :  */
     148      253735 : static int start_log_trans(struct btrfs_trans_handle *trans,
     149             :                            struct btrfs_root *root,
     150             :                            struct btrfs_log_ctx *ctx)
     151             : {
     152      253735 :         struct btrfs_fs_info *fs_info = root->fs_info;
     153      253735 :         struct btrfs_root *tree_root = fs_info->tree_root;
     154      253735 :         const bool zoned = btrfs_is_zoned(fs_info);
     155      253735 :         int ret = 0;
     156      253735 :         bool created = false;
     157             : 
     158             :         /*
     159             :          * First check if the log root tree was already created. If not, create
     160             :          * it before locking the root's log_mutex, just to keep lockdep happy.
     161             :          */
     162      253735 :         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
     163        4782 :                 mutex_lock(&tree_root->log_mutex);
     164        4783 :                 if (!fs_info->log_root_tree) {
     165        4757 :                         ret = btrfs_init_log_root_tree(trans, fs_info);
     166        4757 :                         if (!ret) {
     167        4757 :                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
     168        4757 :                                 created = true;
     169             :                         }
     170             :                 }
     171        4783 :                 mutex_unlock(&tree_root->log_mutex);
     172        4783 :                 if (ret)
     173             :                         return ret;
     174             :         }
     175             : 
     176      253736 :         mutex_lock(&root->log_mutex);
     177             : 
     178      253741 : again:
     179      253741 :         if (root->log_root) {
     180      248932 :                 int index = (root->log_transid + 1) % 2;
     181             : 
     182      248932 :                 if (btrfs_need_log_full_commit(trans)) {
     183         260 :                         ret = BTRFS_LOG_FORCE_COMMIT;
     184         260 :                         goto out;
     185             :                 }
     186             : 
     187      248672 :                 if (zoned && atomic_read(&root->log_commit[index])) {
     188           0 :                         wait_log_commit(root, root->log_transid - 1);
     189           0 :                         goto again; /* re-check everything after waiting */
     190             :                 }
     191             : 
     192      248672 :                 if (!root->log_start_pid) { /* no task recorded yet: remember ours */
     193      237154 :                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
     194      237158 :                         root->log_start_pid = current->pid;
     195       11518 :                 } else if (root->log_start_pid != current->pid) { /* another pid is recorded */
     196       10663 :                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
     197             :                 }
     198             :         } else {
     199             :                 /*
     200             :                  * On a zoned fs, !created means fs_info->log_root_tree was
     201             :                  * already created for some other FS tree. Force a full commit
     202             :                  * so that nodes from multiple log transactions are not mixed,
     203             :                  * which keeps the writing sequential.
     204             :                  */
     205        4809 :                 if (zoned && !created) {
     206           0 :                         ret = BTRFS_LOG_FORCE_COMMIT;
     207           0 :                         goto out;
     208             :                 }
     209             : 
     210        4809 :                 ret = btrfs_add_log_tree(trans, root);
     211        4809 :                 if (ret)
     212           0 :                         goto out;
     213             : 
     214        4809 :                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
     215        4809 :                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
     216        4809 :                 root->log_start_pid = current->pid;
     217             :         }
     218             : 
     219      253485 :         atomic_inc(&root->log_writers); /* decremented in btrfs_end_log_trans() */
     220      253486 :         if (!ctx->logging_new_name) {
     221      252276 :                 int index = root->log_transid % 2; /* list slot of the current log transid */
     222      252276 :                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
     223      252275 :                 ctx->log_transid = root->log_transid;
     224             :         }
     225             : 
     226        1210 : out:
     227      253745 :         mutex_unlock(&root->log_mutex);
     228      253745 :         return ret;
     229             : }
     230             : 
     231             : /*
     232             :  * Join the running log transaction of @root by incrementing its log_writers count.
     233             :  * Returns 0 if there was a log transaction running and we were able to join,
     234             :  * or returns -ENOENT if there were no transactions in progress (no log tree).
     235             :  */
     236        1737 : static int join_running_log_trans(struct btrfs_root *root)
     237             : {
     238        1737 :         const bool zoned = btrfs_is_zoned(root->fs_info);
     239        1737 :         int ret = -ENOENT;
     240             : 
     241        1737 :         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
     242             :                 return ret;
     243             : 
     244        1737 :         mutex_lock(&root->log_mutex);
     245        1737 : again:
     246        1737 :         if (root->log_root) {
     247        1737 :                 int index = (root->log_transid + 1) % 2;
     248             : 
     249        1737 :                 ret = 0;
     250        1737 :                 if (zoned && atomic_read(&root->log_commit[index])) {
     251           0 :                         wait_log_commit(root, root->log_transid - 1);
     252           0 :                         goto again; /* re-check log_root after waiting */
     253             :                 }
     254        1737 :                 atomic_inc(&root->log_writers);
     255             :         }
     256        1737 :         mutex_unlock(&root->log_mutex);
     257        1737 :         return ret;
     258             : }
     259             : 
     260             : /*
     261             :  * Increment the log_writers count of @root.  This either makes the current
     262             :  * running log transaction wait until you call btrfs_end_log_trans() or it
     263             :  * makes any future log transactions wait until you call btrfs_end_log_trans()
     264             :  */
     265       39366 : void btrfs_pin_log_trans(struct btrfs_root *root)
     266             : {
     267       39366 :         atomic_inc(&root->log_writers);
     268       39366 : }
     269             : 
     270             : /*
     271             :  * Indicate we're done making changes to the log tree: decrement log_writers
     272             :  * and, if it dropped to zero, wake up anyone waiting to do a sync
     273             :  */
     274      294519 : void btrfs_end_log_trans(struct btrfs_root *root)
     275             : {
     276      294519 :         if (atomic_dec_and_test(&root->log_writers)) {
     277             :                 /* atomic_dec_and_test implies a barrier */
     278      272988 :                 cond_wake_up_nomb(&root->log_writer_wait);
     279             :         }
     280      294555 : }
     281             : 
     282             : /*
     283             :  * The walk control struct is used to pass state down the chain when
     284             :  * processing the log tree.  The stage field tells us which part
     285             :  * of the log tree processing we are currently doing.  The others
     286             :  * are state fields used for that specific part.
     287             :  */
     288             : struct walk_control {
     289             :         /* Should we free the extent on disk when done?  This is used
     290             :          * at transaction commit time while freeing a log tree.
     291             :          */
     292             :         int free;
     293             : 
     294             :         /* Pin only walk: we record which extents on disk belong to the
     295             :          * log trees.  Checked by process_one_buffer().
     296             :          */
     297             :         int pin;
     298             : 
     299             :         /* What stage of the replay code we're currently in (presumably a LOG_WALK_* value) */
     300             :         int stage;
     301             : 
     302             :         /*
     303             :          * Ignore any items from the inode currently being processed. Needs
     304             :          * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
     305             :          * the LOG_WALK_REPLAY_INODES stage.
     306             :          */
     307             :         bool ignore_cur_inode;
     308             : 
     309             :         /* The root we are currently replaying */
     310             :         struct btrfs_root *replay_dest;
     311             : 
     312             :         /* The trans handle for the current replay */
     313             :         struct btrfs_trans_handle *trans;
     314             : 
     315             :         /* The function that gets used to process blocks we find in the
     316             :          * tree.  Note the extent_buffer might not be up to date when it is
     317             :          * passed in, and it must be checked or read if you need the data
     318             :          * inside it.  A non-zero return is an error (see process_one_buffer()).
     319             :          */
     320             :         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
     321             :                             struct walk_control *wc, u64 gen, int level);
     322             : };
     323             : 
     324             : /*
     325             :  * process_func that pins down the extent of @eb when wc->pin is set and, for uptodate leaves, excludes logged extents
     326             :  */
     327       29650 : static int process_one_buffer(struct btrfs_root *log,
     328             :                               struct extent_buffer *eb,
     329             :                               struct walk_control *wc, u64 gen, int level)
     330             : {
     331       29650 :         struct btrfs_fs_info *fs_info = log->fs_info;
     332       29650 :         int ret = 0;
     333             : 
     334             :         /*
     335             :          * If this fs is mixed then we need to be able to process the leaves to
     336             :          * pin down any logged extents, so we have to read the block, verifying
     337             :          * it against the expected @level and generation (@gen).
     338             :          */
     339       29650 :         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
     340           0 :                 struct btrfs_tree_parent_check check = {
     341             :                         .level = level,
     342             :                         .transid = gen
     343             :                 };
     344             : 
     345           0 :                 ret = btrfs_read_extent_buffer(eb, &check);
     346           0 :                 if (ret)
     347           0 :                         return ret;
     348             :         }
     349             : 
     350       29650 :         if (wc->pin) {
     351        5132 :                 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
     352        5132 :                                                       eb->len);
     353        5132 :                 if (ret)
     354             :                         return ret;
     355             : 
     356        5132 :                 if (btrfs_buffer_uptodate(eb, gen, 0) &&
     357             :                     btrfs_header_level(eb) == 0) /* level 0: a leaf */
     358        4583 :                         ret = btrfs_exclude_logged_extents(eb);
     359             :         }
     360             :         return ret;
     361             : }
     361             : 
     362             : /*
     363             :  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
     364             :  * to the src data we are copying out.
     365             :  *
     366             :  * root is the tree we are copying into, and path is a scratch
     367             :  * path for use in this function (it should be released on entry and
     368             :  * will be released on exit).
     369             :  *
     370             :  * If the key is already in the destination tree the existing item is
     371             :  * overwritten.  If the existing item isn't big enough, it is extended.
     372             :  * If it is too large, it is truncated.
     373             :  *
     374             :  * If the key isn't in the destination yet, a new item is inserted.
     375             :  */
     376       10616 : static int overwrite_item(struct btrfs_trans_handle *trans,
     377             :                           struct btrfs_root *root,
     378             :                           struct btrfs_path *path,
     379             :                           struct extent_buffer *eb, int slot,
     380             :                           struct btrfs_key *key)
     381             : {
     382       10616 :         int ret;
     383       10616 :         u32 item_size;
     384       10616 :         u64 saved_i_size = 0;
     385       10616 :         int save_old_i_size = 0;
     386       10616 :         unsigned long src_ptr;
     387       10616 :         unsigned long dst_ptr;
     388       10616 :         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
     389             : 
     390             :         /*
     391             :          * This is only used during log replay, so the root is always from a
     392             :          * fs/subvolume tree. In case we ever need to support a log root, then
     393             :          * we'll have to clone the leaf in the path, release the path and use
     394             :          * the leaf before writing into the log tree. See the comments at
     395             :          * copy_items() for more details.
     396             :          */
     397       10616 :         ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
     398             : 
     399       10616 :         item_size = btrfs_item_size(eb, slot);
     400       10616 :         src_ptr = btrfs_item_ptr_offset(eb, slot);
     401             : 
     402             :         /* Look for the key in the destination tree. */
     403       10616 :         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
     404       10616 :         if (ret < 0)
     405             :                 return ret;
     406             : 
     407       10616 :         if (ret == 0) {
     408       10271 :                 char *src_copy;
     409       10271 :                 char *dst_copy;
     410       10271 :                 u32 dst_size = btrfs_item_size(path->nodes[0],
     411             :                                                   path->slots[0]);
     412       10271 :                 if (dst_size != item_size)
     413           4 :                         goto insert;
     414             : 
     415       10267 :                 if (item_size == 0) {
     416           0 :                         btrfs_release_path(path);
     417           0 :                         return 0;
     418             :                 }
     419       10267 :                 dst_copy = kmalloc(item_size, GFP_NOFS);
     420       10267 :                 src_copy = kmalloc(item_size, GFP_NOFS);
     421       10267 :                 if (!dst_copy || !src_copy) {
     422           0 :                         btrfs_release_path(path);
     423           0 :                         kfree(dst_copy);
     424           0 :                         kfree(src_copy);
     425           0 :                         return -ENOMEM;
     426             :                 }
     427             : 
     428       10267 :                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
     429             : 
     430       10267 :                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
     431       10267 :                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
     432             :                                    item_size);
     433       10267 :                 ret = memcmp(dst_copy, src_copy, item_size);
     434             : 
     435       10267 :                 kfree(dst_copy);
     436       10267 :                 kfree(src_copy);
     437             :                 /*
     438             :                  * they have the same contents, just return, this saves
     439             :                  * us from cowing blocks in the destination tree and doing
     440             :                  * extra writes that may not have been done by a previous
     441             :                  * sync
     442             :                  */
     443       10267 :                 if (ret == 0) {
     444        5858 :                         btrfs_release_path(path);
     445        5858 :                         return 0;
     446             :                 }
     447             : 
     448             :                 /*
     449             :                  * We need to load the old nbytes into the inode so when we
     450             :                  * replay the extents we've logged we get the right nbytes.
     451             :                  */
     452        4409 :                 if (inode_item) {
     453        4409 :                         struct btrfs_inode_item *item;
     454        4409 :                         u64 nbytes;
     455        4409 :                         u32 mode;
     456             : 
     457        4409 :                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
     458             :                                               struct btrfs_inode_item);
     459        4409 :                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
     460        4409 :                         item = btrfs_item_ptr(eb, slot,
     461             :                                               struct btrfs_inode_item);
     462        4409 :                         btrfs_set_inode_nbytes(eb, item, nbytes);
     463             : 
     464             :                         /*
     465             :                          * If this is a directory we need to reset the i_size to
     466             :                          * 0 so that we can set it up properly when replaying
     467             :                          * the rest of the items in this log.
     468             :                          */
     469        4409 :                         mode = btrfs_inode_mode(eb, item);
     470        4409 :                         if (S_ISDIR(mode))
     471          23 :                                 btrfs_set_inode_size(eb, item, 0);
     472             :                 }
     473         345 :         } else if (inode_item) {
     474         149 :                 struct btrfs_inode_item *item;
     475         149 :                 u32 mode;
     476             : 
     477             :                 /*
     478             :                  * New inode, set nbytes to 0 so that the nbytes comes out
     479             :                  * properly when we replay the extents.
     480             :                  */
     481         149 :                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
     482         149 :                 btrfs_set_inode_nbytes(eb, item, 0);
     483             : 
     484             :                 /*
     485             :                  * If this is a directory we need to reset the i_size to 0 so
     486             :                  * that we can set it up properly when replaying the rest of
     487             :                  * the items in this log.
     488             :                  */
     489         149 :                 mode = btrfs_inode_mode(eb, item);
     490         149 :                 if (S_ISDIR(mode))
     491          37 :                         btrfs_set_inode_size(eb, item, 0);
     492             :         }
     493         308 : insert:
     494        4758 :         btrfs_release_path(path);
     495             :         /* try to insert the key into the destination tree */
     496        4758 :         path->skip_release_on_error = 1;
     497        4758 :         ret = btrfs_insert_empty_item(trans, root, path,
     498             :                                       key, item_size);
     499        4758 :         path->skip_release_on_error = 0;
     500             : 
     501             :         /* make sure any existing item is the correct size */
     502        4758 :         if (ret == -EEXIST || ret == -EOVERFLOW) {
     503        4413 :                 u32 found_size;
     504        4413 :                 found_size = btrfs_item_size(path->nodes[0],
     505             :                                                 path->slots[0]);
     506        4413 :                 if (found_size > item_size)
     507           0 :                         btrfs_truncate_item(path, item_size, 1);
     508        4413 :                 else if (found_size < item_size)
     509           4 :                         btrfs_extend_item(path, item_size - found_size);
     510         345 :         } else if (ret) {
     511             :                 return ret;
     512             :         }
     513        4758 :         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
     514             :                                         path->slots[0]);
     515             : 
     516             :         /* don't overwrite an existing inode if the generation number
     517             :          * was logged as zero.  This is done when the tree logging code
     518             :          * is just logging an inode to make sure it exists after recovery.
     519             :          *
     520             :          * Also, don't overwrite i_size on directories during replay.
     521             :          * log replay inserts and removes directory items based on the
     522             :          * state of the tree found in the subvolume, and i_size is modified
     523             :          * as it goes
     524             :          */
     525        4758 :         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
     526        4409 :                 struct btrfs_inode_item *src_item;
     527        4409 :                 struct btrfs_inode_item *dst_item;
     528             : 
     529        4409 :                 src_item = (struct btrfs_inode_item *)src_ptr;
     530        4409 :                 dst_item = (struct btrfs_inode_item *)dst_ptr;
     531             : 
     532        4409 :                 if (btrfs_inode_generation(eb, src_item) == 0) {
     533          16 :                         struct extent_buffer *dst_eb = path->nodes[0];
     534          16 :                         const u64 ino_size = btrfs_inode_size(eb, src_item);
     535             : 
     536             :                         /*
     537             :                          * For regular files an ino_size == 0 is used only when
     538             :                          * logging that an inode exists, as part of a directory
     539             :                          * fsync, and the inode wasn't fsynced before. In this
     540             :                          * case don't set the size of the inode in the fs/subvol
     541             :                          * tree, otherwise we would be throwing valid data away.
     542             :                          */
     543          16 :                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
     544          14 :                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
     545             :                             ino_size != 0)
     546           2 :                                 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
     547          16 :                         goto no_copy;
     548             :                 }
     549             : 
     550        4393 :                 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
     551          21 :                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
     552          21 :                         save_old_i_size = 1;
     553          21 :                         saved_i_size = btrfs_inode_size(path->nodes[0],
     554             :                                                         dst_item);
     555             :                 }
     556             :         }
     557             : 
     558        4742 :         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
     559             :                            src_ptr, item_size);
     560             : 
     561        4742 :         if (save_old_i_size) {
     562          21 :                 struct btrfs_inode_item *dst_item;
     563          21 :                 dst_item = (struct btrfs_inode_item *)dst_ptr;
     564          21 :                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
     565             :         }
     566             : 
     567             :         /* make sure the generation is filled in */
     568        4742 :         if (key->type == BTRFS_INODE_ITEM_KEY) {
     569        4542 :                 struct btrfs_inode_item *dst_item;
     570        4542 :                 dst_item = (struct btrfs_inode_item *)dst_ptr;
     571        4542 :                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
     572          34 :                         btrfs_set_inode_generation(path->nodes[0], dst_item,
     573             :                                                    trans->transid);
     574             :                 }
     575             :         }
     576        4708 : no_copy:
     577        4758 :         btrfs_mark_buffer_dirty(path->nodes[0]);
     578        4758 :         btrfs_release_path(path);
     579        4758 :         return 0;
     580             : }
     581             : 
     582        8003 : static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
     583             :                                struct fscrypt_str *name)
     584             : {
     585        8003 :         char *buf;
     586             :         /* Heap copy of the name; on success it is handed to the caller via name->name. */
     587        8003 :         buf = kmalloc(len, GFP_NOFS);
     588        8003 :         if (!buf)
     589             :                 return -ENOMEM;
     590             :         /* 'start' is only cast to an integer position for the read; it is never dereferenced here. */
     591        8003 :         read_extent_buffer(eb, buf, (unsigned long)start, len);
     592        8003 :         name->name = buf;
     593        8003 :         name->len = len;
     594        8003 :         return 0;
     595             : }
     596             : 
     597             : /*
     598             :  * Simple helper to read inode 'objectid' off the disk from a given root; returns
     599             :  * NULL on failure.  This can only be called for subvolume roots and not for the log.
     600             :  */
     601      122677 : static noinline struct inode *read_one_inode(struct btrfs_root *root,
     602             :                                              u64 objectid)
     603             : {
     604      122677 :         struct inode *inode;
     605             : 
     606      122677 :         inode = btrfs_iget(root->fs_info->sb, objectid, root);
     607      122677 :         if (IS_ERR(inode))
     608          37 :                 inode = NULL; /* the specific error code is discarded; callers only see NULL */
     609      122677 :         return inode;
     610             : }
     611             : 
     612             : /* Replay the single file extent item in 'eb' at 'slot', keyed by 'key', into
     613             :  * the subvolume 'root'.  path is released on entry and should be released
     614             :  * on exit.
     615             :  *
     616             :  * Extents in the log tree have not been allocated out of the extent tree
     617             :  * yet, so this completes the allocation: it takes a reference if the extent
     618             :  * already exists, or creates it in the extent allocation tree if it doesn't.
     619             :  *
     620             :  * The extent is inserted into the file, dropping any existing extents
     621             :  * from the file that overlap the new one.  Returns 0 on success, -EIO if the
     622             :  * inode can't be read, or the error of a failed helper call.
     623             :  */
     624      100838 : static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
     625             :                                       struct btrfs_root *root,
     626             :                                       struct btrfs_path *path,
     627             :                                       struct extent_buffer *eb, int slot,
     628             :                                       struct btrfs_key *key)
     629             : {
     630      100838 :         struct btrfs_drop_extents_args drop_args = { 0 };
     631      100838 :         struct btrfs_fs_info *fs_info = root->fs_info;
     632      100838 :         int found_type;
     633      100838 :         u64 extent_end;
     634      100838 :         u64 start = key->offset;
     635      100838 :         u64 nbytes = 0;
     636      100838 :         struct btrfs_file_extent_item *item;
     637      100838 :         struct inode *inode = NULL;
     638      100838 :         unsigned long size;
     639      100838 :         int ret = 0;
     640             : 
     641      100838 :         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
     642      100838 :         found_type = btrfs_file_extent_type(eb, item);
     643             : 
     644      100838 :         if (found_type == BTRFS_FILE_EXTENT_REG ||
     645             :             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
     646      100826 :                 nbytes = btrfs_file_extent_num_bytes(eb, item);
     647      100826 :                 extent_end = start + nbytes;
     648             : 
     649             :                 /*
     650             :                  * We don't add to the inode's nbytes if we are prealloc or a
     651             :                  * hole; the check below is for a disk_bytenr of 0.
     652             :                  */
     653      100826 :                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
     654        6330 :                         nbytes = 0;
     655          12 :         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
     656          12 :                 size = btrfs_file_extent_ram_bytes(eb, item);
     657          12 :                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
     658          12 :                 extent_end = ALIGN(start + size,
     659             :                                    fs_info->sectorsize);
     660             :         } else {
     661           0 :                 ret = 0; /* unrecognized extent type: skipped without error */
     662           0 :                 goto out;
     663             :         }
     664             : 
     665      100838 :         inode = read_one_inode(root, key->objectid);
     666      100838 :         if (!inode) {
     667           0 :                 ret = -EIO;
     668           0 :                 goto out;
     669             :         }
     670             : 
     671             :         /*
     672             :          * First check to see if we already have this extent in the
     673             :          * file.  This must be done before the btrfs_drop_extents run
     674             :          * so we don't try to drop this extent.
     675             :          */
     676      100838 :         ret = btrfs_lookup_file_extent(trans, root, path,
     677             :                         btrfs_ino(BTRFS_I(inode)), start, 0);
     678             :         /* NOTE(review): a lookup result other than 0 (even a possible error) just falls through to the drop below - confirm intended. */
     679      100838 :         if (ret == 0 &&
     680             :             (found_type == BTRFS_FILE_EXTENT_REG ||
     681             :              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
     682       17736 :                 struct btrfs_file_extent_item cmp1;
     683       17736 :                 struct btrfs_file_extent_item cmp2;
     684       17736 :                 struct btrfs_file_extent_item *existing;
     685       17736 :                 struct extent_buffer *leaf;
     686             : 
     687       17736 :                 leaf = path->nodes[0];
     688       17736 :                 existing = btrfs_item_ptr(leaf, path->slots[0],
     689             :                                           struct btrfs_file_extent_item);
     690             : 
     691       17736 :                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
     692             :                                    sizeof(cmp1));
     693       17736 :                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
     694             :                                    sizeof(cmp2));
     695             : 
     696             :                 /*
     697             :                  * We already have a pointer to this exact extent, so
     698             :                  * we don't have to do anything.
     699             :                  */
     700       17736 :                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
     701         253 :                         btrfs_release_path(path);
     702         253 :                         goto out;
     703             :                 }
     704             :         }
     705      100585 :         btrfs_release_path(path);
     706             : 
     707             :         /* drop any overlapping extents */
     708      100585 :         drop_args.start = start;
     709      100585 :         drop_args.end = extent_end;
     710      100585 :         drop_args.drop_cache = true;
     711      100585 :         ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
     712      100585 :         if (ret)
     713           0 :                 goto out;
     714             : 
     715      100585 :         if (found_type == BTRFS_FILE_EXTENT_REG ||
     716             :             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
     717      100573 :                 u64 offset;
     718      100573 :                 unsigned long dest_offset;
     719      100573 :                 struct btrfs_key ins;
     720             : 
     721      100573 :                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
     722        6330 :                     btrfs_fs_incompat(fs_info, NO_HOLES))
     723        6326 :                         goto update_inode;
     724             : 
     725       94247 :                 ret = btrfs_insert_empty_item(trans, root, path, key,
     726             :                                               sizeof(*item));
     727       94247 :                 if (ret)
     728           0 :                         goto out;
     729       94247 :                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
     730             :                                                     path->slots[0]);
     731       94247 :                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
     732             :                                 (unsigned long)item,  sizeof(*item));
     733             : 
     734       94247 :                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
     735       94247 :                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
     736       94247 :                 ins.type = BTRFS_EXTENT_ITEM_KEY;
     737       94247 :                 offset = key->offset - btrfs_file_extent_offset(eb, item);
     738             : 
     739             :                 /*
     740             :                  * Manually record the dirty extent: here we did a shallow
     741             :                  * file extent item copy that skips the normal backref update,
     742             :                  * and we modify the extent tree all by ourselves.
     743             :                  * So we need to manually record the dirty extent for qgroup,
     744             :                  * as the owner of the file extent changed from the log tree
     745             :                  * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
     746             :                  */
     747      188494 :                 ret = btrfs_qgroup_trace_extent(trans,
     748             :                                 btrfs_file_extent_disk_bytenr(eb, item),
     749             :                                 btrfs_file_extent_disk_num_bytes(eb, item));
     750       94247 :                 if (ret < 0)
     751           0 :                         goto out;
     752             : 
     753       94247 :                 if (ins.objectid > 0) {
     754       94243 :                         struct btrfs_ref ref = { 0 };
     755       94243 :                         u64 csum_start;
     756       94243 :                         u64 csum_end;
     757       94243 :                         LIST_HEAD(ordered_sums);
     758             : 
     759             :                         /*
     760             :                          * Is this extent already allocated in the extent
     761             :                          * allocation tree?  If so, just add a reference.
     762             :                          */
     763       94243 :                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
     764             :                                                 ins.offset);
     765       94243 :                         if (ret < 0) {
     766           0 :                                 goto out;
     767       94243 :                         } else if (ret == 0) {
     768       11191 :                                 btrfs_init_generic_ref(&ref,
     769             :                                                 BTRFS_ADD_DELAYED_REF,
     770             :                                                 ins.objectid, ins.offset, 0);
     771       11191 :                                 btrfs_init_data_ref(&ref,
     772             :                                                 root->root_key.objectid,
     773             :                                                 key->objectid, offset, 0, false);
     774       11191 :                                 ret = btrfs_inc_extent_ref(trans, &ref);
     775       11191 :                                 if (ret)
     776           0 :                                         goto out;
     777             :                         } else {
     778             :                                 /*
     779             :                                  * Insert the extent pointer in the extent
     780             :                                  * allocation tree.
     781             :                                  */
     782       83052 :                                 ret = btrfs_alloc_logged_file_extent(trans,
     783             :                                                 root->root_key.objectid,
     784             :                                                 key->objectid, offset, &ins);
     785       83052 :                                 if (ret)
     786           0 :                                         goto out;
     787             :                         }
     788       94243 :                         btrfs_release_path(path);
     789             : 
     790       94243 :                         if (btrfs_file_extent_compression(eb, item)) {
     791          16 :                                 csum_start = ins.objectid;
     792          16 :                                 csum_end = csum_start + ins.offset;
     793             :                         } else {
     794       94227 :                                 csum_start = ins.objectid +
     795             :                                         btrfs_file_extent_offset(eb, item);
     796       94227 :                                 csum_end = csum_start +
     797             :                                         btrfs_file_extent_num_bytes(eb, item);
     798             :                         }
     799             : 
     800       94243 :                         ret = btrfs_lookup_csums_list(root->log_root,
     801             :                                                 csum_start, csum_end - 1,
     802             :                                                 &ordered_sums, 0, false);
     803       94243 :                         if (ret)
     804           0 :                                 goto out;
     805             :                         /*
     806             :                          * Now delete all existing csums in the csum root that
     807             :                          * cover our range. We do this because we can have an
     808             :                          * extent that is completely referenced by one file
     809             :                          * extent item and partially referenced by another
     810             :                          * file extent item (like after using the clone or
     811             :                          * extent_same ioctls). In this case if we end up doing
     812             :                          * the replay of the one that partially references the
     813             :                          * extent first, and we do not do the csum deletion
     814             :                          * below, we can get 2 csum items in the csum tree that
     815             :                          * overlap each other. For example, imagine our log has
     816             :                          * the two following file extent items:
     817             :                          *
     818             :                          * key (257 EXTENT_DATA 409600)
     819             :                          *     extent data disk byte 12845056 nr 102400
     820             :                          *     extent data offset 20480 nr 20480 ram 102400
     821             :                          *
     822             :                          * key (257 EXTENT_DATA 819200)
     823             :                          *     extent data disk byte 12845056 nr 102400
     824             :                          *     extent data offset 0 nr 102400 ram 102400
     825             :                          *
     826             :                          * Where the second one fully references the 100K extent
     827             :                          * that starts at disk byte 12845056, and the log tree
     828             :                          * has a single csum item that covers the entire range
     829             :                          * of the extent:
     830             :                          *
     831             :                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
     832             :                          *
     833             :                          * After the first file extent item is replayed, the
     834             :                          * csum tree gets the following csum item:
     835             :                          *
     836             :                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
     837             :                          *
     838             :                          * Which covers the 20K sub-range starting at offset 20K
     839             :                          * of our extent. Now when we replay the second file
     840             :                          * extent item, if we do not delete existing csum items
     841             :                          * that cover any of its blocks, we end up getting two
     842             :                          * csum items in our csum tree that overlap each other:
     843             :                          *
     844             :                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
     845             :                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
     846             :                          *
     847             :                          * Which is a problem, because after this anyone trying
     848             :                          * to look up the checksum of any block of our
     849             :                          * extent starting at an offset of 40K or higher, will
     850             :                          * end up looking at the second csum item only, which
     851             :                          * does not contain the checksum for any block starting
     852             :                          * at offset 40K or higher of our extent.
     853             :                          */
     854      183209 :                         while (!list_empty(&ordered_sums)) {
     855       88966 :                                 struct btrfs_ordered_sum *sums;
     856       88966 :                                 struct btrfs_root *csum_root;
     857             : 
     858       88966 :                                 sums = list_entry(ordered_sums.next,
     859             :                                                 struct btrfs_ordered_sum,
     860             :                                                 list);
     861       88966 :                                 csum_root = btrfs_csum_root(fs_info,
     862             :                                                             sums->logical);
     863       88966 :                                 if (!ret)
     864       88966 :                                         ret = btrfs_del_csums(trans, csum_root,
     865             :                                                               sums->logical,
     866       88966 :                                                               sums->len);
     867       88966 :                                 if (!ret)
     868       88966 :                                         ret = btrfs_csum_file_blocks(trans,
     869             :                                                                      csum_root,
     870             :                                                                      sums);
     871       88966 :                                 list_del(&sums->list); /* always unlink and free, even after an error, so the list is fully drained */
     872       88966 :                                 kfree(sums);
     873             :                         }
     874       94243 :                         if (ret)
     875           0 :                                 goto out;
     876             :                 } else {
     877           4 :                         btrfs_release_path(path);
     878             :                 }
     879          12 :         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
     880             :                 /* inline extents are easy, we just overwrite them */
     881          12 :                 ret = overwrite_item(trans, root, path, eb, slot, key);
     882          12 :                 if (ret)
     883           0 :                         goto out;
     884             :         }
     885             : 
     886       94259 :         ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
     887             :                                                 extent_end - start);
     888       94259 :         if (ret)
     889           0 :                 goto out;
     890             : 
     891       94259 : update_inode:
     892      100585 :         btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
     893      100585 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
     894      100838 : out:
     895      100838 :         iput(inode);
     896      100838 :         return ret;
     897             : }
     898             : 
     899          20 : static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
     900             :                                        struct btrfs_inode *dir,
     901             :                                        struct btrfs_inode *inode,
     902             :                                        const struct fscrypt_str *name)
     903             : {
     904          20 :         int ret;
     905             : 
     906          20 :         ret = btrfs_unlink_inode(trans, dir, inode, name);
     907          20 :         if (ret)
     908             :                 return ret;
     909             :         /*
     910             :          * Whenever we need to check if a name exists or not, we check the
     911             :          * fs/subvolume tree. So after an unlink we must run delayed items, so
     912             :          * that future checks for a name during log replay see that the name
     913             :          * no longer exists.
     914             :          */
     915          20 :         return btrfs_run_delayed_items(trans);
     916             : }
     917             : 
     918             : /*
     919             :  * When cleaning up conflicts between the directory names in the
     920             :  * subvolume, directory names in the log and directory names in the
     921             :  * inode back references, we may have to unlink inodes from directories.
     922             :  *
     923             :  * This helper unlinks the inode that directory item 'di' points to from
     924             :  * 'dir'.  Returns 0 on success, -ENOMEM/-EIO or a helper's error otherwise.
     925             :  */
     926           4 : static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
     927             :                                       struct btrfs_path *path,
     928             :                                       struct btrfs_inode *dir,
     929             :                                       struct btrfs_dir_item *di)
     930             : {
     931           4 :         struct btrfs_root *root = dir->root;
     932           4 :         struct inode *inode;
     933           4 :         struct fscrypt_str name;
     934           4 :         struct extent_buffer *leaf;
     935           4 :         struct btrfs_key location;
     936           4 :         int ret;
     937             : 
     938           4 :         leaf = path->nodes[0];
     939             : 
     940           4 :         btrfs_dir_item_key_to_cpu(leaf, di, &location);
     941           4 :         ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
     942           4 :         if (ret)
     943             :                 return -ENOMEM;
     944             :         /* 'location' and 'name' now hold copies of what was read from the leaf. */
     945           4 :         btrfs_release_path(path);
     946             : 
     947           4 :         inode = read_one_inode(root, location.objectid);
     948           4 :         if (!inode) {
     949           0 :                 ret = -EIO;
     950           0 :                 goto out;
     951             :         }
     952             : 
     953           4 :         ret = link_to_fixup_dir(trans, root, path, location.objectid);
     954           4 :         if (ret)
     955           0 :                 goto out;
     956             : 
     957           4 :         ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
     958           4 : out:
     959           4 :         kfree(name.name);
     960           4 :         iput(inode);
     961           4 :         return ret;
     962             : }
     963             : 
     964             : /*
     965             :  * See if a given name and sequence number found in an inode back reference are
     966             :  * already in a directory and correctly point to this inode.
     967             :  *
     968             :  * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
     969             :  * exists.
     970             :  */
     971        4070 : static noinline int inode_in_dir(struct btrfs_root *root,
     972             :                                  struct btrfs_path *path,
     973             :                                  u64 dirid, u64 objectid, u64 index,
     974             :                                  struct fscrypt_str *name)
     975             : {
     976        4070 :         struct btrfs_dir_item *di;
     977        4070 :         struct btrfs_key location;
     978        4070 :         int ret = 0;
     979             : 
     980        4070 :         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
     981             :                                          index, name, 0);
     982        4070 :         if (IS_ERR(di)) {
     983           0 :                 ret = PTR_ERR(di);
     984           0 :                 goto out;
     985        4070 :         } else if (di) {
     986        3855 :                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
     987        3855 :                 if (location.objectid != objectid)
     988           0 :                         goto out;
     989             :         } else {
     990         215 :                 goto out;
     991             :         }
     992             :         /* The index lookup matched 'objectid'; the lookup by name must match it too. */
     993        3855 :         btrfs_release_path(path);
     994        3855 :         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
     995        3855 :         if (IS_ERR(di)) {
     996           0 :                 ret = PTR_ERR(di);
     997           0 :                 goto out;
     998        3855 :         } else if (di) {
     999        3855 :                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
    1000        3855 :                 if (location.objectid == objectid)
    1001        3855 :                         ret = 1;
    1002             :         }
    1003           0 : out:
    1004        4070 :         btrfs_release_path(path);
    1005        4070 :         return ret;
    1006             : }
    1007             : 
    1008             : /*
    1009             :  * Check whether the log tree 'log' has a back reference called 'name' in
    1010             :  * the item at 'key'.  Used to decide if a back reference found in the
    1011             :  * subvolume conflicts with what we find in the log.
    1012             :  *
    1013             :  * Inode backreferences may have multiple refs in a single item; during
    1014             :  * replay we process one reference at a time, and we don't want to delete
    1015             :  * valid links to a file from the subvolume if that link is also in the log.
    1016             :  * Returns 1 if the name is found, 0 if not, < 0 on error.
    1017             :  */
    1018          64 : static noinline int backref_in_log(struct btrfs_root *log,
    1019             :                                    struct btrfs_key *key,
    1020             :                                    u64 ref_objectid,
    1021             :                                    const struct fscrypt_str *name)
    1022             : {
    1023          64 :         struct btrfs_path *path;
    1024          64 :         int ret;
    1025             : 
    1026          64 :         path = btrfs_alloc_path();
    1027          64 :         if (!path)
    1028             :                 return -ENOMEM;
    1029             : 
    1030          64 :         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
    1031          64 :         if (ret < 0) {
    1032           0 :                 goto out;
    1033          64 :         } else if (ret == 1) {
    1034           0 :                 ret = 0;
    1035           0 :                 goto out;
    1036             :         }
    1037             :         /* The search returned 0 here: look for 'name' in the item at path->slots[0]. */
    1038          64 :         if (key->type == BTRFS_INODE_EXTREF_KEY)
    1039           0 :                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
    1040             :                                                        path->slots[0],
    1041             :                                                        ref_objectid, name);
    1042             :         else
    1043          64 :                 ret = !!btrfs_find_name_in_backref(path->nodes[0],
    1044             :                                                    path->slots[0], name);
    1045          64 : out:
    1046          64 :         btrfs_free_path(path);
    1047          64 :         return ret;
    1048             : }
    1049             : 
    1050         215 : static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
    1051             :                                   struct btrfs_root *root,
    1052             :                                   struct btrfs_path *path,
    1053             :                                   struct btrfs_root *log_root,
    1054             :                                   struct btrfs_inode *dir,
    1055             :                                   struct btrfs_inode *inode,
    1056             :                                   u64 inode_objectid, u64 parent_objectid,
    1057             :                                   u64 ref_index, struct fscrypt_str *name)
    1058             : {
    1059         222 :         int ret;
    1060         222 :         struct extent_buffer *leaf;
    1061         222 :         struct btrfs_dir_item *di;
    1062         222 :         struct btrfs_key search_key;
    1063         222 :         struct btrfs_inode_extref *extref;
    1064             : 
    1065             : again:
    1066             :         /* Search old style refs */
    1067         222 :         search_key.objectid = inode_objectid;
    1068         222 :         search_key.type = BTRFS_INODE_REF_KEY;
    1069         222 :         search_key.offset = parent_objectid;
    1070         222 :         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
    1071         222 :         if (ret == 0) {
    1072          22 :                 struct btrfs_inode_ref *victim_ref;
    1073          22 :                 unsigned long ptr;
    1074          22 :                 unsigned long ptr_end;
    1075             : 
    1076          22 :                 leaf = path->nodes[0];
    1077             : 
    1078             :                 /* are we trying to overwrite a back ref for the root directory
    1079             :                  * if so, just jump out, we're done
    1080             :                  */
    1081          22 :                 if (search_key.objectid == search_key.offset)
    1082             :                         return 1;
    1083             : 
    1084             :                 /* check all the names in this back reference to see
    1085             :                  * if they are in the log.  if so, we allow them to stay
    1086             :                  * otherwise they must be unlinked as a conflict
    1087             :                  */
    1088          13 :                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
    1089          13 :                 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
    1090          20 :                 while (ptr < ptr_end) {
    1091          14 :                         struct fscrypt_str victim_name;
    1092             : 
    1093          14 :                         victim_ref = (struct btrfs_inode_ref *)ptr;
    1094          14 :                         ret = read_alloc_one_name(leaf, (victim_ref + 1),
    1095             :                                  btrfs_inode_ref_name_len(leaf, victim_ref),
    1096             :                                  &victim_name);
    1097          14 :                         if (ret)
    1098           0 :                                 return ret;
    1099             : 
    1100          14 :                         ret = backref_in_log(log_root, &search_key,
    1101             :                                              parent_objectid, &victim_name);
    1102          14 :                         if (ret < 0) {
    1103           0 :                                 kfree(victim_name.name);
    1104           0 :                                 return ret;
    1105          14 :                         } else if (!ret) {
    1106           7 :                                 inc_nlink(&inode->vfs_inode);
    1107           7 :                                 btrfs_release_path(path);
    1108             : 
    1109           7 :                                 ret = unlink_inode_for_log_replay(trans, dir, inode,
    1110             :                                                 &victim_name);
    1111           7 :                                 kfree(victim_name.name);
    1112           7 :                                 if (ret)
    1113           0 :                                         return ret;
    1114           7 :                                 goto again;
    1115             :                         }
    1116           7 :                         kfree(victim_name.name);
    1117             : 
    1118           7 :                         ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
    1119             :                 }
    1120             :         }
    1121         206 :         btrfs_release_path(path);
    1122             : 
    1123             :         /* Same search but for extended refs */
    1124         206 :         extref = btrfs_lookup_inode_extref(NULL, root, path, name,
    1125             :                                            inode_objectid, parent_objectid, 0,
    1126             :                                            0);
    1127         206 :         if (IS_ERR(extref)) {
    1128           0 :                 return PTR_ERR(extref);
    1129         206 :         } else if (extref) {
    1130           0 :                 u32 item_size;
    1131           0 :                 u32 cur_offset = 0;
    1132           0 :                 unsigned long base;
    1133           0 :                 struct inode *victim_parent;
    1134             : 
    1135           0 :                 leaf = path->nodes[0];
    1136             : 
    1137           0 :                 item_size = btrfs_item_size(leaf, path->slots[0]);
    1138           0 :                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
    1139             : 
    1140           0 :                 while (cur_offset < item_size) {
    1141           0 :                         struct fscrypt_str victim_name;
    1142             : 
    1143           0 :                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
    1144             : 
    1145           0 :                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
    1146           0 :                                 goto next;
    1147             : 
    1148           0 :                         ret = read_alloc_one_name(leaf, &extref->name,
    1149             :                                  btrfs_inode_extref_name_len(leaf, extref),
    1150             :                                  &victim_name);
    1151           0 :                         if (ret)
    1152           0 :                                 return ret;
    1153             : 
    1154           0 :                         search_key.objectid = inode_objectid;
    1155           0 :                         search_key.type = BTRFS_INODE_EXTREF_KEY;
    1156           0 :                         search_key.offset = btrfs_extref_hash(parent_objectid,
    1157           0 :                                                               victim_name.name,
    1158           0 :                                                               victim_name.len);
    1159           0 :                         ret = backref_in_log(log_root, &search_key,
    1160             :                                              parent_objectid, &victim_name);
    1161           0 :                         if (ret < 0) {
    1162           0 :                                 kfree(victim_name.name);
    1163           0 :                                 return ret;
    1164           0 :                         } else if (!ret) {
    1165           0 :                                 ret = -ENOENT;
    1166           0 :                                 victim_parent = read_one_inode(root,
    1167             :                                                 parent_objectid);
    1168           0 :                                 if (victim_parent) {
    1169           0 :                                         inc_nlink(&inode->vfs_inode);
    1170           0 :                                         btrfs_release_path(path);
    1171             : 
    1172           0 :                                         ret = unlink_inode_for_log_replay(trans,
    1173             :                                                         BTRFS_I(victim_parent),
    1174             :                                                         inode, &victim_name);
    1175             :                                 }
    1176           0 :                                 iput(victim_parent);
    1177           0 :                                 kfree(victim_name.name);
    1178           0 :                                 if (ret)
    1179           0 :                                         return ret;
    1180           0 :                                 goto again;
    1181             :                         }
    1182           0 :                         kfree(victim_name.name);
    1183           0 : next:
    1184           0 :                         cur_offset += victim_name.len + sizeof(*extref);
    1185             :                 }
    1186             :         }
    1187         206 :         btrfs_release_path(path);
    1188             : 
    1189             :         /* look for a conflicting sequence number */
    1190         206 :         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
    1191             :                                          ref_index, name, 0);
    1192         206 :         if (IS_ERR(di)) {
    1193           0 :                 return PTR_ERR(di);
    1194         206 :         } else if (di) {
    1195           0 :                 ret = drop_one_dir_item(trans, path, dir, di);
    1196           0 :                 if (ret)
    1197             :                         return ret;
    1198             :         }
    1199         206 :         btrfs_release_path(path);
    1200             : 
    1201             :         /* look for a conflicting name */
    1202         206 :         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
    1203         206 :         if (IS_ERR(di)) {
    1204           0 :                 return PTR_ERR(di);
    1205         206 :         } else if (di) {
    1206           4 :                 ret = drop_one_dir_item(trans, path, dir, di);
    1207           4 :                 if (ret)
    1208             :                         return ret;
    1209             :         }
    1210         206 :         btrfs_release_path(path);
    1211             : 
    1212         206 :         return 0;
    1213             : }
    1214             : 
    1215           0 : static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
    1216             :                              struct fscrypt_str *name, u64 *index,
    1217             :                              u64 *parent_objectid)
    1218             : {
    1219           0 :         struct btrfs_inode_extref *extref;
    1220           0 :         int ret;
    1221             : 
    1222           0 :         extref = (struct btrfs_inode_extref *)ref_ptr;
    1223             : 
    1224           0 :         ret = read_alloc_one_name(eb, &extref->name,
    1225             :                                   btrfs_inode_extref_name_len(eb, extref), name);
    1226           0 :         if (ret)
    1227             :                 return ret;
    1228             : 
    1229           0 :         if (index)
    1230           0 :                 *index = btrfs_inode_extref_index(eb, extref);
    1231           0 :         if (parent_objectid)
    1232           0 :                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
    1233             : 
    1234             :         return 0;
    1235             : }
    1236             : 
    1237        7927 : static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
    1238             :                           struct fscrypt_str *name, u64 *index)
    1239             : {
    1240        7927 :         struct btrfs_inode_ref *ref;
    1241        7927 :         int ret;
    1242             : 
    1243        7927 :         ref = (struct btrfs_inode_ref *)ref_ptr;
    1244             : 
    1245        7927 :         ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
    1246             :                                   name);
    1247        7927 :         if (ret)
    1248             :                 return ret;
    1249             : 
    1250        7927 :         if (index)
    1251        4070 :                 *index = btrfs_inode_ref_index(eb, ref);
    1252             : 
    1253             :         return 0;
    1254             : }
    1255             : 
    1256             : /*
    1257             :  * Take an inode reference item from the log tree and iterate all names from the
    1258             :  * inode reference item in the subvolume tree with the same key (if it exists).
    1259             :  * For any name that is not in the inode reference item from the log tree, do a
    1260             :  * proper unlink of that name (that is, remove its entry from the inode
    1261             :  * reference item and both dir index keys).
    1262             :  */
    1263        4037 : static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
    1264             :                                  struct btrfs_root *root,
    1265             :                                  struct btrfs_path *path,
    1266             :                                  struct btrfs_inode *inode,
    1267             :                                  struct extent_buffer *log_eb,
    1268             :                                  int log_slot,
    1269             :                                  struct btrfs_key *key)
    1270             : {
    1271        4038 :         int ret;
    1272        4038 :         unsigned long ref_ptr;
    1273        4038 :         unsigned long ref_end;
    1274        4038 :         struct extent_buffer *eb;
    1275             : 
    1276        4038 : again:
    1277        4038 :         btrfs_release_path(path);
    1278        4038 :         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
    1279        4038 :         if (ret > 0) {
    1280         183 :                 ret = 0;
    1281         183 :                 goto out;
    1282             :         }
    1283        3855 :         if (ret < 0)
    1284           0 :                 goto out;
    1285             : 
    1286        3855 :         eb = path->nodes[0];
    1287        3855 :         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
    1288        3855 :         ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
    1289        7711 :         while (ref_ptr < ref_end) {
    1290        3857 :                 struct fscrypt_str name;
    1291        3857 :                 u64 parent_id;
    1292             : 
    1293        3857 :                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
    1294           0 :                         ret = extref_get_fields(eb, ref_ptr, &name,
    1295             :                                                 NULL, &parent_id);
    1296             :                 } else {
    1297        3857 :                         parent_id = key->offset;
    1298        3857 :                         ret = ref_get_fields(eb, ref_ptr, &name, NULL);
    1299             :                 }
    1300        3857 :                 if (ret)
    1301           0 :                         goto out;
    1302             : 
    1303        3857 :                 if (key->type == BTRFS_INODE_EXTREF_KEY)
    1304           0 :                         ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
    1305             :                                                                parent_id, &name);
    1306             :                 else
    1307        3857 :                         ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
    1308             : 
    1309        3857 :                 if (!ret) {
    1310           1 :                         struct inode *dir;
    1311             : 
    1312           1 :                         btrfs_release_path(path);
    1313           1 :                         dir = read_one_inode(root, parent_id);
    1314           1 :                         if (!dir) {
    1315           0 :                                 ret = -ENOENT;
    1316           0 :                                 kfree(name.name);
    1317           0 :                                 goto out;
    1318             :                         }
    1319           1 :                         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
    1320             :                                                  inode, &name);
    1321           1 :                         kfree(name.name);
    1322           1 :                         iput(dir);
    1323           1 :                         if (ret)
    1324           0 :                                 goto out;
    1325           1 :                         goto again;
    1326             :                 }
    1327             : 
    1328        3856 :                 kfree(name.name);
    1329        3856 :                 ref_ptr += name.len;
    1330        3856 :                 if (key->type == BTRFS_INODE_EXTREF_KEY)
    1331           0 :                         ref_ptr += sizeof(struct btrfs_inode_extref);
    1332             :                 else
    1333        3856 :                         ref_ptr += sizeof(struct btrfs_inode_ref);
    1334             :         }
    1335             :         ret = 0;
    1336        4037 :  out:
    1337        4037 :         btrfs_release_path(path);
    1338        4037 :         return ret;
    1339             : }
    1340             : 
    1341             : /*
    1342             :  * replay one inode back reference item found in the log tree.
    1343             :  * eb, slot and key refer to the buffer and key found in the log tree.
    1344             :  * root is the destination we are replaying into, and path is for temp
    1345             :  * use by this function.  (it should be released on return).
    1346             :  */
    1347        4046 : static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
    1348             :                                   struct btrfs_root *root,
    1349             :                                   struct btrfs_root *log,
    1350             :                                   struct btrfs_path *path,
    1351             :                                   struct extent_buffer *eb, int slot,
    1352             :                                   struct btrfs_key *key)
    1353             : {
    1354        4046 :         struct inode *dir = NULL;
    1355        4046 :         struct inode *inode = NULL;
    1356        4046 :         unsigned long ref_ptr;
    1357        4046 :         unsigned long ref_end;
    1358        4046 :         struct fscrypt_str name;
    1359        4046 :         int ret;
    1360        4046 :         int log_ref_ver = 0;
    1361        4046 :         u64 parent_objectid;
    1362        4046 :         u64 inode_objectid;
    1363        4046 :         u64 ref_index = 0;
    1364        4046 :         int ref_struct_size;
    1365             : 
    1366        4046 :         ref_ptr = btrfs_item_ptr_offset(eb, slot);
    1367        4046 :         ref_end = ref_ptr + btrfs_item_size(eb, slot);
    1368             : 
    1369        4046 :         if (key->type == BTRFS_INODE_EXTREF_KEY) {
    1370           0 :                 struct btrfs_inode_extref *r;
    1371             : 
    1372           0 :                 ref_struct_size = sizeof(struct btrfs_inode_extref);
    1373           0 :                 log_ref_ver = 1;
    1374           0 :                 r = (struct btrfs_inode_extref *)ref_ptr;
    1375           0 :                 parent_objectid = btrfs_inode_extref_parent(eb, r);
    1376             :         } else {
    1377        4046 :                 ref_struct_size = sizeof(struct btrfs_inode_ref);
    1378        4046 :                 parent_objectid = key->offset;
    1379             :         }
    1380        4046 :         inode_objectid = key->objectid;
    1381             : 
    1382             :         /*
    1383             :          * it is possible that we didn't log all the parent directories
    1384             :          * for a given inode.  If we don't find the dir, just don't
    1385             :          * copy the back ref in.  The link count fixup code will take
    1386             :          * care of the rest
    1387             :          */
    1388        4046 :         dir = read_one_inode(root, parent_objectid);
    1389        4046 :         if (!dir) {
    1390           0 :                 ret = -ENOENT;
    1391           0 :                 goto out;
    1392             :         }
    1393             : 
    1394        4046 :         inode = read_one_inode(root, inode_objectid);
    1395        4046 :         if (!inode) {
    1396           0 :                 ret = -EIO;
    1397           0 :                 goto out;
    1398             :         }
    1399             : 
    1400        8107 :         while (ref_ptr < ref_end) {
    1401        4070 :                 if (log_ref_ver) {
    1402           0 :                         ret = extref_get_fields(eb, ref_ptr, &name,
    1403             :                                                 &ref_index, &parent_objectid);
    1404             :                         /*
    1405             :                          * parent object can change from one array
    1406             :                          * item to another.
    1407             :                          */
    1408           0 :                         if (!dir)
    1409           0 :                                 dir = read_one_inode(root, parent_objectid);
    1410           0 :                         if (!dir) {
    1411           0 :                                 ret = -ENOENT;
    1412           0 :                                 goto out;
    1413             :                         }
    1414             :                 } else {
    1415        4070 :                         ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
    1416             :                 }
    1417        4070 :                 if (ret)
    1418           0 :                         goto out;
    1419             : 
    1420        4070 :                 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
    1421             :                                    btrfs_ino(BTRFS_I(inode)), ref_index, &name);
    1422        4070 :                 if (ret < 0) {
    1423           0 :                         goto out;
    1424        4070 :                 } else if (ret == 0) {
    1425             :                         /*
    1426             :                          * look for a conflicting back reference in the
    1427             :                          * metadata. if we find one we have to unlink that name
    1428             :                          * of the file before we add our new link.  Later on, we
    1429             :                          * overwrite any existing back reference, and we don't
    1430             :                          * want to create dangling pointers in the directory.
    1431             :                          */
    1432         215 :                         ret = __add_inode_ref(trans, root, path, log,
    1433             :                                               BTRFS_I(dir), BTRFS_I(inode),
    1434             :                                               inode_objectid, parent_objectid,
    1435             :                                               ref_index, &name);
    1436         215 :                         if (ret) {
    1437           9 :                                 if (ret == 1)
    1438           9 :                                         ret = 0;
    1439           9 :                                 goto out;
    1440             :                         }
    1441             : 
    1442             :                         /* insert our name */
    1443         206 :                         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
    1444             :                                              &name, 0, ref_index);
    1445         206 :                         if (ret)
    1446           0 :                                 goto out;
    1447             : 
    1448         206 :                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    1449         206 :                         if (ret)
    1450           0 :                                 goto out;
    1451             :                 }
    1452             :                 /* Else, ret == 1, we already have a perfect match, we're done. */
    1453             : 
    1454        4061 :                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
    1455        4061 :                 kfree(name.name);
    1456        4061 :                 name.name = NULL;
    1457        4061 :                 if (log_ref_ver) {
    1458           0 :                         iput(dir);
    1459           0 :                         dir = NULL;
    1460             :                 }
    1461             :         }
    1462             : 
    1463             :         /*
    1464             :          * Before we overwrite the inode reference item in the subvolume tree
    1465             :          * with the item from the log tree, we must unlink all names from the
    1466             :          * parent directory that are in the subvolume's tree inode reference
    1467             :          * item, otherwise we end up with an inconsistent subvolume tree where
    1468             :          * dir index entries exist for a name but there is no inode reference
    1469             :          * item with the same name.
    1470             :          */
    1471        4037 :         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
    1472             :                                     key);
    1473        4037 :         if (ret)
    1474           0 :                 goto out;
    1475             : 
    1476             :         /* finally write the back reference in the inode */
    1477        4037 :         ret = overwrite_item(trans, root, path, eb, slot, key);
    1478        4046 : out:
    1479        4046 :         btrfs_release_path(path);
    1480        4046 :         kfree(name.name);
    1481        4046 :         iput(dir);
    1482        4046 :         iput(inode);
    1483        4046 :         return ret;
    1484             : }
    1485             : 
    1486        4559 : static int count_inode_extrefs(struct btrfs_root *root,
    1487             :                 struct btrfs_inode *inode, struct btrfs_path *path)
    1488             : {
    1489        4559 :         int ret = 0;
    1490        4559 :         int name_len;
    1491        4559 :         unsigned int nlink = 0;
    1492        4559 :         u32 item_size;
    1493        4559 :         u32 cur_offset = 0;
    1494        4559 :         u64 inode_objectid = btrfs_ino(inode);
    1495        4559 :         u64 offset = 0;
    1496        4559 :         unsigned long ptr;
    1497        4559 :         struct btrfs_inode_extref *extref;
    1498        4559 :         struct extent_buffer *leaf;
    1499             : 
    1500        4559 :         while (1) {
    1501        4559 :                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
    1502             :                                             &extref, &offset);
    1503        4559 :                 if (ret)
    1504             :                         break;
    1505             : 
    1506           0 :                 leaf = path->nodes[0];
    1507           0 :                 item_size = btrfs_item_size(leaf, path->slots[0]);
    1508           0 :                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
    1509           0 :                 cur_offset = 0;
    1510             : 
    1511           0 :                 while (cur_offset < item_size) {
    1512           0 :                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
    1513           0 :                         name_len = btrfs_inode_extref_name_len(leaf, extref);
    1514             : 
    1515           0 :                         nlink++;
    1516             : 
    1517           0 :                         cur_offset += name_len + sizeof(*extref);
    1518             :                 }
    1519             : 
    1520           0 :                 offset++;
    1521           0 :                 btrfs_release_path(path);
    1522             :         }
    1523        4559 :         btrfs_release_path(path);
    1524             : 
    1525        4559 :         if (ret < 0 && ret != -ENOENT)
    1526             :                 return ret;
    1527        4559 :         return nlink;
    1528             : }
    1529             : 
                     : /*
                     :  * Count the names recorded in the BTRFS_INODE_REF_KEY items of @inode.
                     :  *
                     :  * The search starts at the highest possible key offset for the inode and
                     :  * walks backwards, slot by slot, adding one to the count for every
                     :  * btrfs_inode_ref packed in each item.  When the first slot of a leaf has
                     :  * been processed, the path is released and a new search is done for the
                     :  * preceding key offset.
                     :  *
                     :  * Returns the number of references found (>= 0), or the negative errno
                     :  * from btrfs_search_slot(), so that a failed search is never mistaken for
                     :  * a valid (and too small) count.  The path is always released on return.
                     :  */
    1530        4559 : static int count_inode_refs(struct btrfs_root *root,
    1531             :                         struct btrfs_inode *inode, struct btrfs_path *path)
    1532             : {
    1533        4559 :         int ret;
    1534        4559 :         struct btrfs_key key;
    1535        4559 :         unsigned int nlink = 0;
    1536        4559 :         unsigned long ptr;
    1537        4559 :         unsigned long ptr_end;
    1538        4559 :         int name_len;
    1539        4559 :         u64 ino = btrfs_ino(inode);
    1540             : 
    1541        4559 :         key.objectid = ino;
    1542        4559 :         key.type = BTRFS_INODE_REF_KEY;
    1543        4559 :         key.offset = (u64)-1;
    1544             : 
    1545        4559 :         while (1) {
    1546        4559 :                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    1547        4559 :                 if (ret < 0)
    1548             :                         break;
                     :                 /* No exact match: step back to the previous slot, if any. */
    1549        4559 :                 if (ret > 0) {
    1550        4559 :                         if (path->slots[0] == 0)
    1551             :                                 break;
    1552        4559 :                         path->slots[0]--;
    1553             :                 }
    1554           0 : process_slot:
    1555        9139 :                 btrfs_item_key_to_cpu(path->nodes[0], &key,
    1556             :                                       path->slots[0]);
    1557        9139 :                 if (key.objectid != ino ||
    1558        9139 :                     key.type != BTRFS_INODE_REF_KEY)
    1559             :                         break;
    1560        4580 :                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
    1561        4580 :                 ptr_end = ptr + btrfs_item_size(path->nodes[0],
    1562             :                                                    path->slots[0]);
                     :                 /* One item may pack several refs, each followed by its name. */
    1563        9184 :                 while (ptr < ptr_end) {
    1564        4604 :                         struct btrfs_inode_ref *ref;
    1565             : 
    1566        4604 :                         ref = (struct btrfs_inode_ref *)ptr;
    1567        4604 :                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
    1568             :                                                             ref);
    1569        4604 :                         ptr = (unsigned long)(ref + 1) + name_len;
    1570        4604 :                         nlink++;
    1571             :                 }
    1572             : 
    1573        4580 :                 if (key.offset == 0)
    1574             :                         break;
    1575        4580 :                 if (path->slots[0] > 0) {
    1576        4580 :                         path->slots[0]--;
    1577        4580 :                         goto process_slot;
    1578             :                 }
                     :                 /* First slot of the leaf done: search again just below this key. */
    1579           0 :                 key.offset--;
    1580           0 :                 btrfs_release_path(path);
    1581             :         }
    1582        4559 :         btrfs_release_path(path);
    1583             : 
                     :         /*
                     :          * Only a failed search leaves a negative value in ret; every other
                     :          * exit from the loop has ret >= 0.  Report the failure instead of
                     :          * the partial count, the caller already checks for ret < 0.
                     :          */
                     :         if (ret < 0)
                     :                 return ret;
    1584        4559 :         return nlink;
    1585             : }
    1586             : 
    1587             : /*
    1588             :  * There are a few corners where the link count of the file can't
    1589             :  * be properly maintained during replay.  So, instead of adding
    1590             :  * lots of complexity to the log code, we just scan the backrefs
    1591             :  * for any file that has been through replay.
    1592             :  *
    1593             :  * The scan will update the link count on the inode to reflect the
    1594             :  * number of back refs found.  If it goes down to zero, the iput
    1595             :  * will free the inode.
                     :  *
                     :  * The new link count is the sum of what count_inode_refs() and
                     :  * count_inode_extrefs() report; the inode item is only rewritten when
                     :  * that sum differs from the current i_nlink.  index_cnt is then reset to
                     :  * (u64)-1 (NOTE(review): presumably so the directory index counter is
                     :  * recomputed on next use - confirm).
                     :  *
                     :  * Returns 0 on success, -ENOMEM if no path could be allocated, otherwise
                     :  * the error returned by the helper that failed.  -EEXIST from
                     :  * btrfs_insert_orphan_item() is treated as success.
    1596             :  */
    1597        4559 : static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
    1598             :                                            struct btrfs_root *root,
    1599             :                                            struct inode *inode)
    1600             : {
    1601        4559 :         struct btrfs_path *path;
    1602        4559 :         int ret;
    1603        4559 :         u64 nlink = 0;
    1604        4559 :         u64 ino = btrfs_ino(BTRFS_I(inode));
    1605             : 
    1606        4559 :         path = btrfs_alloc_path();
    1607        4559 :         if (!path)
    1608             :                 return -ENOMEM;
    1609             : 
    1610        4559 :         ret = count_inode_refs(root, BTRFS_I(inode), path);
    1611        4559 :         if (ret < 0)
    1612           0 :                 goto out;
    1613             : 
    1614        4559 :         nlink = ret;
    1615             : 
    1616        4559 :         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
    1617        4559 :         if (ret < 0)
    1618           0 :                 goto out;
    1619             : 
    1620        4559 :         nlink += ret;
    1621             : 
    1622        4559 :         ret = 0;
    1623             :         /* Only touch the inode item when the counted links differ. */
    1624        4559 :         if (nlink != inode->i_nlink) {
    1625        4546 :                 set_nlink(inode, nlink);
    1626        4546 :                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    1627        4546 :                 if (ret)
    1628           0 :                         goto out;
    1629             :         }
    1630        4559 :         BTRFS_I(inode)->index_cnt = (u64)-1;
    1631             :         /*
                     :          * No name references the inode any more: for a directory run
                     :          * replay_dir_deletes() first, then add an orphan item for it.
                     :          */
    1632        4559 :         if (inode->i_nlink == 0) {
    1633           2 :                 if (S_ISDIR(inode->i_mode)) {
    1634           0 :                         ret = replay_dir_deletes(trans, root, NULL, path,
    1635             :                                                  ino, 1);
    1636           0 :                         if (ret)
    1637           0 :                                 goto out;
    1638             :                 }
    1639           2 :                 ret = btrfs_insert_orphan_item(trans, root, ino);
    1640           2 :                 if (ret == -EEXIST)
    1641           0 :                         ret = 0;
    1642             :         }
    1643             : 
    1644        4559 : out:
    1645        4559 :         btrfs_free_path(path);
    1646        4559 :         return ret;
    1647             : }
    1648             : /*
                     :  * Process every fixup record stored in @root, i.e. every item keyed
                     :  * (BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, <inode number>).
                     :  *
                     :  * Each record found is deleted, the inode whose number is stored in the
                     :  * key offset is read and fixup_inode_link_count() is run on it.  After
                     :  * each inode the search restarts from the highest possible offset.
                     :  *
                     :  * Returns 0 once no fixup record is left, -EIO if an inode can't be read,
                     :  * or the error from the search, the deletion or the fixup that stopped
                     :  * the loop.  @path is released on return.
                     :  */
    1649        4448 : static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
    1650             :                                             struct btrfs_root *root,
    1651             :                                             struct btrfs_path *path)
    1652             : {
    1653        4448 :         int ret;
    1654        4448 :         struct btrfs_key key;
    1655        4448 :         struct inode *inode;
    1656             : 
    1657        4448 :         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
    1658        4448 :         key.type = BTRFS_ORPHAN_ITEM_KEY;
    1659        4448 :         key.offset = (u64)-1;
    1660       13566 :         while (1) {
    1661        9007 :                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1662        9007 :                 if (ret < 0)
    1663             :                         break;
    1664             :                 /* No exact match: look at the item just before the search key. */
    1665        9007 :                 if (ret == 1) {
    1666        9007 :                         ret = 0;
    1667        9007 :                         if (path->slots[0] == 0)
    1668             :                                 break;
    1669        9007 :                         path->slots[0]--;
    1670             :                 }
    1671             : 
    1672        9007 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
    1673        9007 :                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
    1674        4559 :                     key.type != BTRFS_ORPHAN_ITEM_KEY)
    1675             :                         break;
    1676             :                 /* Remove the record before fixing up the inode it names. */
    1677        4559 :                 ret = btrfs_del_item(trans, root, path);
    1678        4559 :                 if (ret)
    1679             :                         break;
    1680             : 
    1681        4559 :                 btrfs_release_path(path);
    1682        4559 :                 inode = read_one_inode(root, key.offset);
    1683        4559 :                 if (!inode) {
    1684             :                         ret = -EIO;
    1685             :                         break;
    1686             :                 }
    1687             : 
    1688        4559 :                 ret = fixup_inode_link_count(trans, root, inode);
    1689        4559 :                 iput(inode);
    1690        4559 :                 if (ret)
    1691             :                         break;
    1692             : 
    1693             :                 /*
    1694             :                  * fixup on a directory may create new entries,
    1695             :                  * make sure we always look for the highest possible
    1696             :                  * offset
    1697             :                  */
    1698        4559 :                 key.offset = (u64)-1;
    1699             :         }
    1700        4448 :         btrfs_release_path(path);
    1701        4448 :         return ret;
    1702             : }
    1703             : 
    1704             : 
    1705             : /*
    1706             :  * record a given inode in the fixup dir so we can check its link
    1707             :  * count when replay is done.  The link count is incremented here
    1708             :  * so the inode won't go away until we check it
                     :  *
                     :  * The record is an empty item keyed (BTRFS_TREE_LOG_FIXUP_OBJECTID,
                     :  * BTRFS_ORPHAN_ITEM_KEY, @objectid).  If that item already exists
                     :  * (-EEXIST) the inode was recorded before: nothing is changed and 0 is
                     :  * returned.
                     :  *
                     :  * Returns 0 on success, -EIO if the inode can't be read, otherwise the
                     :  * error from the item insertion or from btrfs_update_inode().
    1709             :  */
    1710        4570 : static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
    1711             :                                       struct btrfs_root *root,
    1712             :                                       struct btrfs_path *path,
    1713             :                                       u64 objectid)
    1714             : {
    1715        4570 :         struct btrfs_key key;
    1716        4570 :         int ret = 0;
    1717        4570 :         struct inode *inode;
    1718             : 
    1719        4570 :         inode = read_one_inode(root, objectid);
    1720        4570 :         if (!inode)
    1721             :                 return -EIO;
    1722             : 
    1723        4570 :         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
    1724        4570 :         key.type = BTRFS_ORPHAN_ITEM_KEY;
    1725        4570 :         key.offset = objectid;
    1726             : 
    1727        4570 :         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
    1728             :         /* The path is not needed past this point, whatever the outcome. */
    1729        4570 :         btrfs_release_path(path);
    1730        4570 :         if (ret == 0) {
    1731        4559 :                 if (!inode->i_nlink)
    1732           0 :                         set_nlink(inode, 1);
    1733             :                 else
    1734        4559 :                         inc_nlink(inode);
    1735        4559 :                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    1736          11 :         } else if (ret == -EEXIST) {
    1737          11 :                 ret = 0;
    1738             :         }
    1739        4570 :         iput(inode);
    1740             : 
    1741        4570 :         return ret;
    1742             : }
    1743             : 
    1744             : /*
    1745             :  * when replaying the log for a directory, we only insert names
    1746             :  * for inodes that actually exist.  This means an fsync on a directory
    1747             :  * does not implicitly fsync all the new files in it
                     :  *
                     :  * Links @name, at directory index @index, from directory @dirid to the
                     :  * inode numbered @location->objectid using btrfs_add_link().
                     :  *
                     :  * Returns -ENOENT if the target inode can't be read, -EIO if the
                     :  * directory inode can't be read, otherwise whatever btrfs_add_link()
                     :  * returns.  Both inode references are dropped before returning.
    1748             :  */
    1749           0 : static noinline int insert_one_name(struct btrfs_trans_handle *trans,
    1750             :                                     struct btrfs_root *root,
    1751             :                                     u64 dirid, u64 index,
    1752             :                                     const struct fscrypt_str *name,
    1753             :                                     struct btrfs_key *location)
    1754             : {
    1755           0 :         struct inode *inode;
    1756           0 :         struct inode *dir;
    1757           0 :         int ret;
    1758             : 
    1759           0 :         inode = read_one_inode(root, location->objectid);
    1760           0 :         if (!inode)
    1761             :                 return -ENOENT;
    1762             : 
    1763           0 :         dir = read_one_inode(root, dirid);
    1764           0 :         if (!dir) {
    1765           0 :                 iput(inode);
    1766           0 :                 return -EIO;
    1767             :         }
    1768             : 
    1769           0 :         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
    1770             :                              1, index);
    1771             : 
    1772             :         /* FIXME, put inode into FIXUP list */
    1773             : 
    1774           0 :         iput(inode);
    1775           0 :         iput(dir);
    1776           0 :         return ret;
    1777             : }
    1778             : /*
                     :  * Decide what to do with @dst_di, a directory entry already present in
                     :  * the subvolume (located through @path) that uses the name or index of
                     :  * an entry being replayed from the log.
                     :  *
                     :  * @log_key and @log_flags describe the entry found in the log; @exists
                     :  * tells whether the inode the log entry points to exists.
                     :  *
                     :  * Returns 1 if the existing entry has the same location key and flags as
                     :  * the log entry (it is kept), 0 if it differs but is kept because @exists
                     :  * is false, otherwise the return value of drop_one_dir_item().
                     :  */
    1779           0 : static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
    1780             :                                         struct btrfs_inode *dir,
    1781             :                                         struct btrfs_path *path,
    1782             :                                         struct btrfs_dir_item *dst_di,
    1783             :                                         const struct btrfs_key *log_key,
    1784             :                                         u8 log_flags,
    1785             :                                         bool exists)
    1786             : {
    1787           0 :         struct btrfs_key found_key;
    1788             : 
    1789           0 :         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
    1790             :         /* The existing dentry points to the same inode, don't delete it. */
    1791           0 :         if (found_key.objectid == log_key->objectid &&
    1792           0 :             found_key.type == log_key->type &&
    1793           0 :             found_key.offset == log_key->offset &&
    1794           0 :             btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
    1795             :                 return 1;
    1796             : 
    1797             :         /*
    1798             :          * Don't drop the conflicting directory entry if the inode for the new
    1799             :          * entry doesn't exist.
    1800             :          */
    1801           0 :         if (!exists)
    1802             :                 return 0;
    1803             : 
    1804           0 :         return drop_one_dir_item(trans, path, dir, dst_di);
    1805             : }
    1806             : 
    1807             : /*
    1808             :  * take a single entry in a log directory item and replay it into
    1809             :  * the subvolume.
    1810             :  *
    1811             :  * if a conflicting item exists in the subdirectory already,
    1812             :  * the inode it points to is unlinked and put into the link count
    1813             :  * fix up tree.
    1814             :  *
    1815             :  * If a name from the log points to a file or directory that does
    1816             :  * not exist in the FS, it is skipped.  fsyncs on directories
    1817             :  * do not force down inodes inside that directory, just changes to the
    1818             :  * names or unlinks in a directory.
    1819             :  *
    1820             :  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
    1821             :  * non-existing inode) and 1 if the name was replayed.
    1822             :  */
    1823          50 : static noinline int replay_one_name(struct btrfs_trans_handle *trans,
    1824             :                                     struct btrfs_root *root,
    1825             :                                     struct btrfs_path *path,
    1826             :                                     struct extent_buffer *eb,
    1827             :                                     struct btrfs_dir_item *di,
    1828             :                                     struct btrfs_key *key)
    1829             : {
                     :         /*
                     :          * Zero-initialized because a failure of read_alloc_one_name() jumps
                     :          * to 'out', which unconditionally calls kfree(name.name): the
                     :          * pointer must be NULL rather than stack garbage in that case.
                     :          */
    1830          50 :         struct fscrypt_str name = { 0 };
    1831          50 :         struct btrfs_dir_item *dir_dst_di;
    1832          50 :         struct btrfs_dir_item *index_dst_di;
    1833          50 :         bool dir_dst_matches = false;
    1834          50 :         bool index_dst_matches = false;
    1835          50 :         struct btrfs_key log_key;
    1836          50 :         struct btrfs_key search_key;
    1837          50 :         struct inode *dir;
    1838          50 :         u8 log_flags;
    1839          50 :         bool exists;
    1840          50 :         int ret;
    1841          50 :         bool update_size = true;
    1842          50 :         bool name_added = false;
    1843             : 
    1844          50 :         dir = read_one_inode(root, key->objectid);
    1845          50 :         if (!dir)
    1846             :                 return -EIO;
    1847             : 
    1848          50 :         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
    1849          50 :         if (ret)
    1850           0 :                 goto out;
    1851             : 
                     :         /* Does the inode the logged entry points to exist in the subvolume? */
    1852          50 :         log_flags = btrfs_dir_flags(eb, di);
    1853          50 :         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
    1854          50 :         ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
    1855          50 :         btrfs_release_path(path);
    1856          50 :         if (ret < 0)
    1857           0 :                 goto out;
    1858          50 :         exists = (ret == 0);
    1859          50 :         ret = 0;
    1860             : 
                     :         /* Check for an existing entry with the same name ... */
    1861          50 :         dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
    1862             :                                            &name, 1);
    1863          50 :         if (IS_ERR(dir_dst_di)) {
    1864           0 :                 ret = PTR_ERR(dir_dst_di);
    1865           0 :                 goto out;
    1866          50 :         } else if (dir_dst_di) {
    1867           0 :                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
    1868             :                                                    dir_dst_di, &log_key,
    1869             :                                                    log_flags, exists);
    1870           0 :                 if (ret < 0)
    1871           0 :                         goto out;
    1872           0 :                 dir_dst_matches = (ret == 1);
    1873             :         }
    1874             : 
    1875          50 :         btrfs_release_path(path);
    1876             : 
                     :         /* ... and for an existing entry at the same directory index. */
    1877          50 :         index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
    1878             :                                                    key->objectid, key->offset,
    1879             :                                                    &name, 1);
    1880          50 :         if (IS_ERR(index_dst_di)) {
    1881           0 :                 ret = PTR_ERR(index_dst_di);
    1882           0 :                 goto out;
    1883          50 :         } else if (index_dst_di) {
    1884           0 :                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
    1885             :                                                    index_dst_di, &log_key,
    1886             :                                                    log_flags, exists);
    1887           0 :                 if (ret < 0)
    1888           0 :                         goto out;
    1889           0 :                 index_dst_matches = (ret == 1);
    1890             :         }
    1891             : 
    1892          50 :         btrfs_release_path(path);
    1893             : 
                     :         /* Both existing entries already match the log: nothing to replay. */
    1894          50 :         if (dir_dst_matches && index_dst_matches) {
    1895           0 :                 ret = 0;
    1896           0 :                 update_size = false;
    1897           0 :                 goto out;
    1898             :         }
    1899             : 
    1900             :         /*
    1901             :          * Check if the inode reference exists in the log for the given name,
    1902             :          * inode and parent inode
    1903             :          */
    1904          50 :         search_key.objectid = log_key.objectid;
    1905          50 :         search_key.type = BTRFS_INODE_REF_KEY;
    1906          50 :         search_key.offset = key->objectid;
    1907          50 :         ret = backref_in_log(root->log_root, &search_key, 0, &name);
    1908          50 :         if (ret < 0) {
    1909           0 :                 goto out;
    1910          50 :         } else if (ret) {
    1911             :                 /* The dentry will be added later. */
    1912          50 :                 ret = 0;
    1913          50 :                 update_size = false;
    1914          50 :                 goto out;
    1915             :         }
    1916             : 
    1917           0 :         search_key.objectid = log_key.objectid;
    1918           0 :         search_key.type = BTRFS_INODE_EXTREF_KEY;
    1919           0 :         search_key.offset = key->objectid;
    1920           0 :         ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
    1921           0 :         if (ret < 0) {
    1922           0 :                 goto out;
    1923           0 :         } else if (ret) {
    1924             :                 /* The dentry will be added later. */
    1925           0 :                 ret = 0;
    1926           0 :                 update_size = false;
    1927           0 :                 goto out;
    1928             :         }
    1929           0 :         btrfs_release_path(path);
    1930           0 :         ret = insert_one_name(trans, root, key->objectid, key->offset,
    1931             :                               &name, &log_key);
                     :         /* -ENOENT and -EEXIST mean the name was not added; neither is an error. */
    1932           0 :         if (ret && ret != -ENOENT && ret != -EEXIST)
    1933           0 :                 goto out;
    1934           0 :         if (!ret)
    1935           0 :                 name_added = true;
    1936             :         update_size = false;
    1937             :         ret = 0;
    1938             : 
    1939          50 : out:
    1940          50 :         if (!ret && update_size) {
    1941           0 :                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
    1942           0 :                 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
    1943             :         }
    1944          50 :         kfree(name.name);
    1945          50 :         iput(dir);
    1946          50 :         if (!ret && name_added)
    1947           0 :                 ret = 1;
    1948             :         return ret;
    1949             : }
    1950             : 
    1951             : /*
                     :  * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
                     :  *
                     :  * The item in slot @slot of @eb is replayed with replay_one_name().  If
                     :  * that reports the name as replayed (1) and the entry is not of type
                     :  * BTRFS_FT_DIR, the inode it points to is also recorded with
                     :  * link_to_fixup_dir().
                     :  *
                     :  * Returns a negative errno from replay_one_name(), -ENOMEM if the fixup
                     :  * path can't be allocated, the result of link_to_fixup_dir() when it is
                     :  * called, otherwise the (0 or 1) value returned by replay_one_name().
                     :  */
    1952          50 : static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
    1953             :                                         struct btrfs_root *root,
    1954             :                                         struct btrfs_path *path,
    1955             :                                         struct extent_buffer *eb, int slot,
    1956             :                                         struct btrfs_key *key)
    1957             : {
    1958          50 :         int ret;
    1959          50 :         struct btrfs_dir_item *di;
    1960             : 
    1961             :         /* We only log dir index keys, which only contain a single dir item. */
    1962          50 :         ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
    1963             : 
    1964          50 :         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
    1965          50 :         ret = replay_one_name(trans, root, path, eb, di, key);
    1966          50 :         if (ret < 0)
    1967             :                 return ret;
    1968             : 
    1969             :         /*
    1970             :          * If this entry refers to a non-directory (directories can not have a
    1971             :          * link count > 1) and it was added in the transaction that was not
    1972             :          * committed, make sure we fixup the link count of the inode the entry
    1973             :          * points to. Otherwise something like the following would result in a
    1974             :          * directory pointing to an inode with a wrong link count that does not
    1975             :          * account for this dir entry:
    1976             :          *
    1977             :          * mkdir testdir
    1978             :          * touch testdir/foo
    1979             :          * touch testdir/bar
    1980             :          * sync
    1981             :          *
    1982             :          * ln testdir/bar testdir/bar_link
    1983             :          * ln testdir/foo testdir/foo_link
    1984             :          * xfs_io -c "fsync" testdir/bar
    1985             :          *
    1986             :          * <power failure>
    1987             :          *
    1988             :          * mount fs, log replay happens
    1989             :          *
    1990             :          * File foo would remain with a link count of 1 when it has two entries
    1991             :          * pointing to it in the directory testdir. This would make it impossible
    1992             :          * to ever delete the parent directory as it would result in stale
    1993             :          * dentries that can never be deleted.
    1994             :          */
    1995          50 :         if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
    1996           0 :                 struct btrfs_path *fixup_path;
    1997           0 :                 struct btrfs_key di_key;
    1998             : 
    1999           0 :                 fixup_path = btrfs_alloc_path();
    2000           0 :                 if (!fixup_path)
    2001           0 :                         return -ENOMEM;
    2002             : 
    2003           0 :                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
    2004           0 :                 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
    2005           0 :                 btrfs_free_path(fixup_path);
    2006             :         }
    2007             : 
    2008             :         return ret;
    2009             : }
    2010             : 
    2011             : /*
    2012             :  * directory replay has two parts.  There are the standard directory
    2013             :  * items in the log copied from the subvolume, and range items
    2014             :  * created in the log while the subvolume was logged.
    2015             :  *
    2016             :  * The range items tell us which parts of the key space the log
    2017             :  * is authoritative for.  During replay, if a key in the subvolume
    2018             :  * directory is in a logged range item, but not actually in the log
    2019             :  * that means it was deleted from the directory before the fsync
    2020             :  * and should be removed.
                     :  *
                     :  * This helper looks in @root for a BTRFS_DIR_LOG_INDEX_KEY range item of
                     :  * directory @dirid, starting the search at offset *@start_ret.  The item
                     :  * at or just before that offset is used if its range [key offset, logged
                     :  * end] contains *@start_ret; otherwise the item in the following slot is
                     :  * used if it is a range item of the same directory.
                     :  *
                     :  * Returns 0 with *@start_ret and *@end_ret set to the bounds of the range
                     :  * found, a value > 0 if there is no such range (including when
                     :  * *@start_ret is (u64)-1 on entry), or < 0 on error.  Except for that
                     :  * immediate early return, @path is released before returning.
    2021             :  */
    2022          23 : static noinline int find_dir_range(struct btrfs_root *root,
    2023             :                                    struct btrfs_path *path,
    2024             :                                    u64 dirid,
    2025             :                                    u64 *start_ret, u64 *end_ret)
    2026             : {
    2027          23 :         struct btrfs_key key;
    2028          23 :         u64 found_end;
    2029          23 :         struct btrfs_dir_log_item *item;
    2030          23 :         int ret;
    2031          23 :         int nritems;
    2032             : 
    2033          23 :         if (*start_ret == (u64)-1)
    2034             :                 return 1;
    2035             : 
    2036          23 :         key.objectid = dirid;
    2037          23 :         key.type = BTRFS_DIR_LOG_INDEX_KEY;
    2038          23 :         key.offset = *start_ret;
    2039             : 
    2040          23 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    2041          23 :         if (ret < 0)
    2042           0 :                 goto out;
    2043          23 :         if (ret > 0) {
    2044          23 :                 if (path->slots[0] == 0)
    2045           0 :                         goto out;
    2046          23 :                 path->slots[0]--;
    2047             :         }
    2048          23 :         if (ret != 0)
    2049          23 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
    2050             :         /* The item at or before the start offset is not a range of this dir. */
    2051          23 :         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
    2052          23 :                 ret = 1;
    2053          23 :                 goto next;
    2054             :         }
    2055           0 :         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    2056             :                               struct btrfs_dir_log_item);
    2057           0 :         found_end = btrfs_dir_log_end(path->nodes[0], item);
    2058             :         /* Use this range only if it covers the requested start offset. */
    2059           0 :         if (*start_ret >= key.offset && *start_ret <= found_end) {
    2060           0 :                 ret = 0;
    2061           0 :                 *start_ret = key.offset;
    2062           0 :                 *end_ret = found_end;
    2063           0 :                 goto out;
    2064             :         }
    2065             :         ret = 1;
    2066          23 : next:
    2067             :         /* check the next slot in the tree to see if it is a valid item */
    2068          23 :         nritems = btrfs_header_nritems(path->nodes[0]);
    2069          23 :         path->slots[0]++;
    2070          23 :         if (path->slots[0] >= nritems) {
    2071           0 :                 ret = btrfs_next_leaf(root, path);
    2072           0 :                 if (ret)
    2073           0 :                         goto out;
    2074             :         }
    2075             : 
    2076          23 :         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
    2077             : 
    2078          23 :         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
    2079           2 :                 ret = 1;
    2080           2 :                 goto out;
    2081             :         }
    2082          21 :         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    2083             :                               struct btrfs_dir_log_item);
    2084          21 :         found_end = btrfs_dir_log_end(path->nodes[0], item);
    2085          21 :         *start_ret = key.offset;
    2086          21 :         *end_ret = found_end;
    2087          21 :         ret = 0;
    2088          23 : out:
    2089          23 :         btrfs_release_path(path);
    2090          23 :         return ret;
    2091             : }
    2092             : 
    2093             : /*
    2094             :  * Check whether the dir index item that @path currently points at (in the
    2095             :  * subvolume tree of @dir) also exists in the log tree @log.  If it does,
    2096             :  * nothing is done.  If it does not, or if @log is NULL, the inode the entry
                        * refers to is linked to the fixup dir and the name is unlinked from @dir.
                        *
                        * @dir_key must be of type BTRFS_DIR_INDEX_KEY.  Both @path and @log_path are
                        * released before returning.  Returns 0 on success, -EIO if the target inode
                        * cannot be read, or the error of whichever helper failed.
    2097             :  */
    2098           8 : static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
    2099             :                                       struct btrfs_root *log,
    2100             :                                       struct btrfs_path *path,
    2101             :                                       struct btrfs_path *log_path,
    2102             :                                       struct inode *dir,
    2103             :                                       struct btrfs_key *dir_key)
    2104             : {
    2105           8 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    2106           8 :         int ret;
    2107           8 :         struct extent_buffer *eb;
    2108           8 :         int slot;
    2109           8 :         struct btrfs_dir_item *di;
                               /*
                                * Zeroed so that kfree(name.name) at the out label is safe even if
                                * read_alloc_one_name() fails without setting it.
                                */
    2110           8 :         struct fscrypt_str name = { 0 };
    2111           8 :         struct inode *inode = NULL;
    2112           8 :         struct btrfs_key location;
    2113             : 
    2114             :         /*
    2115             :          * Currently we only log dir index keys. Even if we replay a log created
    2116             :          * by an older kernel that logged both dir index and dir item keys, all
    2117             :          * we need to do is process the dir index keys, we (and our caller) can
    2118             :          * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
    2119             :          */
    2120           8 :         ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
    2121             : 
    2122           8 :         eb = path->nodes[0];
    2123           8 :         slot = path->slots[0];
    2124           8 :         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
    2125           8 :         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
    2126           8 :         if (ret)
    2127           0 :                 goto out;
    2128             : 
    2129           8 :         if (log) {
    2130           8 :                 struct btrfs_dir_item *log_di;
    2131             : 
    2132           8 :                 log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
    2133             :                                                      dir_key->objectid,
    2134             :                                                      dir_key->offset, &name, 0);
    2135           8 :                 if (IS_ERR(log_di)) {
    2136           0 :                         ret = PTR_ERR(log_di);
    2137           0 :                         goto out;
    2138           8 :                 } else if (log_di) {
    2139             :                         /* The dentry exists in the log, we have nothing to do. */
    2140           0 :                         ret = 0;
    2141           0 :                         goto out;
    2142             :                 }
    2143             :         }
    2144             : 
                               /* Read the target inode's key out of the leaf before dropping @path. */
    2145           8 :         btrfs_dir_item_key_to_cpu(eb, di, &location);
    2146           8 :         btrfs_release_path(path);
    2147           8 :         btrfs_release_path(log_path);
    2148           8 :         inode = read_one_inode(root, location.objectid);
    2149           8 :         if (!inode) {
    2150           0 :                 ret = -EIO;
    2151           0 :                 goto out;
    2152             :         }
    2153             : 
    2154           8 :         ret = link_to_fixup_dir(trans, root, path, location.objectid);
    2155           8 :         if (ret)
    2156           0 :                 goto out;
    2157             : 
                               /*
                                * NOTE(review): presumably nlink is bumped here so that the unlink
                                * below cannot drop it below zero - confirm against
                                * unlink_inode_for_log_replay().
                                */
    2158           8 :         inc_nlink(inode);
    2159           8 :         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
    2160             :                                           &name);
    2161             :         /*
    2162             :          * Unlike dir item keys, dir index keys can only have one name (entry) in
    2163             :          * them, as there are no key collisions since each key has a unique offset
    2164             :          * (an index number), so we're done.
    2165             :          */
    2166           8 : out:
    2167           8 :         btrfs_release_path(path);
    2168           8 :         btrfs_release_path(log_path);
    2169           8 :         kfree(name.name);
    2170           8 :         iput(inode);
    2171           8 :         return ret;
    2172             : }
    2173             : 
                       /*
                        * Delete from the subvolume tree @root every xattr of inode @ino that has no
                        * matching entry in the log tree @log.
                        *
                        * The XATTR_ITEM keys of @ino in @root are walked leaf by leaf and every name
                        * packed in an item is looked up in @log.  After a deletion the search is
                        * restarted from the key of the item that was being processed.
                        *
                        * @path is released before returning.  Returns 0 on success, -ENOMEM if an
                        * allocation fails, or the error of whichever lookup/delete step failed.
                        */
    2174        4558 : static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
    2175             :                               struct btrfs_root *root,
    2176             :                               struct btrfs_root *log,
    2177             :                               struct btrfs_path *path,
    2178             :                               const u64 ino)
    2179             : {
    2180        4558 :         struct btrfs_key search_key;
    2181        4558 :         struct btrfs_path *log_path;
    2182        4558 :         int i;
    2183        4558 :         int nritems;
    2184        4558 :         int ret;
    2185             : 
    2186        4558 :         log_path = btrfs_alloc_path();
    2187        4558 :         if (!log_path)
    2188             :                 return -ENOMEM;
    2189             : 
    2190        4558 :         search_key.objectid = ino;
    2191        4558 :         search_key.type = BTRFS_XATTR_ITEM_KEY;
    2192        4558 :         search_key.offset = 0;
    2193        4560 : again:
    2194        4560 :         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
    2195        4560 :         if (ret < 0)
    2196           0 :                 goto out;
    2197        4560 : process_leaf:
    2198        4570 :         nritems = btrfs_header_nritems(path->nodes[0]);
    2199        6578 :         for (i = path->slots[0]; i < nritems; i++) {
    2200        2749 :                 struct btrfs_key key;
    2201        2749 :                 struct btrfs_dir_item *di;
    2202        2749 :                 struct btrfs_dir_item *log_di;
    2203        2749 :                 u32 total_size;
    2204        2749 :                 u32 cur;
    2205             : 
    2206        2749 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
                                       /* Past the last xattr item of this inode: nothing left to check. */
    2207        2749 :                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
    2208         739 :                         ret = 0;
    2209         739 :                         goto out;
    2210             :                 }
    2211             : 
    2212        2010 :                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
    2213        2010 :                 total_size = btrfs_item_size(path->nodes[0], i);
    2214        2010 :                 cur = 0;
                                       /*
                                        * Walk every entry packed into this item; each entry is a header
                                        * followed by name_len bytes of name and data_len bytes of data.
                                        */
    2215        4018 :                 while (cur < total_size) {
    2216        2010 :                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
    2217        2010 :                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
    2218        2010 :                         u32 this_len = sizeof(*di) + name_len + data_len;
    2219        2010 :                         char *name;
    2220             : 
    2221        2010 :                         name = kmalloc(name_len, GFP_NOFS);
    2222        2010 :                         if (!name) {
    2223           0 :                                 ret = -ENOMEM;
    2224           0 :                                 goto out;
    2225             :                         }
    2226        2010 :                         read_extent_buffer(path->nodes[0], name,
    2227        2010 :                                            (unsigned long)(di + 1), name_len);
    2228             : 
    2229        2010 :                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
    2230             :                                                     name, name_len, 0);
    2231        2010 :                         btrfs_release_path(log_path);
    2232        2010 :                         if (!log_di) {
    2233             :                                 /* Doesn't exist in log tree, so delete it. */
    2234           2 :                                 btrfs_release_path(path);
    2235           2 :                                 di = btrfs_lookup_xattr(trans, root, path, ino,
    2236             :                                                         name, name_len, -1);
    2237           2 :                                 kfree(name);
    2238           2 :                                 if (IS_ERR(di)) {
    2239           0 :                                         ret = PTR_ERR(di);
    2240           0 :                                         goto out;
    2241             :                                 }
    2242           2 :                                 ASSERT(di);
    2243           2 :                                 ret = btrfs_delete_one_dir_name(trans, root,
    2244             :                                                                 path, di);
    2245           2 :                                 if (ret)
    2246           0 :                                         goto out;
    2247           2 :                                 btrfs_release_path(path);
                                                       /* Restart the search at the item we were processing. */
    2248           2 :                                 search_key = key;
    2249           2 :                                 goto again;
    2250             :                         }
                                               /* The name copy is not needed past the log lookup. */
    2251        2008 :                         kfree(name);
    2252        2008 :                         if (IS_ERR(log_di)) {
    2253           0 :                                 ret = PTR_ERR(log_di);
    2254           0 :                                 goto out;
    2255             :                         }
    2256        2008 :                         cur += this_len;
    2257        2008 :                         di = (struct btrfs_dir_item *)((char *)di + this_len);
    2258             :                 }
    2259             :         }
                               /*
                                * A positive return from btrfs_next_leaf() is not treated as an error
                                * (presumably it means there is no further leaf - confirm); zero means
                                * @path now points at the next leaf, so keep scanning.
                                */
    2260        3829 :         ret = btrfs_next_leaf(root, path);
    2261        3829 :         if (ret > 0)
    2262             :                 ret = 0;
    2263          10 :         else if (ret == 0)
    2264          10 :                 goto process_leaf;
    2265           0 : out:
    2266        4558 :         btrfs_free_path(log_path);
    2267        4558 :         btrfs_release_path(path);
    2268        4558 :         return ret;
    2269             : }
    2270             : 
    2271             : 
    2272             : /*
    2273             :  * Deletion replay happens before we copy any new directory items
    2274             :  * out of the log or out of backreferences from inodes.  It
    2275             :  * scans the log to find ranges of keys that the log is authoritative for,
    2276             :  * and then scans the directory to find items in those ranges that are
    2277             :  * not present in the log.
    2278             :  *
    2279             :  * Anything we don't find in the log is unlinked and removed from the
    2280             :  * directory.
                        *
                        * @dirid is the objectid of the directory in @root.  If @del_all is set the
                        * log ranges are not consulted: a single range ending at (u64)-1 is used.
                        * @path is released before returning.
                        *
                        * Returns 0 on success (including when the directory inode cannot be read),
                        * -ENOMEM if the log path cannot be allocated, or the error of whichever
                        * search or unlink step failed.
    2281             :  */
    2282          60 : static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
    2283             :                                        struct btrfs_root *root,
    2284             :                                        struct btrfs_root *log,
    2285             :                                        struct btrfs_path *path,
    2286             :                                        u64 dirid, int del_all)
    2287             : {
    2288          60 :         u64 range_start;
    2289          60 :         u64 range_end;
    2290          60 :         int ret = 0;
    2291          60 :         struct btrfs_key dir_key;
    2292          60 :         struct btrfs_key found_key;
    2293          60 :         struct btrfs_path *log_path;
    2294          60 :         struct inode *dir;
    2295             : 
    2296          60 :         dir_key.objectid = dirid;
    2297          60 :         dir_key.type = BTRFS_DIR_INDEX_KEY;
    2298          60 :         log_path = btrfs_alloc_path();
    2299          60 :         if (!log_path)
    2300             :                 return -ENOMEM;
    2301             : 
    2302          60 :         dir = read_one_inode(root, dirid);
    2303             :         /* It isn't an error if the inode isn't there, that can happen
    2304             :          * because we replay the deletes before we copy in the inode item
    2305             :          * from the log.
    2306             :          */
    2307          60 :         if (!dir) {
    2308          37 :                 btrfs_free_path(log_path);
    2309          37 :                 return 0;
    2310             :         }
    2311             : 
    2312          23 :         range_start = 0;
    2313          23 :         range_end = 0;
                               /* Outer loop: one iteration per range taken from the log (or one if del_all). */
    2314          23 :         while (1) {
    2315          23 :                 if (del_all)
    2316           0 :                         range_end = (u64)-1;
    2317             :                 else {
    2318          23 :                         ret = find_dir_range(log, path, dirid,
    2319             :                                              &range_start, &range_end);
    2320          23 :                         if (ret < 0)
    2321           0 :                                 goto out;
    2322          23 :                         else if (ret > 0)
    2323             :                                 break;
    2324             :                 }
    2325             : 
    2326          21 :                 dir_key.offset = range_start;
                                       /*
                                        * Inner loop: visit each dir index item of the directory inside the
                                        * range.  The search is redone on every pass because
                                        * check_item_in_log() releases @path.
                                        */
    2327          37 :                 while (1) {
    2328          29 :                         int nritems;
    2329          29 :                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
    2330             :                                                 0, 0);
    2331          29 :                         if (ret < 0)
    2332           0 :                                 goto out;
    2333             : 
    2334          29 :                         nritems = btrfs_header_nritems(path->nodes[0]);
    2335          29 :                         if (path->slots[0] >= nritems) {
    2336           9 :                                 ret = btrfs_next_leaf(root, path);
    2337           9 :                                 if (ret == 1)
    2338             :                                         break;
    2339           0 :                                 else if (ret < 0)
    2340           0 :                                         goto out;
    2341             :                         }
    2342          20 :                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
    2343             :                                               path->slots[0]);
                                               /* Ran past the dir index items of this directory: all done. */
    2344          20 :                         if (found_key.objectid != dirid ||
    2345           8 :                             found_key.type != dir_key.type) {
    2346          12 :                                 ret = 0;
    2347          12 :                                 goto out;
    2348             :                         }
    2349             : 
    2350           8 :                         if (found_key.offset > range_end)
    2351             :                                 break;
    2352             : 
    2353           8 :                         ret = check_item_in_log(trans, log, path,
    2354             :                                                 log_path, dir,
    2355             :                                                 &found_key);
    2356           8 :                         if (ret)
    2357           0 :                                 goto out;
                                               /* Stop instead of letting offset + 1 wrap around to 0. */
    2358           8 :                         if (found_key.offset == (u64)-1)
    2359             :                                 break;
    2360           8 :                         dir_key.offset = found_key.offset + 1;
    2361             :                 }
    2362           9 :                 btrfs_release_path(path);
                                       /* Likewise avoid wrapping range_end + 1 below. */
    2363           9 :                 if (range_end == (u64)-1)
    2364             :                         break;
    2365           0 :                 range_start = range_end + 1;
    2366             :         }
    2367             :         ret = 0;
    2368          23 : out:
    2369          23 :         btrfs_release_path(path);
    2370          23 :         btrfs_free_path(log_path);
    2371          23 :         iput(dir);
    2372          23 :         return ret;
    2373             : }
    2374             : 
    2375             : /*
    2376             :  * The process_func used to replay items from one log tree buffer.  Buffers
    2377             :  * that are not leaves are only read; for a leaf, what gets replayed for each
    2378             :  * item depends on wc->stage:
    2379             :  *
    2380             :  *  - LOG_WALK_REPLAY_INODES: for each inode item, xattr deletions and (for
    2381             :  *    directories) directory deletions are replayed, the inode item is copied
    2382             :  *    into the subvolume, extents of regular files beyond i_size (rounded up to
    2383             :  *    the sector size) are dropped and the inode is linked to the fixup dir.
    2384             :  *  - LOG_WALK_REPLAY_DIR_INDEX: dir index items are replayed.
                        *  - LOG_WALK_REPLAY_ALL or later: xattr items, inode refs/extrefs and file
                        *    extent items are replayed.
                        *
                        * Replaying inodes in a separate, earlier stage is slower, but gets rid of
                        * lots of complexity around inodes referencing other inodes that exist only
                        * in the log (references come from either directory items or inode back
                        * refs).
                        *
                        * Items are skipped while wc->ignore_cur_inode is set, which happens when an
                        * inode item with a link count of 0 is seen in the inodes stage.
                        *
                        * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
                        * error of whichever replay step failed.
    2385             :  */
    2386       14037 : static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
    2387             :                              struct walk_control *wc, u64 gen, int level)
    2388             : {
    2389       14037 :         int nritems;
    2390       14037 :         struct btrfs_tree_parent_check check = {
    2391             :                 .transid = gen,
    2392             :                 .level = level
    2393             :         };
    2394       14037 :         struct btrfs_path *path;
    2395       14037 :         struct btrfs_root *root = wc->replay_dest;
    2396       14037 :         struct btrfs_key key;
    2397       14037 :         int i;
    2398       14037 :         int ret;
    2399             : 
    2400       14037 :         ret = btrfs_read_extent_buffer(eb, &check);
    2401       14037 :         if (ret)
    2402             :                 return ret;
    2403             : 
    2404       14037 :         level = btrfs_header_level(eb);
    2405             : 
                               /* Only leaves carry items to replay. */
    2406       14037 :         if (level != 0)
    2407             :                 return 0;
    2408             : 
    2409       13848 :         path = btrfs_alloc_path();
    2410       13848 :         if (!path)
    2411             :                 return -ENOMEM;
    2412             : 
    2413       13848 :         nritems = btrfs_header_nritems(eb);
    2414      605763 :         for (i = 0; i < nritems; i++) {
    2415      591915 :                 btrfs_item_key_to_cpu(eb, &key, i);
    2416             : 
    2417             :                 /* inode keys are done during the first stage */
    2418      591915 :                 if (key.type == BTRFS_INODE_ITEM_KEY &&
    2419       13674 :                     wc->stage == LOG_WALK_REPLAY_INODES) {
    2420        4558 :                         struct btrfs_inode_item *inode_item;
    2421        4558 :                         u32 mode;
    2422             : 
    2423        4558 :                         inode_item = btrfs_item_ptr(eb, i,
    2424             :                                             struct btrfs_inode_item);
    2425             :                         /*
    2426             :                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
    2427             :                          * and never got linked before the fsync, skip it, as
    2428             :                          * replaying it is pointless since it would be deleted
    2429             :                          * later. We skip logging tmpfiles, but it's always
    2430             :                          * possible we are replaying a log created with a kernel
    2431             :                          * that used to log tmpfiles.
    2432             :                          */
    2433        4558 :                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
    2434           0 :                                 wc->ignore_cur_inode = true;
    2435           0 :                                 continue;
    2436             :                         } else {
    2437        4558 :                                 wc->ignore_cur_inode = false;
    2438             :                         }
    2439        4558 :                         ret = replay_xattr_deletes(wc->trans, root, log,
    2440             :                                                    path, key.objectid);
    2441        4558 :                         if (ret)
    2442             :                                 break;
    2443        4558 :                         mode = btrfs_inode_mode(eb, inode_item);
    2444        4558 :                         if (S_ISDIR(mode)) {
    2445          60 :                                 ret = replay_dir_deletes(wc->trans,
    2446             :                                          root, log, path, key.objectid, 0);
    2447          60 :                                 if (ret)
    2448             :                                         break;
    2449             :                         }
    2450        4558 :                         ret = overwrite_item(wc->trans, root, path,
    2451             :                                              eb, i, &key);
    2452        4558 :                         if (ret)
    2453             :                                 break;
    2454             : 
    2455             :                         /*
    2456             :                          * Before replaying extents, truncate the inode to its
    2457             :                          * size. We need to do it now and not after log replay
    2458             :                          * because before an fsync we can have prealloc extents
    2459             :                          * added beyond the inode's i_size. If we did it after,
    2460             :                          * through orphan cleanup for example, we would drop
    2461             :                          * those prealloc extents just after replaying them.
    2462             :                          */
    2463        4558 :                         if (S_ISREG(mode)) {
    2464        4495 :                                 struct btrfs_drop_extents_args drop_args = { 0 };
    2465        4495 :                                 struct inode *inode;
    2466        4495 :                                 u64 from;
    2467             : 
    2468        4495 :                                 inode = read_one_inode(root, key.objectid);
    2469        4495 :                                 if (!inode) {
    2470             :                                         ret = -EIO;
    2471           0 :                                         break;
    2472             :                                 }
    2473        4495 :                                 from = ALIGN(i_size_read(inode),
    2474             :                                              root->fs_info->sectorsize);
    2475        4495 :                                 drop_args.start = from;
    2476        4495 :                                 drop_args.end = (u64)-1;
    2477        4495 :                                 drop_args.drop_cache = true;
    2478        4495 :                                 ret = btrfs_drop_extents(wc->trans, root,
    2479             :                                                          BTRFS_I(inode),
    2480             :                                                          &drop_args);
    2481        4495 :                                 if (!ret) {
    2482        4495 :                                         inode_sub_bytes(inode,
    2483        4495 :                                                         drop_args.bytes_found);
    2484             :                                         /* Update the inode's nbytes. */
    2485        4495 :                                         ret = btrfs_update_inode(wc->trans,
    2486             :                                                         root, BTRFS_I(inode));
    2487             :                                 }
    2488        4495 :                                 iput(inode);
    2489        4495 :                                 if (ret)
    2490             :                                         break;
    2491             :                         }
    2492             : 
    2493        4558 :                         ret = link_to_fixup_dir(wc->trans, root,
    2494             :                                                 path, key.objectid);
    2495        4558 :                         if (ret)
    2496             :                                 break;
    2497             :                 }
    2498             : 
                                       /* Skip every item until an inode item with nlink != 0 clears the flag. */
    2499      591915 :                 if (wc->ignore_cur_inode)
    2500           0 :                         continue;
    2501             : 
    2502      591915 :                 if (key.type == BTRFS_DIR_INDEX_KEY &&
    2503         150 :                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
    2504          50 :                         ret = replay_one_dir_item(wc->trans, root, path,
    2505             :                                                   eb, i, &key);
    2506          50 :                         if (ret)
    2507             :                                 break;
    2508             :                 }
    2509             : 
                                       /* Everything below only runs from the LOG_WALK_REPLAY_ALL stage on. */
    2510      591915 :                 if (wc->stage < LOG_WALK_REPLAY_ALL)
    2511      394610 :                         continue;
    2512             : 
    2513             :                 /* these keys are simply copied */
    2514      197305 :                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
    2515        2009 :                         ret = overwrite_item(wc->trans, root, path,
    2516             :                                              eb, i, &key);
    2517        2009 :                         if (ret)
    2518             :                                 break;
    2519      195296 :                 } else if (key.type == BTRFS_INODE_REF_KEY ||
    2520             :                            key.type == BTRFS_INODE_EXTREF_KEY) {
    2521        4046 :                         ret = add_inode_ref(wc->trans, root, log, path,
    2522             :                                             eb, i, &key);
                                               /* -ENOENT from add_inode_ref() is tolerated and reset to 0. */
    2523        4046 :                         if (ret && ret != -ENOENT)
    2524             :                                 break;
    2525             :                         ret = 0;
    2526      191250 :                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
    2527      100838 :                         ret = replay_one_extent(wc->trans, root, path,
    2528             :                                                 eb, i, &key);
    2529      100838 :                         if (ret)
    2530             :                                 break;
    2531             :                 }
    2532             :                 /*
    2533             :                  * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
    2534             :                  * BTRFS_DIR_INDEX_KEY items which we use to derive the
    2535             :                  * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
    2536             :                  * older kernel with such keys, ignore them.
    2537             :                  */
    2538             :         }
    2539       13848 :         btrfs_free_path(path);
    2540       13848 :         return ret;
    2541             : }
    2542             : 
    2543             : /*
    2544             :  * Drop the reservation accounting of one log tree extent buffer: subtract
                        * fs_info->nodesize from the reserved counter of the block group that
                        * contains @start and from bytes_reserved of that group's space_info.
                        *
                        * If no block group is found for @start an error is logged and nothing is
                        * changed.
    2545             :  */
    2546          16 : static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
    2547             : {
    2548          16 :         struct btrfs_block_group *cache;
    2549             : 
    2550          16 :         cache = btrfs_lookup_block_group(fs_info, start);
    2551          16 :         if (!cache) {
    2552           0 :                 btrfs_err(fs_info, "unable to find block group for %llu", start);
    2553           0 :                 return;
    2554             :         }
    2555             : 
                               /* Both counters are updated with the space_info lock held around the group lock. */
    2556          16 :         spin_lock(&cache->space_info->lock);
    2557          16 :         spin_lock(&cache->lock);
    2558          16 :         cache->reserved -= fs_info->nodesize;
    2559          16 :         cache->space_info->bytes_reserved -= fs_info->nodesize;
    2560          16 :         spin_unlock(&cache->lock);
    2561          16 :         spin_unlock(&cache->space_info->lock);
    2562             : 
                               /* Release the block group obtained from btrfs_lookup_block_group() above. */
    2563          16 :         btrfs_put_block_group(cache);
    2564             : }
    2565             : 
                       /*
                        * Clean up one log tree extent buffer: with the tree lock held, clear its
                        * dirty state and wait for any writeback of it to finish.
                        *
                        * With a transaction (@trans != NULL) the buffer's extent is then pinned and
                        * the buffer is added to the transaction's redirty list.  Without one, the
                        * reserved bytes accounted for the buffer are dropped instead.
                        *
                        * Returns 0 on success or the error from btrfs_pin_reserved_extent().
                        */
    2566       24517 : static int clean_log_buffer(struct btrfs_trans_handle *trans,
    2567             :                             struct extent_buffer *eb)
    2568             : {
    2569       24517 :         int ret;
    2570             : 
    2571       24517 :         btrfs_tree_lock(eb);
    2572       24517 :         btrfs_clear_buffer_dirty(trans, eb);
    2573       24517 :         wait_on_extent_buffer_writeback(eb);
    2574       24517 :         btrfs_tree_unlock(eb);
    2575             : 
    2576       24517 :         if (trans) {
    2577       24501 :                 ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
    2578       24501 :                 if (ret)
    2579             :                         return ret;
    2580       24501 :                 btrfs_redirty_list_add(trans->transaction, eb);
    2581             :         } else {
    2582          16 :                 unaccount_log_buffer(eb->fs_info, eb->start);
    2583             :         }
    2584             : 
    2585             :         return 0;
    2586             : }
    2587             : /*
                     :  * Descend the log tree from path->nodes[*level] for as long as *level > 0.
                     :  *
                     :  * Each child of a level 1 node is handed to wc->process_func(); when
                     :  * wc->free is set it is additionally read and passed to
                     :  * clean_log_buffer(). Such children are released again right away and
                     :  * never stored in the path. At higher levels the child is read, stored
                     :  * in path->nodes[] and becomes the current level.
                     :  *
                     :  * Before a successful return, path->slots[*level] is set to the item
                     :  * count of path->nodes[*level].
                     :  *
                     :  * Returns 0 on success, or the error value from looking up or reading a
                     :  * block, from wc->process_func() or from clean_log_buffer().
                     :  */
    2588       27666 : static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
    2589             :                                    struct btrfs_root *root,
    2590             :                                    struct btrfs_path *path, int *level,
    2591             :                                    struct walk_control *wc)
    2592             : {
    2593       27666 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2594       27666 :         u64 bytenr;
    2595       27666 :         u64 ptr_gen;
    2596       27666 :         struct extent_buffer *next;
    2597       27666 :         struct extent_buffer *cur;
    2598       27666 :         int ret = 0;
    2599             : 
    2600       43712 :         while (*level > 0) {
    2601       16575 :                 struct btrfs_tree_parent_check check = { 0 };
    2602             : 
    2603       16575 :                 cur = path->nodes[*level];
    2604             : 
    2605       16575 :                 WARN_ON(btrfs_header_level(cur) != *level);
    2606             :                 /* Stop once the slot is past the last item of this node. */
    2607       16575 :                 if (path->slots[*level] >=
    2608             :                     btrfs_header_nritems(cur))
    2609             :                         break;
    2610             :                 /*
                     :                  * Fetch the child's block number and generation and fill in
                     :                  * the parent check data (generation, level, first key) that
                     :                  * is later passed to btrfs_read_extent_buffer().
                     :                  */
    2611       16047 :                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
    2612       16047 :                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
    2613       16047 :                 check.transid = ptr_gen;
    2614       16047 :                 check.level = *level - 1;
    2615       16047 :                 check.has_first_key = true;
    2616       16047 :                 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
    2617             : 
    2618       16047 :                 next = btrfs_find_create_tree_block(fs_info, bytenr,
    2619             :                                                     btrfs_header_owner(cur),
    2620       16047 :                                                     *level - 1);
    2621       16047 :                 if (IS_ERR(next))
    2622           1 :                         return PTR_ERR(next);
    2623             :                 /* Children of a level 1 node are at level 0: handle them here without descending. */
    2624       16047 :                 if (*level == 1) {
    2625       16020 :                         ret = wc->process_func(root, next, wc, ptr_gen,
    2626             :                                                *level - 1);
    2627       16020 :                         if (ret) {
    2628           0 :                                 free_extent_buffer(next);
    2629           0 :                                 return ret;
    2630             :                         }
    2631             :                         /* The child has been processed; move on to the next child pointer. */
    2632       16020 :                         path->slots[*level]++;
    2633       16020 :                         if (wc->free) {
    2634       14926 :                                 ret = btrfs_read_extent_buffer(next, &check);
    2635       14926 :                                 if (ret) {
    2636           1 :                                         free_extent_buffer(next);
    2637           1 :                                         return ret;
    2638             :                                 }
    2639             : 
    2640       14925 :                                 ret = clean_log_buffer(trans, next);
    2641       14925 :                                 if (ret) {
    2642           0 :                                         free_extent_buffer(next);
    2643           0 :                                         return ret;
    2644             :                                 }
    2645             :                         }
    2646       16019 :                         free_extent_buffer(next);
    2647       16019 :                         continue;
    2648             :                 }
    2649          27 :                 ret = btrfs_read_extent_buffer(next, &check);
    2650          27 :                 if (ret) {
    2651           0 :                         free_extent_buffer(next);
    2652           0 :                         return ret;
    2653             :                 }
    2654             :                 /*
                     :                  * Put the child into the path, releasing any buffer still held
                     :                  * for that level, and continue from the child's first slot.
                     :                  */
    2655          27 :                 if (path->nodes[*level-1])
    2656           0 :                         free_extent_buffer(path->nodes[*level-1]);
    2657          27 :                 path->nodes[*level-1] = next;
    2658          27 :                 *level = btrfs_header_level(next);
    2659          27 :                 path->slots[*level] = 0;
    2660          27 :                 cond_resched();
    2661             :         }
    2662       27665 :         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
    2663             : 
    2664       27665 :         cond_resched();
    2665       27665 :         return 0;
    2666             : }
    2667             : /*
                     :  * Move back up the log tree, starting at *level.
                     :  *
                     :  * If the node at the current level still has a slot after the current
                     :  * one, advance to that slot, store the level in *level and return 0.
                     :  * Otherwise the node is finished: it is passed to wc->process_func(),
                     :  * passed to clean_log_buffer() when wc->free is set, released and
                     :  * cleared from the path, and the loop continues with the level above.
                     :  *
                     :  * Returns 0 when a node with a further slot was found, 1 when the loop
                     :  * ends without finding one, or the error value from wc->process_func()
                     :  * or clean_log_buffer().
                     :  */
    2668       27665 : static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
    2669             :                                  struct btrfs_root *root,
    2670             :                                  struct btrfs_path *path, int *level,
    2671             :                                  struct walk_control *wc)
    2672             : {
    2673       27665 :         int i;
    2674       27665 :         int slot;
    2675       27665 :         int ret;
    2676             :         /* *level is kept equal to i: it is set to i + 1 at the end of each pass. */
    2677       55332 :         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
    2678       27692 :                 slot = path->slots[i];
    2679       27692 :                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
    2680          25 :                         path->slots[i]++;
    2681          25 :                         *level = i;
    2682          25 :                         WARN_ON(*level == 0);
    2683             :                         return 0;
    2684             :                 } else {
    2685       27667 :                         ret = wc->process_func(root, path->nodes[*level], wc,
    2686       27667 :                                  btrfs_header_generation(path->nodes[*level]),
    2687             :                                  *level);
    2688       27667 :                         if (ret)
    2689           0 :                                 return ret;
    2690             : 
    2691       27667 :                         if (wc->free) {
    2692        9592 :                                 ret = clean_log_buffer(trans, path->nodes[*level]);
    2693        9592 :                                 if (ret)
    2694           0 :                                         return ret;
    2695             :                         }
    2696       27667 :                         free_extent_buffer(path->nodes[*level]);
    2697       27667 :                         path->nodes[*level] = NULL;
    2698       27667 :                         *level = i + 1;
    2699             :                 }
    2700             :         }
    2701             :         return 1;
    2702             : }
    2703             : 
    2704             : /*
    2705             :  * Walk the log tree 'log' starting from log->node, alternating
    2706             :  * walk_down_log_tree() and walk_up_log_tree() until one of them returns a
    2707             :  * positive value. Visited blocks are handed to wc->process_func() and, when
                     :  * wc->free is set, to clean_log_buffer().
                     :  *
                     :  * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
                     :  * negative value returned by one of the walk helpers, wc->process_func()
                     :  * or clean_log_buffer().
    2708             :  */
    2709       27641 : static int walk_log_tree(struct btrfs_trans_handle *trans,
    2710             :                          struct btrfs_root *log, struct walk_control *wc)
    2711             : {
    2712       27641 :         int ret = 0;
    2713       27641 :         int wret;
    2714       27641 :         int level;
    2715       27641 :         struct btrfs_path *path;
    2716       27641 :         int orig_level;
    2717             : 
    2718       27641 :         path = btrfs_alloc_path();
    2719       27641 :         if (!path)
    2720             :                 return -ENOMEM;
    2721             :         /* Start at the root node; take a reference on it since the path now points to it. */
    2722       27641 :         level = btrfs_header_level(log->node);
    2723       27641 :         orig_level = level;
    2724       27641 :         path->nodes[level] = log->node;
    2725       27641 :         atomic_inc(&log->node->refs);
    2726       27641 :         path->slots[level] = 0;
    2727             :         /* A positive return ends the walk, a negative one is an error. */
    2728       27666 :         while (1) {
    2729       27666 :                 wret = walk_down_log_tree(trans, log, path, &level, wc);
    2730       27666 :                 if (wret > 0)
    2731             :                         break;
    2732       27666 :                 if (wret < 0) {
    2733           1 :                         ret = wret;
    2734           1 :                         goto out;
    2735             :                 }
    2736             : 
    2737       27665 :                 wret = walk_up_log_tree(trans, log, path, &level, wc);
    2738       27665 :                 if (wret > 0)
    2739             :                         break;
    2740          25 :                 if (wret < 0) {
    2741           0 :                         ret = wret;
    2742           0 :                         goto out;
    2743             :                 }
    2744             :         }
    2745             : 
    2746             :         /* was the root node processed? if not, catch it here */
    2747       27640 :         if (path->nodes[orig_level]) {
    2748           0 :                 ret = wc->process_func(log, path->nodes[orig_level], wc,
    2749             :                          btrfs_header_generation(path->nodes[orig_level]),
    2750             :                          orig_level);
    2751           0 :                 if (ret)
    2752           0 :                         goto out;
    2753           0 :                 if (wc->free)
    2754           0 :                         ret = clean_log_buffer(trans, path->nodes[orig_level]);
    2755             :         }
    2756             : 
    2757       27640 : out:
    2758       27641 :         btrfs_free_path(path);
    2759       27641 :         return ret;
    2760             : }
    2761             : 
    2762             : /*
    2763             :  * Helper to write the root item of a given subvolume's log root into the
    2764             :  * tree of log roots (fs_info->log_root_tree). The item is inserted when
                     :  * log->log_transid is 1 (the first sync) and updated otherwise.
                     :  *
                     :  * Returns the result of btrfs_insert_root() or btrfs_update_root().
    2765             :  */
    2766      241048 : static int update_log_root(struct btrfs_trans_handle *trans,
    2767             :                            struct btrfs_root *log,
    2768             :                            struct btrfs_root_item *root_item)
    2769             : {
    2770      241048 :         struct btrfs_fs_info *fs_info = log->fs_info;
    2771      241048 :         int ret;
    2772             : 
    2773      241048 :         if (log->log_transid == 1) {
    2774             :                 /* insert root item on the first sync */
    2775        4481 :                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
    2776        4481 :                                 &log->root_key, root_item);
    2777             :         } else {
    2778      236567 :                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
    2779             :                                 &log->root_key, root_item);
    2780             :         }
    2781      241048 :         return ret;
    2782             : }
    2783             : /*
                     :  * Sleep until log transaction 'transid' of 'root' is no longer being
                     :  * committed: the wait ends once root->log_transid_committed is at least
                     :  * 'transid' or root->log_commit[transid % 2] reads as zero.
                     :  *
                     :  * root->log_mutex is unlocked around schedule() and locked again
                     :  * afterwards, so the caller must hold it on entry and holds it on return.
                     :  */
    2784       14161 : static void wait_log_commit(struct btrfs_root *root, int transid)
    2785             : {
    2786       14161 :         DEFINE_WAIT(wait);
    2787       14161 :         int index = transid % 2;
    2788             : 
    2789             :         /*
    2790             :          * we only allow two pending log transactions at a time,
    2791             :          * so we know that if ours is more than 2 older than the
    2792             :          * current transaction, we're done
    2793             :          */
    2794       42474 :         for (;;) {
    2795       28322 :                 prepare_to_wait(&root->log_commit_wait[index],
    2796             :                                 &wait, TASK_UNINTERRUPTIBLE);
    2797             : 
    2798       28322 :                 if (!(root->log_transid_committed < transid &&
    2799       14161 :                       atomic_read(&root->log_commit[index])))
    2800             :                         break;
    2801             : 
    2802       14161 :                 mutex_unlock(&root->log_mutex);
    2803       14161 :                 schedule();
    2804       14152 :                 mutex_lock(&root->log_mutex);
    2805             :         }
    2806       14161 :         finish_wait(&root->log_commit_wait[index], &wait);
    2807       14161 : }
    2808             : /*
                     :  * Sleep on root->log_writer_wait until root->log_writers reads as zero.
                     :  *
                     :  * root->log_mutex is unlocked around schedule() and locked again before
                     :  * the counter is rechecked, so the caller must hold it on entry and
                     :  * holds it on return.
                     :  */
    2809      241869 : static void wait_for_writer(struct btrfs_root *root)
    2810             : {
    2811      241869 :         DEFINE_WAIT(wait);
    2812             : 
    2813      241883 :         for (;;) {
    2814      241876 :                 prepare_to_wait(&root->log_writer_wait, &wait,
    2815             :                                 TASK_UNINTERRUPTIBLE);
    2816      241890 :                 if (!atomic_read(&root->log_writers))
    2817             :                         break;
    2818             : 
    2819           7 :                 mutex_unlock(&root->log_mutex);
    2820           7 :                 schedule();
    2821           7 :                 mutex_lock(&root->log_mutex);
    2822             :         }
    2823      241883 :         finish_wait(&root->log_writer_wait, &wait);
    2824      241873 : }
    2825             : /*
                     :  * Unlink 'ctx' from the list it is queued on, taking root->log_mutex
                     :  * around the removal.
                     :  */
    2826         651 : static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
    2827             :                                         struct btrfs_log_ctx *ctx)
    2828             : {
    2829         651 :         mutex_lock(&root->log_mutex);
    2830         651 :         list_del_init(&ctx->list);
    2831         651 :         mutex_unlock(&root->log_mutex);
    2832         651 : }
    2833             : 
    2834             : /*
                     :  * Unlink every context queued on root->log_ctxs[index] and store 'error'
                     :  * in each context's log_ret field.
                     :  *
    2835             :  * Must be invoked with the log mutex held, or when it is certain that no
    2836             :  * other task can access the list.
    2837             :  */
    2838      480796 : static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
    2839             :                                              int index, int error)
    2840             : {
    2841      480796 :         struct btrfs_log_ctx *ctx;
    2842      480796 :         struct btrfs_log_ctx *safe;
    2843             : 
    2844      973607 :         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
    2845      492810 :                 list_del_init(&ctx->list);
    2846      492811 :                 ctx->log_ret = error;
    2847             :         }
    2848      480797 : }
    2849             : 
    2850             : /*
    2851             :  * btrfs_sync_log sends a given tree log down to the disk and
    2852             :  * updates the super blocks to record it.  When this call is done,
    2853             :  * you know that any inodes previously logged are safely on disk only
    2854             :  * if it returns 0.
    2855             :  *
    2856             :  * Any other return value means you need to call btrfs_commit_transaction.
    2857             :  * Some of the edge cases for fsyncing directories that have had unlinks
    2858             :  * or renames done in the past mean that sometimes the only safe
    2859             :  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
    2860             :  * that has happened.
    2861             :  */
    2862      251723 : int btrfs_sync_log(struct btrfs_trans_handle *trans,
    2863             :                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
    2864             : {
    2865      251723 :         int index1;
    2866      251723 :         int index2;
    2867      251723 :         int mark;
    2868      251723 :         int ret;
    2869      251723 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2870      251723 :         struct btrfs_root *log = root->log_root;
    2871      251723 :         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
    2872      251723 :         struct btrfs_root_item new_root_item;
    2873      251723 :         int log_transid = 0;
    2874      251723 :         struct btrfs_log_ctx root_log_ctx;
    2875      251723 :         struct blk_plug plug;
    2876      251723 :         u64 log_root_start;
    2877      251723 :         u64 log_root_level;
    2878             : 
    2879      251723 :         mutex_lock(&root->log_mutex);
    2880      251737 :         log_transid = ctx->log_transid;
    2881      251737 :         if (root->log_transid_committed >= log_transid) {
    2882           0 :                 mutex_unlock(&root->log_mutex);
    2883           0 :                 return ctx->log_ret;
    2884             :         }
    2885             : 
    2886      251737 :         index1 = log_transid % 2;
    2887      251737 :         if (atomic_read(&root->log_commit[index1])) {
    2888       10554 :                 wait_log_commit(root, log_transid);
    2889       10554 :                 mutex_unlock(&root->log_mutex);
    2890       10554 :                 return ctx->log_ret;
    2891             :         }
    2892      241183 :         ASSERT(log_transid == root->log_transid);
    2893      241183 :         atomic_set(&root->log_commit[index1], 1);
    2894             : 
    2895             :         /* wait for previous tree log sync to complete */
    2896      241183 :         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
    2897        2068 :                 wait_log_commit(root, log_transid - 1);
    2898             : 
    2899      241887 :         while (1) {
    2900      241887 :                 int batch = atomic_read(&root->log_batch);
    2901             :                 /* when we're on an ssd, just kick the log commit out */
    2902      483770 :                 if (!btrfs_test_opt(fs_info, SSD) &&
    2903      241883 :                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
    2904        1704 :                         mutex_unlock(&root->log_mutex);
    2905        1704 :                         schedule_timeout_uninterruptible(1);
    2906        1704 :                         mutex_lock(&root->log_mutex);
    2907             :                 }
    2908      241887 :                 wait_for_writer(root);
    2909      241854 :                 if (batch == atomic_read(&root->log_batch))
    2910             :                         break;
    2911             :         }
    2912             : 
    2913             :         /* bail out if we need to do a full commit */
    2914      241150 :         if (btrfs_need_log_full_commit(trans)) {
    2915         161 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    2916         161 :                 mutex_unlock(&root->log_mutex);
    2917         161 :                 goto out;
    2918             :         }
    2919             : 
    2920      240989 :         if (log_transid % 2 == 0)
    2921             :                 mark = EXTENT_DIRTY;
    2922             :         else
    2923      119079 :                 mark = EXTENT_NEW;
    2924             : 
    2925             :         /* we start IO on  all the marked extents here, but we don't actually
    2926             :          * wait for them until later.
    2927             :          */
    2928      240989 :         blk_start_plug(&plug);
    2929      240980 :         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
    2930             :         /*
    2931             :          * -EAGAIN happens when someone, e.g., a concurrent transaction
    2932             :          *  commit, writes a dirty extent in this tree-log commit. This
    2933             :          *  concurrent write will create a hole writing out the extents,
    2934             :          *  and we cannot proceed on a zoned filesystem, requiring
    2935             :          *  sequential writing. We could bail out to a full commit
    2936             :          *  here, but instead we continue, hoping the concurrent writing fills
    2937             :          *  the hole.
    2938             :          */
    2939      241048 :         if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
    2940             :                 ret = 0;
    2941      241048 :         if (ret) {
    2942           0 :                 blk_finish_plug(&plug);
    2943           0 :                 btrfs_set_log_full_commit(trans);
    2944           0 :                 mutex_unlock(&root->log_mutex);
    2945           0 :                 goto out;
    2946             :         }
    2947             : 
    2948             :         /*
    2949             :          * We _must_ update under the root->log_mutex in order to make sure we
    2950             :          * have a consistent view of the log root we are trying to commit at
    2951             :          * this moment.
    2952             :          *
    2953             :          * We _must_ copy this into a local copy, because we are not holding the
    2954             :          * log_root_tree->log_mutex yet.  This is important because when we
    2955             :          * commit the log_root_tree we must have a consistent view of the
    2956             :          * log_root_tree when we update the super block to point at the
    2957             :          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
    2958             :          * with the commit and possibly point at the new block which we may not
    2959             :          * have written out.
    2960             :          */
    2961      241048 :         btrfs_set_root_node(&log->root_item, log->node);
    2962      241046 :         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
    2963             : 
    2964      241046 :         root->log_transid++;
    2965      241046 :         log->log_transid = root->log_transid;
    2966      241046 :         root->log_start_pid = 0;
    2967             :         /*
    2968             :          * IO has been started, blocks of the log tree have WRITTEN flag set
    2969             :          * in their headers. new modifications of the log will be written to
    2970             :          * new positions. so it's safe to allow log writers to go in.
    2971             :          */
    2972      241046 :         mutex_unlock(&root->log_mutex);
    2973             : 
    2974      241046 :         if (btrfs_is_zoned(fs_info)) {
    2975           0 :                 mutex_lock(&fs_info->tree_root->log_mutex);
    2976           0 :                 if (!log_root_tree->node) {
    2977           0 :                         ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
    2978           0 :                         if (ret) {
    2979           0 :                                 mutex_unlock(&fs_info->tree_root->log_mutex);
    2980           0 :                                 blk_finish_plug(&plug);
    2981           0 :                                 goto out;
    2982             :                         }
    2983             :                 }
    2984           0 :                 mutex_unlock(&fs_info->tree_root->log_mutex);
    2985             :         }
    2986             : 
    2987      241046 :         btrfs_init_log_ctx(&root_log_ctx, NULL);
    2988             : 
    2989      241046 :         mutex_lock(&log_root_tree->log_mutex);
    2990             : 
    2991      241048 :         index2 = log_root_tree->log_transid % 2;
    2992      241048 :         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
    2993      241048 :         root_log_ctx.log_transid = log_root_tree->log_transid;
    2994             : 
    2995             :         /*
    2996             :          * Now we are safe to update the log_root_tree because we're under the
    2997             :          * log_mutex, and we're a current writer so we're holding the commit
    2998             :          * open until we drop the log_mutex.
    2999             :          */
    3000      241048 :         ret = update_log_root(trans, log, &new_root_item);
    3001      241048 :         if (ret) {
    3002           0 :                 if (!list_empty(&root_log_ctx.list))
    3003           0 :                         list_del_init(&root_log_ctx.list);
    3004             : 
    3005           0 :                 blk_finish_plug(&plug);
    3006           0 :                 btrfs_set_log_full_commit(trans);
    3007           0 :                 if (ret != -ENOSPC)
    3008           0 :                         btrfs_err(fs_info,
    3009             :                                   "failed to update log for root %llu ret %d",
    3010             :                                   root->root_key.objectid, ret);
    3011           0 :                 btrfs_wait_tree_log_extents(log, mark);
    3012           0 :                 mutex_unlock(&log_root_tree->log_mutex);
    3013           0 :                 goto out;
    3014             :         }
    3015             : 
    3016      241048 :         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
    3017           0 :                 blk_finish_plug(&plug);
    3018           0 :                 list_del_init(&root_log_ctx.list);
    3019           0 :                 mutex_unlock(&log_root_tree->log_mutex);
    3020           0 :                 ret = root_log_ctx.log_ret;
    3021           0 :                 goto out;
    3022             :         }
    3023             : 
    3024      241048 :         index2 = root_log_ctx.log_transid % 2;
    3025      241048 :         if (atomic_read(&log_root_tree->log_commit[index2])) {
    3026        1460 :                 blk_finish_plug(&plug);
    3027        1460 :                 ret = btrfs_wait_tree_log_extents(log, mark);
    3028        1460 :                 wait_log_commit(log_root_tree,
    3029             :                                 root_log_ctx.log_transid);
    3030        1460 :                 mutex_unlock(&log_root_tree->log_mutex);
    3031        1460 :                 if (!ret)
    3032        1460 :                         ret = root_log_ctx.log_ret;
    3033        1460 :                 goto out;
    3034             :         }
    3035      239588 :         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
    3036      239588 :         atomic_set(&log_root_tree->log_commit[index2], 1);
    3037             : 
    3038      239588 :         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
    3039          79 :                 wait_log_commit(log_root_tree,
    3040             :                                 root_log_ctx.log_transid - 1);
    3041             :         }
    3042             : 
    3043             :         /*
    3044             :          * now that we've moved on to the tree of log tree roots,
    3045             :          * check the full commit flag again
    3046             :          */
    3047      239588 :         if (btrfs_need_log_full_commit(trans)) {
    3048           2 :                 blk_finish_plug(&plug);
    3049           2 :                 btrfs_wait_tree_log_extents(log, mark);
    3050           2 :                 mutex_unlock(&log_root_tree->log_mutex);
    3051           2 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    3052           2 :                 goto out_wake_log_root;
    3053             :         }
    3054             : 
    3055      239586 :         ret = btrfs_write_marked_extents(fs_info,
    3056             :                                          &log_root_tree->dirty_log_pages,
    3057             :                                          EXTENT_DIRTY | EXTENT_NEW);
    3058      239586 :         blk_finish_plug(&plug);
    3059             :         /*
    3060             :          * As described above, -EAGAIN indicates a hole in the extents. We
    3061             :          * cannot wait for these write outs since the waiting would cause a
    3062             :          * deadlock. Bail out to the full commit instead.
    3063             :          */
    3064      239586 :         if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
    3065           0 :                 btrfs_set_log_full_commit(trans);
    3066           0 :                 btrfs_wait_tree_log_extents(log, mark);
    3067           0 :                 mutex_unlock(&log_root_tree->log_mutex);
    3068           0 :                 goto out_wake_log_root;
    3069      239586 :         } else if (ret) {
    3070           0 :                 btrfs_set_log_full_commit(trans);
    3071           0 :                 mutex_unlock(&log_root_tree->log_mutex);
    3072           0 :                 goto out_wake_log_root;
    3073             :         }
    3074      239586 :         ret = btrfs_wait_tree_log_extents(log, mark);
    3075      239586 :         if (!ret)
    3076      239585 :                 ret = btrfs_wait_tree_log_extents(log_root_tree,
    3077             :                                                   EXTENT_NEW | EXTENT_DIRTY);
    3078      239586 :         if (ret) {
    3079           1 :                 btrfs_set_log_full_commit(trans);
    3080           1 :                 mutex_unlock(&log_root_tree->log_mutex);
    3081           1 :                 goto out_wake_log_root;
    3082             :         }
    3083             : 
    3084      239585 :         log_root_start = log_root_tree->node->start;
    3085      239585 :         log_root_level = btrfs_header_level(log_root_tree->node);
    3086      239585 :         log_root_tree->log_transid++;
    3087      239585 :         mutex_unlock(&log_root_tree->log_mutex);
    3088             : 
    3089             :         /*
    3090             :          * Here we are guaranteed that nobody is going to write the superblock
    3091             :          * for the current transaction before us and that we do not write
    3092             :          * our superblock before the previous transaction finishes its commit
    3093             :          * and writes its superblock, because:
    3094             :          *
    3095             :          * 1) We are holding a handle on the current transaction, so nobody
    3096             :          *    can commit it until we release the handle;
    3097             :          *
    3098             :          * 2) Before writing our superblock we acquire the tree_log_mutex, so
    3099             :          *    if the previous transaction is still committing, and hasn't yet
    3100             :          *    written its superblock, we wait for it to do it, because a
    3101             :          *    transaction commit acquires the tree_log_mutex when the commit
    3102             :          *    begins and releases it only after writing its superblock.
    3103             :          */
    3104      239585 :         mutex_lock(&fs_info->tree_log_mutex);
    3105             : 
    3106             :         /*
    3107             :          * The previous transaction writeout phase could have failed, and thus
    3108             :          * marked the fs in an error state.  We must not commit here, as we
    3109             :          * could have updated our generation in the super_for_commit and
    3110             :          * writing the super here would result in transid mismatches.  If there
    3111             :          * is an error here just bail.
    3112             :          */
    3113      239585 :         if (BTRFS_FS_ERROR(fs_info)) {
    3114           0 :                 ret = -EIO;
    3115           0 :                 btrfs_set_log_full_commit(trans);
    3116           0 :                 btrfs_abort_transaction(trans, ret);
    3117           0 :                 mutex_unlock(&fs_info->tree_log_mutex);
    3118           0 :                 goto out_wake_log_root;
    3119             :         }
    3120             : 
    3121      239585 :         btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
    3122      239585 :         btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
    3123      239585 :         ret = write_all_supers(fs_info, 1);
    3124      239585 :         mutex_unlock(&fs_info->tree_log_mutex);
    3125      239585 :         if (ret) {
    3126           0 :                 btrfs_set_log_full_commit(trans);
    3127           0 :                 btrfs_abort_transaction(trans, ret);
    3128           0 :                 goto out_wake_log_root;
    3129             :         }
    3130             : 
    3131             :         /*
    3132             :          * We know there can only be one task here, since we have not yet set
    3133             :          * root->log_commit[index1] to 0 and any task attempting to sync the
    3134             :          * log must wait for the previous log transaction to commit if it's
    3135             :          * still in progress or wait for the current log transaction commit if
    3136             :          * someone else already started it. We use <= and not < because the
    3137             :          * first log transaction has an ID of 0.
    3138             :          */
    3139      239585 :         ASSERT(root->last_log_commit <= log_transid);
    3140      239585 :         root->last_log_commit = log_transid;
    3141             : 
    3142      239588 : out_wake_log_root:
    3143      239588 :         mutex_lock(&log_root_tree->log_mutex);
    3144      239588 :         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
    3145             : 
    3146      239588 :         log_root_tree->log_transid_committed++;
    3147      239588 :         atomic_set(&log_root_tree->log_commit[index2], 0);
    3148      239588 :         mutex_unlock(&log_root_tree->log_mutex);
    3149             : 
    3150             :         /*
    3151             :          * The barrier before waitqueue_active (in cond_wake_up) is needed so
    3152             :          * all the updates above are seen by the woken threads. It might not be
    3153             :          * necessary, but proving that seems to be hard.
    3154             :          */
    3155      239588 :         cond_wake_up(&log_root_tree->log_commit_wait[index2]);
    3156      241209 : out:
    3157      241209 :         mutex_lock(&root->log_mutex);
    3158      241208 :         btrfs_remove_all_log_ctxs(root, index1, ret);
    3159      241209 :         root->log_transid_committed++;
    3160      241209 :         atomic_set(&root->log_commit[index1], 0);
    3161      241209 :         mutex_unlock(&root->log_mutex);
    3162             : 
    3163             :         /*
    3164             :          * The barrier before waitqueue_active (in cond_wake_up) is needed so
    3165             :          * all the updates above are seen by the woken threads. It might not be
    3166             :          * necessary, but proving that seems to be hard.
    3167             :          */
    3168      241209 :         cond_wake_up(&root->log_commit_wait[index1]);
    3169      241209 :         return ret;
    3170             : }
    3171             : 
    3172        9566 : static void free_log_tree(struct btrfs_trans_handle *trans,
    3173             :                           struct btrfs_root *log)
    3174             : {
    3175        9566 :         int ret;
    3176        9566 :         struct walk_control wc = {
    3177             :                 .free = 1,
    3178             :                 .process_func = process_one_buffer
    3179             :         };
    3180             : 
    3181        9566 :         if (log->node) {
    3182        9566 :                 ret = walk_log_tree(trans, log, &wc);
    3183        9566 :                 if (ret) {
    3184             :                         /*
    3185             :                          * We weren't able to traverse the entire log tree, the
    3186             :                          * typical scenario is getting an -EIO when reading an
    3187             :                          * extent buffer of the tree, due to a previous writeback
    3188             :                          * failure of it.
    3189             :                          */
    3190           1 :                         set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
    3191           1 :                                 &log->fs_info->fs_state);
    3192             : 
    3193             :                         /*
    3194             :                          * Some extent buffers of the log tree may still be dirty
    3195             :                          * and not yet written back to storage, because we may
    3196             :                          * have updates to a log tree without syncing a log tree,
    3197             :                          * such as during rename and link operations. So flush
    3198             :                          * them out and wait for their writeback to complete, so
    3199             :                          * that we properly clean up their state and pages.
    3200             :                          */
    3201           1 :                         btrfs_write_marked_extents(log->fs_info,
    3202             :                                                    &log->dirty_log_pages,
    3203             :                                                    EXTENT_DIRTY | EXTENT_NEW);
    3204           1 :                         btrfs_wait_tree_log_extents(log,
    3205             :                                                     EXTENT_DIRTY | EXTENT_NEW);
    3206             : 
    3207           1 :                         if (trans)
    3208           1 :                                 btrfs_abort_transaction(trans, ret);
    3209             :                         else
    3210           0 :                                 btrfs_handle_fs_error(log->fs_info, ret, NULL);
    3211             :                 }
    3212             :         }
    3213             : 
    3214        9566 :         clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
    3215             :                           EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
    3216        9566 :         extent_io_tree_release(&log->log_csum_range);
    3217             : 
    3218        9566 :         btrfs_put_root(log);
    3219        9566 : }
    3220             : 
    3221             : /*
    3222             :  * free all the extents used by the tree log.  This should be called
    3223             :  * at commit time of the full transaction
    3224             :  */
    3225      152455 : int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
    3226             : {
    3227      152455 :         if (root->log_root) {
    3228        4809 :                 free_log_tree(trans, root->log_root);
    3229        4809 :                 root->log_root = NULL;
    3230        4809 :                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
    3231             :         }
    3232      152455 :         return 0;
    3233             : }
    3234             : 
    3235      203035 : int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
    3236             :                              struct btrfs_fs_info *fs_info)
    3237             : {
    3238      203035 :         if (fs_info->log_root_tree) {
    3239        4757 :                 free_log_tree(trans, fs_info->log_root_tree);
    3240        4757 :                 fs_info->log_root_tree = NULL;
    3241        4757 :                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
    3242             :         }
    3243      203035 :         return 0;
    3244             : }
    3245             : 
    3246             : /*
    3247             :  * Check if an inode was logged in the current transaction. This correctly deals
    3248             :  * with the case where the inode was logged but has a logged_trans of 0, which
    3249             :  * happens if the inode is evicted and loaded again, as logged_trans is an in
    3250             :  * memory only field (not persisted).
    3251             :  *
    3252             :  * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
    3253             :  * and < 0 on error.
    3254             :  */
    3255     4417296 : static int inode_logged(const struct btrfs_trans_handle *trans,
    3256             :                         struct btrfs_inode *inode,
    3257             :                         struct btrfs_path *path_in)
    3258             : {
    3259     4417296 :         struct btrfs_path *path = path_in;
    3260     4417296 :         struct btrfs_key key;
    3261     4417296 :         int ret;
    3262             : 
    3263     4417296 :         if (inode->logged_trans == trans->transid)
    3264             :                 return 1;
    3265             : 
    3266             :         /*
    3267             :          * If logged_trans is not 0, then we know the inode was not logged
    3268             :          * in this transaction, so we can return false right away.
    3269             :          */
    3270     4160341 :         if (inode->logged_trans > 0)
    3271             :                 return 0;
    3272             : 
    3273             :         /*
    3274             :          * If no log tree was created for this root in this transaction, then
    3275             :          * the inode can not have been logged in this transaction. In that case
    3276             :          * set logged_trans to anything greater than 0 and less than the current
    3277             :          * transaction's ID, to avoid the search below in a future call in case
    3278             :          * a log tree gets created after this.
    3279             :          */
    3280     1450546 :         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
    3281     1424845 :                 inode->logged_trans = trans->transid - 1;
    3282     1424845 :                 return 0;
    3283             :         }
    3284             : 
    3285             :         /*
    3286             :          * We have a log tree and the inode's logged_trans is 0. We can't tell
    3287             :          * for sure if the inode was logged before in this transaction by looking
    3288             :          * only at logged_trans. We could be pessimistic and assume it was, but
    3289             :          * that can lead to unnecessarily logging an inode during rename and link
    3290             :          * operations, and then further updating the log in followup rename and
    3291             :          * link operations, especially if it's a directory, which adds latency
    3292             :          * visible to applications doing a series of rename or link operations.
    3293             :          *
    3294             :          * A logged_trans of 0 here can mean several things:
    3295             :          *
    3296             :          * 1) The inode was never logged since the filesystem was mounted, and may
    3297             :          *    or may not have been evicted and loaded again;
    3298             :          *
    3299             :          * 2) The inode was logged in a previous transaction, then evicted and
    3300             :          *    then loaded again;
    3301             :          *
    3302             :          * 3) The inode was logged in the current transaction, then evicted and
    3303             :          *    then loaded again.
    3304             :          *
    3305             :          * For cases 1) and 2) we don't want to return true, but we need to detect
    3306             :          * case 3) and return true. So we do a search in the log root for the inode
    3307             :          * item.
    3308             :          */
    3309       25701 :         key.objectid = btrfs_ino(inode);
    3310       25701 :         key.type = BTRFS_INODE_ITEM_KEY;
    3311       25701 :         key.offset = 0;
    3312             : 
    3313       25701 :         if (!path) {
    3314       20310 :                 path = btrfs_alloc_path();
    3315       20310 :                 if (!path)
    3316             :                         return -ENOMEM;
    3317             :         }
    3318             : 
    3319       25701 :         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
    3320             : 
    3321       25699 :         if (path_in)
    3322        5389 :                 btrfs_release_path(path);
    3323             :         else
    3324       20310 :                 btrfs_free_path(path);
    3325             : 
    3326             :         /*
    3327             :          * Logging an inode always results in logging its inode item. So if we
    3328             :          * did not find the item we know the inode was not logged for sure.
    3329             :          */
    3330       25701 :         if (ret < 0) {
    3331             :                 return ret;
    3332       25701 :         } else if (ret > 0) {
    3333             :                 /*
    3334             :                  * Set logged_trans to a value greater than 0 and less than the
    3335             :                  * current transaction to avoid doing the search in future calls.
    3336             :                  */
    3337       25695 :                 inode->logged_trans = trans->transid - 1;
    3338       25695 :                 return 0;
    3339             :         }
    3340             : 
    3341             :         /*
    3342             :          * The inode was previously logged and then evicted, set logged_trans to
    3343             :          * the current transaction's ID, to avoid future tree searches as long as
    3344             :          * the inode is not evicted again.
    3345             :          */
    3346           6 :         inode->logged_trans = trans->transid;
    3347             : 
    3348             :         /*
    3349             :          * If it's a directory, then we must set last_dir_index_offset to the
    3350             :          * maximum possible value, so that the next attempt to log the inode does
    3351             :          * not skip checking if dir index keys found in modified subvolume tree
    3352             :          * leaves have been logged before, otherwise it would result in attempts
    3353             :          * to insert duplicate dir index keys in the log tree. This must be done
    3354             :          * because last_dir_index_offset is an in-memory only field, not persisted
    3355             :          * in the inode item or any other on-disk structure, so its value is lost
    3356             :          * once the inode is evicted.
    3357             :          */
    3358           6 :         if (S_ISDIR(inode->vfs_inode.i_mode))
    3359           1 :                 inode->last_dir_index_offset = (u64)-1;
    3360             : 
    3361             :         return 1;
    3362             : }
    3363             : 
    3364             : /*
    3365             :  * Delete a directory entry from the log if it exists.
    3366             :  *
    3367             :  * Returns < 0 on error
    3368             :  *           1 if the entry does not exist
    3369             :  *           0 if the entry existed and was successfully deleted
    3370             :  */
    3371        1183 : static int del_logged_dentry(struct btrfs_trans_handle *trans,
    3372             :                              struct btrfs_root *log,
    3373             :                              struct btrfs_path *path,
    3374             :                              u64 dir_ino,
    3375             :                              const struct fscrypt_str *name,
    3376             :                              u64 index)
    3377             : {
    3378        1183 :         struct btrfs_dir_item *di;
    3379             : 
    3380             :         /*
    3381             :          * We only log dir index items of a directory, so we don't need to look
    3382             :          * for dir item keys.
    3383             :          */
    3384        1183 :         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
    3385             :                                          index, name, -1);
    3386        1183 :         if (IS_ERR(di))
    3387           0 :                 return PTR_ERR(di);
    3388        1183 :         else if (!di)
    3389             :                 return 1;
    3390             : 
    3391             :         /*
    3392             :          * We do not need to update the size field of the directory's
    3393             :          * inode item because on log replay we update the field to reflect
    3394             :          * all existing entries in the directory (see overwrite_item()).
    3395             :          */
    3396         206 :         return btrfs_delete_one_dir_name(trans, log, path, di);
    3397             : }
    3398             : 
    3399             : /*
    3400             :  * If both a file and directory are logged, and unlinks or renames are
    3401             :  * mixed in, we have a few interesting corners:
    3402             :  *
    3403             :  * create file X in dir Y
    3404             :  * link file X to X.link in dir Y
    3405             :  * fsync file X
    3406             :  * unlink file X but leave X.link
    3407             :  * fsync dir Y
    3408             :  *
    3409             :  * After a crash we would expect only X.link to exist.  But file X
    3410             :  * didn't get fsync'd again so the log has back refs for X and X.link.
    3411             :  *
    3412             :  * We solve this by removing directory entries and inode backrefs from the
    3413             :  * log when a file that was logged in the current transaction is
    3414             :  * unlinked.  Any later fsync will include the updated log entries, and
    3415             :  * we'll be able to reconstruct the proper directory items from backrefs.
    3416             :  *
    3417             :  * This optimization allows us to avoid relogging the entire inode
    3418             :  * or the entire directory.
    3419             :  */
    3420     1422535 : void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
    3421             :                                   struct btrfs_root *root,
    3422             :                                   const struct fscrypt_str *name,
    3423             :                                   struct btrfs_inode *dir, u64 index)
    3424             : {
    3425     1422535 :         struct btrfs_path *path;
    3426     1422535 :         int ret;
    3427             : 
    3428     1422535 :         ret = inode_logged(trans, dir, NULL);
    3429     1422535 :         if (ret == 0)
    3430             :                 return;
    3431         259 :         else if (ret < 0) {
    3432           0 :                 btrfs_set_log_full_commit(trans);
    3433           0 :                 return;
    3434             :         }
    3435             : 
    3436         259 :         ret = join_running_log_trans(root);
    3437         259 :         if (ret)
    3438             :                 return;
    3439             : 
    3440         259 :         mutex_lock(&dir->log_mutex);
    3441             : 
    3442         259 :         path = btrfs_alloc_path();
    3443         259 :         if (!path) {
    3444           0 :                 ret = -ENOMEM;
    3445           0 :                 goto out_unlock;
    3446             :         }
    3447             : 
    3448         259 :         ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
    3449             :                                 name, index);
    3450         259 :         btrfs_free_path(path);
    3451         259 : out_unlock:
    3452         259 :         mutex_unlock(&dir->log_mutex);
    3453         259 :         if (ret < 0)
    3454           0 :                 btrfs_set_log_full_commit(trans);
    3455         259 :         btrfs_end_log_trans(root);
    3456             : }
    3457             : 
    3458             : /* see comments for btrfs_del_dir_entries_in_log */
    3459     1422556 : void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
    3460             :                                 struct btrfs_root *root,
    3461             :                                 const struct fscrypt_str *name,
    3462             :                                 struct btrfs_inode *inode, u64 dirid)
    3463             : {
    3464     1422556 :         struct btrfs_root *log;
    3465     1422556 :         u64 index;
    3466     1422556 :         int ret;
    3467             : 
    3468     1422556 :         ret = inode_logged(trans, inode, NULL);
    3469     1422548 :         if (ret == 0)
    3470     1421994 :                 return;
    3471         554 :         else if (ret < 0) {
    3472           0 :                 btrfs_set_log_full_commit(trans);
    3473           0 :                 return;
    3474             :         }
    3475             : 
    3476         554 :         ret = join_running_log_trans(root);
    3477         554 :         if (ret)
    3478             :                 return;
    3479         554 :         log = root->log_root;
    3480         554 :         mutex_lock(&inode->log_mutex);
    3481             : 
    3482         554 :         ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
    3483             :                                   dirid, &index);
    3484         554 :         mutex_unlock(&inode->log_mutex);
    3485         554 :         if (ret < 0 && ret != -ENOENT)
    3486           0 :                 btrfs_set_log_full_commit(trans);
    3487         554 :         btrfs_end_log_trans(root);
    3488             : }
    3489             : 
    3490             : /*
    3491             :  * creates a range item in the log for 'dirid'.  first_offset and
    3492             :  * last_offset tell us which parts of the key space the log should
    3493             :  * be considered authoritative for.
    3494             :  */
    3495        3071 : static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
    3496             :                                        struct btrfs_root *log,
    3497             :                                        struct btrfs_path *path,
    3498             :                                        u64 dirid,
    3499             :                                        u64 first_offset, u64 last_offset)
    3500             : {
    3501        3071 :         int ret;
    3502        3071 :         struct btrfs_key key;
    3503        3071 :         struct btrfs_dir_log_item *item;
    3504             : 
    3505        3071 :         key.objectid = dirid;
    3506        3071 :         key.offset = first_offset;
    3507        3071 :         key.type = BTRFS_DIR_LOG_INDEX_KEY;
    3508        3071 :         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
    3509             :         /*
    3510             :          * -EEXIST is fine and can happen sporadically when we are logging a
    3511             :          * directory and have concurrent insertions in the subvolume's tree for
    3512             :          * items from other inodes, which result in pushing some dir items from
    3513             :          * one leaf to another in order to accommodate the new items.
    3514             :          * This results in logging the same dir index range key.
    3515             :          */
    3516        3071 :         if (ret && ret != -EEXIST)
    3517             :                 return ret;
    3518             : 
    3519        3071 :         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    3520             :                               struct btrfs_dir_log_item);
    3521        3071 :         if (ret == -EEXIST) {
    3522          53 :                 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
    3523             : 
    3524             :                 /*
    3525             :                  * btrfs_del_dir_entries_in_log() might have been called during
    3526             :                  * an unlink between the initial insertion of this key and the
    3527             :                  * current update, or we might be logging a single entry deletion
    3528             :                  * during a rename, so set the new last_offset to the max value.
    3529             :                  */
    3530          53 :                 last_offset = max(last_offset, curr_end);
    3531             :         }
    3532        3071 :         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
    3533        3071 :         btrfs_mark_buffer_dirty(path->nodes[0]);
    3534        3071 :         btrfs_release_path(path);
    3535        3071 :         return 0;
    3536             : }
    3537             : 
    3538          86 : static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
    3539             :                                  struct btrfs_inode *inode,
    3540             :                                  struct extent_buffer *src,
    3541             :                                  struct btrfs_path *dst_path,
    3542             :                                  int start_slot,
    3543             :                                  int count)
    3544             : {
    3545          86 :         struct btrfs_root *log = inode->root->log_root;
    3546          86 :         char *ins_data = NULL;
    3547          86 :         struct btrfs_item_batch batch;
    3548          86 :         struct extent_buffer *dst;
    3549          86 :         unsigned long src_offset;
    3550          86 :         unsigned long dst_offset;
    3551          86 :         u64 last_index;
    3552          86 :         struct btrfs_key key;
    3553          86 :         u32 item_size;
    3554          86 :         int ret;
    3555          86 :         int i;
    3556             : 
    3557          86 :         ASSERT(count > 0);
    3558          86 :         batch.nr = count;
    3559             : 
    3560          86 :         if (count == 1) {
    3561          60 :                 btrfs_item_key_to_cpu(src, &key, start_slot);
    3562          60 :                 item_size = btrfs_item_size(src, start_slot);
    3563          60 :                 batch.keys = &key;
    3564          60 :                 batch.data_sizes = &item_size;
    3565          60 :                 batch.total_data_size = item_size;
    3566             :         } else {
    3567          26 :                 struct btrfs_key *ins_keys;
    3568          26 :                 u32 *ins_sizes;
    3569             : 
    3570          26 :                 ins_data = kmalloc(count * sizeof(u32) +
    3571             :                                    count * sizeof(struct btrfs_key), GFP_NOFS);
    3572          26 :                 if (!ins_data)
    3573             :                         return -ENOMEM;
    3574             : 
    3575          26 :                 ins_sizes = (u32 *)ins_data;
    3576          26 :                 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
    3577          26 :                 batch.keys = ins_keys;
    3578          26 :                 batch.data_sizes = ins_sizes;
    3579          26 :                 batch.total_data_size = 0;
    3580             : 
    3581          89 :                 for (i = 0; i < count; i++) {
    3582          63 :                         const int slot = start_slot + i;
    3583             : 
    3584          63 :                         btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
    3585          63 :                         ins_sizes[i] = btrfs_item_size(src, slot);
    3586          63 :                         batch.total_data_size += ins_sizes[i];
    3587             :                 }
    3588             :         }
    3589             : 
    3590          86 :         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
    3591          86 :         if (ret)
    3592           0 :                 goto out;
    3593             : 
    3594          86 :         dst = dst_path->nodes[0];
    3595             :         /*
    3596             :          * Copy all the items in bulk, in a single copy operation. Item data is
    3597             :          * organized such that it's placed at the end of a leaf and from right
    3598             :          * to left. For example, the data for the second item ends at an offset
    3599             :          * that matches the offset where the data for the first item starts, the
    3600             :          * data for the third item ends at an offset that matches the offset
    3601             :          * where the data of the second item starts, and so on.
    3602             :          * Therefore our source and destination start offsets for copy match the
    3603             :          * offsets of the last items (highest slots).
    3604             :          */
    3605          86 :         dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
    3606          86 :         src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
    3607          86 :         copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
    3608          86 :         btrfs_release_path(dst_path);
    3609             : 
    3610          86 :         last_index = batch.keys[count - 1].offset;
    3611          86 :         ASSERT(last_index > inode->last_dir_index_offset);
    3612             : 
    3613             :         /*
    3614             :          * If for some unexpected reason the last item's index is not greater
    3615             :          * than the last index we logged, warn and force a transaction commit.
    3616             :          */
    3617          86 :         if (WARN_ON(last_index <= inode->last_dir_index_offset))
    3618             :                 ret = BTRFS_LOG_FORCE_COMMIT;
    3619             :         else
    3620          86 :                 inode->last_dir_index_offset = last_index;
    3621             : 
    3622          86 :         if (btrfs_get_first_dir_index_to_log(inode) == 0)
    3623          85 :                 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
    3624           1 : out:
    3625          86 :         kfree(ins_data);
    3626             : 
    3627          86 :         return ret;
    3628             : }
    3629             : 
    3630         507 : static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
    3631             :                                   struct btrfs_inode *inode,
    3632             :                                   struct btrfs_path *path,
    3633             :                                   struct btrfs_path *dst_path,
    3634             :                                   struct btrfs_log_ctx *ctx,
    3635             :                                   u64 *last_old_dentry_offset)
    3636             : {
    3637         507 :         struct btrfs_root *log = inode->root->log_root;
    3638         507 :         struct extent_buffer *src;
    3639         507 :         const int nritems = btrfs_header_nritems(path->nodes[0]);
    3640         507 :         const u64 ino = btrfs_ino(inode);
    3641         507 :         bool last_found = false;
    3642         507 :         int batch_start = 0;
    3643         507 :         int batch_size = 0;
    3644         507 :         int i;
    3645             : 
    3646             :         /*
    3647             :          * We need to clone the leaf, release the read lock on it, and use the
    3648             :          * clone before modifying the log tree. See the comment at copy_items()
    3649             :          * about why we need to do this.
    3650             :          */
    3651         507 :         src = btrfs_clone_extent_buffer(path->nodes[0]);
    3652         507 :         if (!src)
    3653             :                 return -ENOMEM;
    3654             : 
    3655         507 :         i = path->slots[0];
    3656         507 :         btrfs_release_path(path);
    3657         507 :         path->nodes[0] = src;
    3658         507 :         path->slots[0] = i;
    3659             : 
    3660        3887 :         for (; i < nritems; i++) {
    3661        3864 :                 struct btrfs_dir_item *di;
    3662        3864 :                 struct btrfs_key key;
    3663        3864 :                 int ret;
    3664             : 
    3665        3864 :                 btrfs_item_key_to_cpu(src, &key, i);
    3666             : 
    3667        3864 :                 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
    3668         484 :                         last_found = true;
    3669         484 :                         break;
    3670             :                 }
    3671             : 
    3672        3380 :                 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
    3673             : 
    3674             :                 /*
    3675             :                  * Skip ranges of items that consist only of dir item keys created
    3676             :                  * in past transactions. However if we find a gap, we must log a
    3677             :                  * dir index range item for that gap, so that index keys in that
    3678             :                  * gap are deleted during log replay.
    3679             :                  */
    3680        3380 :                 if (btrfs_dir_transid(src, di) < trans->transid) {
    3681        3247 :                         if (key.offset > *last_old_dentry_offset + 1) {
    3682        1242 :                                 ret = insert_dir_log_key(trans, log, dst_path,
    3683             :                                                  ino, *last_old_dentry_offset + 1,
    3684             :                                                  key.offset - 1);
    3685        1242 :                                 if (ret < 0)
    3686           0 :                                         return ret;
    3687             :                         }
    3688             : 
    3689        3247 :                         *last_old_dentry_offset = key.offset;
    3690        3257 :                         continue;
    3691             :                 }
    3692             : 
    3693             :                 /* If we logged this dir index item before, we can skip it. */
    3694         133 :                 if (key.offset <= inode->last_dir_index_offset)
    3695          10 :                         continue;
    3696             : 
    3697             :                 /*
    3698             :                  * We must make sure that when we log a directory entry, the
    3699             :                  * corresponding inode, after log replay, has a matching link
    3700             :                  * count. For example:
    3701             :                  *
    3702             :                  * touch foo
    3703             :                  * mkdir mydir
    3704             :                  * sync
    3705             :                  * ln foo mydir/bar
    3706             :                  * xfs_io -c "fsync" mydir
    3707             :                  * <crash>
    3708             :                  * <mount fs and log replay>
    3709             :                  *
    3710             :                  * Would result in a fsync log that when replayed, our file inode
    3711             :                  * would have a link count of 1, but we get two directory entries
    3712             :                  * pointing to the same inode. After removing one of the names,
    3713             :                  * it would not be possible to remove the other name, which
    3714             :                  * resulted always in stale file handle errors, and would not be
    3715             :                  * possible to rmdir the parent directory, since its i_size could
    3716             :                  * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
    3717             :                  * resulting in -ENOTEMPTY errors.
    3718             :                  */
    3719         123 :                 if (!ctx->log_new_dentries) {
    3720          86 :                         struct btrfs_key di_key;
    3721             : 
    3722          86 :                         btrfs_dir_item_key_to_cpu(src, di, &di_key);
    3723          86 :                         if (di_key.type != BTRFS_ROOT_ITEM_KEY)
    3724          86 :                                 ctx->log_new_dentries = true;
    3725             :                 }
    3726             : 
    3727         123 :                 if (batch_size == 0)
    3728          86 :                         batch_start = i;
    3729         123 :                 batch_size++;
    3730             :         }
    3731             : 
    3732         507 :         if (batch_size > 0) {
    3733          86 :                 int ret;
    3734             : 
    3735          86 :                 ret = flush_dir_items_batch(trans, inode, src, dst_path,
    3736             :                                             batch_start, batch_size);
    3737          86 :                 if (ret < 0)
    3738             :                         return ret;
    3739             :         }
    3740             : 
    3741         507 :         return last_found ? 1 : 0;
    3742             : }
    3743             : 
    3744             : /*
    3745             :  * log all the items included in the current transaction for a given
    3746             :  * directory.  This also creates the range items in the log tree required
    3747             :  * to replay anything deleted before the fsync
    3748             :  */
    3749         801 : static noinline int log_dir_items(struct btrfs_trans_handle *trans,
    3750             :                           struct btrfs_inode *inode,
    3751             :                           struct btrfs_path *path,
    3752             :                           struct btrfs_path *dst_path,
    3753             :                           struct btrfs_log_ctx *ctx,
    3754             :                           u64 min_offset, u64 *last_offset_ret)
    3755             : {
    3756         801 :         struct btrfs_key min_key;
    3757         801 :         struct btrfs_root *root = inode->root;
    3758         801 :         struct btrfs_root *log = root->log_root;
    3759         801 :         int ret;
    3760         801 :         u64 last_old_dentry_offset = min_offset - 1;
    3761         801 :         u64 last_offset = (u64)-1;
    3762         801 :         u64 ino = btrfs_ino(inode);
    3763             : 
    3764         801 :         min_key.objectid = ino;
    3765         801 :         min_key.type = BTRFS_DIR_INDEX_KEY;
    3766         801 :         min_key.offset = min_offset;
    3767             : 
    3768         801 :         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
    3769             : 
    3770             :         /*
    3771             :          * we didn't find anything from this transaction, see if there
    3772             :          * is anything at all
    3773             :          */
    3774         801 :         if (ret != 0 || min_key.objectid != ino ||
    3775         494 :             min_key.type != BTRFS_DIR_INDEX_KEY) {
    3776         307 :                 min_key.objectid = ino;
    3777         307 :                 min_key.type = BTRFS_DIR_INDEX_KEY;
    3778         307 :                 min_key.offset = (u64)-1;
    3779         307 :                 btrfs_release_path(path);
    3780         307 :                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
    3781         307 :                 if (ret < 0) {
    3782           0 :                         btrfs_release_path(path);
    3783           0 :                         return ret;
    3784             :                 }
    3785         307 :                 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
    3786             : 
    3787             :                 /* if ret == 0 there are items for this type,
    3788             :                  * create a range to tell us the last key of this type.
    3789             :                  * otherwise, there are no items in this directory after
    3790             :                  * *min_offset, and we create a range to indicate that.
    3791             :                  */
    3792         307 :                 if (ret == 0) {
    3793          33 :                         struct btrfs_key tmp;
    3794             : 
    3795          33 :                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
    3796             :                                               path->slots[0]);
    3797          33 :                         if (tmp.type == BTRFS_DIR_INDEX_KEY)
    3798          33 :                                 last_old_dentry_offset = tmp.offset;
    3799         274 :                 } else if (ret > 0) {
    3800             :                         ret = 0;
    3801             :                 }
    3802             : 
    3803         307 :                 goto done;
    3804             :         }
    3805             : 
    3806             :         /* go backward to find any previous key */
    3807         494 :         ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
    3808         494 :         if (ret == 0) {
    3809           0 :                 struct btrfs_key tmp;
    3810             : 
    3811           0 :                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
    3812             :                 /*
    3813             :                  * The dir index key before the first one we found that needs to
    3814             :                  * be logged might be in a previous leaf, and there might be a
    3815             :                  * gap between these keys, meaning that we had deletions that
    3816             :                  * happened. So the key range item we log (key type
    3817             :                  * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
    3818             :                  * previous key's offset plus 1, so that those deletes are replayed.
    3819             :                  */
    3820           0 :                 if (tmp.type == BTRFS_DIR_INDEX_KEY)
    3821           0 :                         last_old_dentry_offset = tmp.offset;
    3822         494 :         } else if (ret < 0) {
    3823           0 :                 goto done;
    3824             :         }
    3825             : 
    3826         494 :         btrfs_release_path(path);
    3827             : 
    3828             :         /*
    3829             :          * Find the first key from this transaction again or the one we were at
    3830             :          * in the loop below in case we had to reschedule. We may be logging the
    3831             :          * directory without holding its VFS lock, which happens when logging new
    3832             :          * dentries (through log_new_dir_dentries()) or in some cases when we
    3833             :          * need to log the parent directory of an inode. This means a dir index
    3834             :          * key might be deleted from the inode's root, and therefore we may not
    3835             :          * find it anymore. If we can't find it, just move to the next key. We
    3836             :          * can not bail out and ignore, because if we do that we will simply
    3837             :          * not log dir index keys that come after the one that was just deleted
    3838             :          * and we can end up logging a dir index range that ends at (u64)-1
    3839             :          * (@last_offset is initialized to that), resulting in removing dir
    3840             :          * entries we should not remove at log replay time.
    3841             :          */
    3842         494 : search:
    3843         494 :         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
    3844         494 :         if (ret > 0) {
    3845           0 :                 ret = btrfs_next_item(root, path);
    3846           0 :                 if (ret > 0) {
    3847             :                         /* There are no more keys in the inode's root. */
    3848           0 :                         ret = 0;
    3849           0 :                         goto done;
    3850             :                 }
    3851             :         }
    3852         494 :         if (ret < 0)
    3853           0 :                 goto done;
    3854             : 
    3855             :         /*
    3856             :          * we have a block from this transaction, log every item in it
    3857             :          * from our directory
    3858             :          */
    3859         507 :         while (1) {
    3860         507 :                 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
    3861             :                                              &last_old_dentry_offset);
    3862         507 :                 if (ret != 0) {
    3863         484 :                         if (ret > 0)
    3864             :                                 ret = 0;
    3865         484 :                         goto done;
    3866             :                 }
    3867          23 :                 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
    3868             : 
    3869             :                 /*
    3870             :                  * look ahead to the next item and see if it is also
    3871             :                  * from this directory and from this transaction
    3872             :                  */
    3873          23 :                 ret = btrfs_next_leaf(root, path);
    3874          23 :                 if (ret) {
    3875           3 :                         if (ret == 1) {
    3876           3 :                                 last_offset = (u64)-1;
    3877           3 :                                 ret = 0;
    3878             :                         }
    3879           3 :                         goto done;
    3880             :                 }
    3881          20 :                 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
    3882          20 :                 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
    3883           5 :                         last_offset = (u64)-1;
    3884           5 :                         goto done;
    3885             :                 }
    3886          15 :                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
    3887             :                         /*
    3888             :                          * The next leaf was not changed in the current transaction
    3889             :                          * and has at least one dir index key.
    3890             :                          * We check for the next key because there might have been
    3891             :                          * one or more deletions between the last key we logged and
    3892             :                          * that next key. So the key range item we log (key type
    3893             :                          * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
    3894             :                          * offset minus 1, so that those deletes are replayed.
    3895             :                          */
    3896           2 :                         last_offset = min_key.offset - 1;
    3897           2 :                         goto done;
    3898             :                 }
    3899          13 :                 if (need_resched()) {
    3900           0 :                         btrfs_release_path(path);
    3901           0 :                         cond_resched();
    3902           0 :                         goto search;
    3903             :                 }
    3904             :         }
    3905         801 : done:
    3906         801 :         btrfs_release_path(path);
    3907         801 :         btrfs_release_path(dst_path);
    3908             : 
    3909         801 :         if (ret == 0) {
    3910         801 :                 *last_offset_ret = last_offset;
    3911             :                 /*
    3912             :                  * In case the leaf was changed in the current transaction but
    3913             :                  * all its dir items are from a past transaction, the last item
    3914             :                  * in the leaf is a dir item and there's no gap between that last
    3915             :                  * dir item and the first one on the next leaf (which did not
    3916             :                  * change in the current transaction), then we don't need to log
    3917             :                  * a range, because last_old_dentry_offset is equal to last_offset.
    3918             :                  */
    3919         801 :                 ASSERT(last_old_dentry_offset <= last_offset);
    3920         801 :                 if (last_old_dentry_offset < last_offset)
    3921         799 :                         ret = insert_dir_log_key(trans, log, path, ino,
    3922             :                                                  last_old_dentry_offset + 1,
    3923             :                                                  last_offset);
    3924             :         }
    3925             : 
    3926             :         return ret;
    3927             : }
    3928             : 
    3929             : /*
    3930             :  * If the inode was logged before and it was evicted, then its
    3931             :  * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
    3932             :  * key offset. If that's the case, search for it and update the inode. This
    3933             :  * is to avoid lookups in the log tree every time we try to insert a dir index
    3934             :  * key from a leaf changed in the current transaction, and to allow us to always
    3935             :  * do batch insertions of dir index keys.
    3936             :  */
    3937         799 : static int update_last_dir_index_offset(struct btrfs_inode *inode,
    3938             :                                         struct btrfs_path *path,
    3939             :                                         const struct btrfs_log_ctx *ctx)
    3940             : {
    3941         799 :         const u64 ino = btrfs_ino(inode);
    3942         799 :         struct btrfs_key key;
    3943         799 :         int ret;
    3944             : 
    3945         799 :         lockdep_assert_held(&inode->log_mutex);
    3946             : 
    3947         799 :         if (inode->last_dir_index_offset != (u64)-1)
    3948             :                 return 0;
    3949             : 
    3950           0 :         if (!ctx->logged_before) {
    3951           0 :                 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
    3952           0 :                 return 0;
    3953             :         }
    3954             : 
    3955           0 :         key.objectid = ino;
    3956           0 :         key.type = BTRFS_DIR_INDEX_KEY;
    3957           0 :         key.offset = (u64)-1;
    3958             : 
    3959           0 :         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
    3960             :         /*
    3961             :          * An error happened or we actually have an index key with an offset
    3962             :          * value of (u64)-1. Bail out, we're done.
    3963             :          */
    3964           0 :         if (ret <= 0)
    3965           0 :                 goto out;
    3966             : 
    3967           0 :         ret = 0;
    3968           0 :         inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
    3969             : 
    3970             :         /*
    3971             :          * No dir index items, bail out and leave last_dir_index_offset with
    3972             :          * the value right before the first valid index value.
    3973             :          */
    3974           0 :         if (path->slots[0] == 0)
    3975           0 :                 goto out;
    3976             : 
    3977             :         /*
    3978             :          * btrfs_search_slot() left us at one slot beyond the slot with the last
    3979             :          * index key, or beyond the last key of the directory that is not an
    3980             :          * index key. If we have an index key before, set last_dir_index_offset
    3981             :          * to its offset value, otherwise leave it with a value right before the
    3982             :          * first valid index value, as it means we have an empty directory.
    3983             :          */
    3984           0 :         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
    3985           0 :         if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
    3986           0 :                 inode->last_dir_index_offset = key.offset;
    3987             : 
    3988           0 : out:
    3989           0 :         btrfs_release_path(path);
    3990             : 
    3991           0 :         return ret;
    3992             : }
    3993             : 
    3994             : /*
    3995             :  * logging directories is very similar to logging inodes. We find all the items
    3996             :  * from the current transaction and write them to the log.
    3997             :  *
    3998             :  * The recovery code scans the directory in the subvolume, and if it finds a
    3999             :  * key in the range logged that is not present in the log tree, then it means
    4000             :  * that dir entry was unlinked during the transaction.
    4001             :  *
    4002             :  * In order for that scan to work, we must include one key smaller than
    4003             :  * the smallest logged by this transaction and one key larger than the largest
    4004             :  * key logged by this transaction.
    4005             :  */
    4006         799 : static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
    4007             :                           struct btrfs_inode *inode,
    4008             :                           struct btrfs_path *path,
    4009             :                           struct btrfs_path *dst_path,
    4010             :                           struct btrfs_log_ctx *ctx)
    4011             : {
    4012         799 :         u64 min_key;
    4013         799 :         u64 max_key;
    4014         799 :         int ret;
    4015             : 
    4016         799 :         ret = update_last_dir_index_offset(inode, path, ctx);
    4017         799 :         if (ret)
    4018             :                 return ret;
    4019             : 
    4020         799 :         min_key = BTRFS_DIR_START_INDEX;
    4021         799 :         max_key = 0;
    4022             : 
    4023         803 :         while (1) {
    4024         801 :                 ret = log_dir_items(trans, inode, path, dst_path,
    4025             :                                 ctx, min_key, &max_key);
    4026         801 :                 if (ret)
    4027           0 :                         return ret;
    4028         801 :                 if (max_key == (u64)-1)
    4029             :                         break;
    4030           2 :                 min_key = max_key + 1;
    4031             :         }
    4032             : 
    4033             :         return 0;
    4034             : }
    4035             : 
    4036             : /*
    4037             :  * a helper function to drop items from the log before we relog an
    4038             :  * inode.  max_key_type indicates the highest item type to remove.
    4039             :  * This cannot be run for file data extents because it does not
    4040             :  * free the extents they point to.
    4041             :  */
    4042         853 : static int drop_inode_items(struct btrfs_trans_handle *trans,
    4043             :                                   struct btrfs_root *log,
    4044             :                                   struct btrfs_path *path,
    4045             :                                   struct btrfs_inode *inode,
    4046             :                                   int max_key_type)
    4047             : {
    4048         853 :         int ret;
    4049         853 :         struct btrfs_key key;
    4050         853 :         struct btrfs_key found_key;
    4051         853 :         int start_slot;
    4052             : 
    4053         853 :         key.objectid = btrfs_ino(inode);
    4054         853 :         key.type = max_key_type;
    4055         853 :         key.offset = (u64)-1;
    4056             : 
    4057        1267 :         while (1) {
    4058        1060 :                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
    4059        1060 :                 if (ret < 0) {
    4060             :                         break;
    4061        1060 :                 } else if (ret > 0) {
    4062        1060 :                         if (path->slots[0] == 0)
    4063             :                                 break;
    4064         854 :                         path->slots[0]--;
    4065             :                 }
    4066             : 
    4067         854 :                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
    4068             :                                       path->slots[0]);
    4069             : 
    4070         854 :                 if (found_key.objectid != key.objectid)
    4071             :                         break;
    4072             : 
    4073         854 :                 found_key.offset = 0;
    4074         854 :                 found_key.type = 0;
    4075         854 :                 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
    4076         854 :                 if (ret < 0)
    4077             :                         break;
    4078             : 
    4079         854 :                 ret = btrfs_del_items(trans, log, path, start_slot,
    4080         854 :                                       path->slots[0] - start_slot + 1);
    4081             :                 /*
    4082             :                  * If start slot isn't 0 then we don't need to re-search, we've
    4083             :                  * found the last guy with the objectid in this tree.
    4084             :                  */
    4085         854 :                 if (ret || start_slot != 0)
    4086             :                         break;
    4087         207 :                 btrfs_release_path(path);
    4088             :         }
    4089         853 :         btrfs_release_path(path);
    4090         853 :         if (ret > 0)
    4091             :                 ret = 0;
    4092         853 :         return ret;
    4093             : }
    4094             : 
    4095      125792 : static int truncate_inode_items(struct btrfs_trans_handle *trans,
    4096             :                                 struct btrfs_root *log_root,
    4097             :                                 struct btrfs_inode *inode,
    4098             :                                 u64 new_size, u32 min_type)
    4099             : {
    4100      125792 :         struct btrfs_truncate_control control = {
    4101             :                 .new_size = new_size,
    4102             :                 .ino = btrfs_ino(inode),
    4103             :                 .min_type = min_type,
    4104             :                 .skip_ref_updates = true,
    4105             :         };
    4106             : 
    4107      125792 :         return btrfs_truncate_inode_items(trans, log_root, &control);
    4108             : }
    4109             : 
    4110      255121 : static void fill_inode_item(struct btrfs_trans_handle *trans,
    4111             :                             struct extent_buffer *leaf,
    4112             :                             struct btrfs_inode_item *item,
    4113             :                             struct inode *inode, int log_inode_only,
    4114             :                             u64 logged_isize)
    4115             : {
    4116      255121 :         struct btrfs_map_token token;
    4117      255121 :         u64 flags;
    4118             : 
    4119      255121 :         btrfs_init_map_token(&token, leaf);
    4120             : 
    4121      255108 :         if (log_inode_only) {
    4122             :                 /* set the generation to zero so the recover code
    4123             :                  * can tell the difference between a logging
    4124             :                  * just to say 'this inode exists' and a logging
    4125             :                  * to say 'update this inode with these values'
    4126             :                  */
    4127        2465 :                 btrfs_set_token_inode_generation(&token, item, 0);
    4128        2465 :                 btrfs_set_token_inode_size(&token, item, logged_isize);
    4129             :         } else {
    4130      252643 :                 btrfs_set_token_inode_generation(&token, item,
    4131             :                                                  BTRFS_I(inode)->generation);
    4132      252629 :                 btrfs_set_token_inode_size(&token, item, inode->i_size);
    4133             :         }
    4134             : 
    4135      255092 :         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
    4136      255088 :         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
    4137      255101 :         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
    4138      255098 :         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
    4139             : 
    4140      255114 :         btrfs_set_token_timespec_sec(&token, &item->atime,
    4141      255114 :                                      inode->i_atime.tv_sec);
    4142      255111 :         btrfs_set_token_timespec_nsec(&token, &item->atime,
    4143      255111 :                                       inode->i_atime.tv_nsec);
    4144             : 
    4145      255117 :         btrfs_set_token_timespec_sec(&token, &item->mtime,
    4146      255117 :                                      inode->i_mtime.tv_sec);
    4147      255113 :         btrfs_set_token_timespec_nsec(&token, &item->mtime,
    4148      255113 :                                       inode->i_mtime.tv_nsec);
    4149             : 
    4150      255121 :         btrfs_set_token_timespec_sec(&token, &item->ctime,
    4151      255121 :                                      inode->i_ctime.tv_sec);
    4152      255115 :         btrfs_set_token_timespec_nsec(&token, &item->ctime,
    4153      255115 :                                       inode->i_ctime.tv_nsec);
    4154             : 
    4155             :         /*
    4156             :          * We do not need to set the nbytes field, in fact during a fast fsync
    4157             :          * its value may not even be correct, since a fast fsync does not wait
    4158             :          * for ordered extent completion, which is where we update nbytes, it
    4159             :          * only waits for writeback to complete. During log replay as we find
    4160             :          * file extent items and replay them, we adjust the nbytes field of the
    4161             :          * inode item in subvolume tree as needed (see overwrite_item()).
    4162             :          */
    4163             : 
    4164      255117 :         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
    4165      255115 :         btrfs_set_token_inode_transid(&token, item, trans->transid);
    4166      255117 :         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
    4167      255113 :         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
    4168             :                                           BTRFS_I(inode)->ro_flags);
    4169      255113 :         btrfs_set_token_inode_flags(&token, item, flags);
    4170      255112 :         btrfs_set_token_inode_block_group(&token, item, 0);
    4171      255110 : }
    4172             : /*
                     :  * Write the current in-memory fields of @inode into its inode item in the
                     :  * log tree, using fill_inode_item().
                     :  *
                     :  * @trans:              transaction handle; trans->transid is compared with
                     :  *                      inode->logged_trans to choose between a plain
                     :  *                      search and an insertion
                     :  * @log:                root of the log tree that holds the inode item
                     :  * @path:               path used to locate the item; it is released before
                     :  *                      the successful return, but not on the error return
                     :  * @inode:              inode to log; the item key is inode->location
                     :  * @inode_item_dropped: if true, insert a new item even when the inode was
                     :  *                      already logged in the current transaction
                     :  *
                     :  * Returns 0 on success. Otherwise returns the non-zero result of the
                     :  * search or insertion, with a positive search result mapped to -ENOENT.
                     :  */
    4173      135961 : static int log_inode_item(struct btrfs_trans_handle *trans,
    4174             :                           struct btrfs_root *log, struct btrfs_path *path,
    4175             :                           struct btrfs_inode *inode, bool inode_item_dropped)
    4176             : {
    4177      135961 :         struct btrfs_inode_item *inode_item;
    4178      135961 :         int ret;
    4179             : 
    4180             :         /*
    4181             :          * If we are doing a fast fsync and the inode was logged before in the
    4182             :          * current transaction, then we know the inode was previously logged and
    4183             :          * it exists in the log tree. For performance reasons, in this case use
    4184             :          * btrfs_search_slot() directly with ins_len set to 0 so that we never
    4185             :          * attempt a write lock on the leaf's parent, which adds unnecessary lock
    4186             :          * contention in case there are concurrent fsyncs for other inodes of the
    4187             :          * same subvolume. Using btrfs_insert_empty_item() when the inode item
    4188             :          * already exists can also result in unnecessarily splitting a leaf.
    4189             :          */
    4190      135961 :         if (!inode_item_dropped && inode->logged_trans == trans->transid) {
    4191      134032 :                 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
    4192      134046 :                 ASSERT(ret <= 0);
    4193      134046 :                 if (ret > 0) /* unexpected (see the ASSERT above); report it as an error */
    4194             :                         ret = -ENOENT;
    4195             :         } else {
    4196             :                 /*
    4197             :                  * This means it is the first fsync in the current transaction,
    4198             :                  * so the inode item is not in the log and we need to insert it.
    4199             :                  * We can never get -EEXIST because we are only called for a fast
    4200             :                  * fsync and in case an inode eviction happens after the inode was
    4201             :                  * logged before in the current transaction, when we load again
    4202             :                  * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
    4203             :                  * flags and set ->logged_trans to 0.
    4204             :                  */
    4205        1929 :                 ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
    4206             :                                               sizeof(*inode_item));
    4207      135955 :                 ASSERT(ret != -EEXIST);
    4208             :         }
    4209      135955 :         if (ret)
    4210           0 :                 return ret;
    4211      135955 :         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    4212             :                                     struct btrfs_inode_item);
    4213      135946 :         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
    4214             :                         0, 0); /* copy_items() passes (inode_only == LOG_INODE_EXISTS, logged_isize) here */
    4215      135952 :         btrfs_release_path(path);
    4216      135952 :         return 0;
    4217             : }
    4218             : /*
                     :  * Insert the checksums described by @sums into the log tree @log_root.
                     :  *
                     :  * @trans:    transaction handle
                     :  * @inode:    inode being logged; only inode->last_reflink_trans is read
                     :  * @log_root: log tree that receives the checksum items; its log_csum_range
                     :  *            member is locked over the range of @sums on the slow path
                     :  * @sums:     checksums for the byte range starting at sums->logical and
                     :  *            spanning sums->len bytes; not freed by this function
                     :  *
                     :  * When inode->last_reflink_trans is older than the current transaction the
                     :  * checksums are inserted directly. Otherwise the range is locked, any csum
                     :  * items already in the log for the range are deleted, the new ones are
                     :  * inserted and the range is unlocked again.
                     :  *
                     :  * Returns 0 on success, or the first non-zero result of lock_extent(),
                     :  * btrfs_del_csums() or btrfs_csum_file_blocks().
                     :  */
    4219     1667518 : static int log_csums(struct btrfs_trans_handle *trans,
    4220             :                      struct btrfs_inode *inode,
    4221             :                      struct btrfs_root *log_root,
    4222             :                      struct btrfs_ordered_sum *sums)
    4223             : {
    4224     1667518 :         const u64 lock_end = sums->logical + sums->len - 1;
    4225     1667518 :         struct extent_state *cached_state = NULL;
    4226     1667518 :         int ret;
    4227             : 
    4228             :         /*
    4229             :          * If this inode was not used for reflink operations in the current
    4230             :          * transaction with new extents, then do the fast path, no need to
    4231             :          * worry about logging checksum items with overlapping ranges.
    4232             :          */
    4233     1667518 :         if (inode->last_reflink_trans < trans->transid)
    4234      537876 :                 return btrfs_csum_file_blocks(trans, log_root, sums);
    4235             : 
    4236             :         /*
    4237             :          * Serialize logging for checksums. This is to avoid racing with the
    4238             :          * same checksum being logged by another task that is logging another
    4239             :          * file which happens to refer to the same extent as well. Such races
    4240             :          * can leave checksum items in the log with overlapping ranges.
    4241             :          */
    4242     1129642 :         ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
    4243             :                           &cached_state);
    4244     1129642 :         if (ret)
    4245             :                 return ret;
    4246             :         /*
    4247             :          * Due to extent cloning, we might have logged a csum item that covers a
    4248             :          * subrange of a cloned extent, and later we can end up logging a csum
    4249             :          * item for a larger subrange of the same extent or the entire range.
    4250             :          * This would leave csum items in the log tree that cover the same range
    4251             :          * and break the searches for checksums in the log tree, resulting in
    4252             :          * some checksums missing in the fs/subvolume tree. So just delete (or
    4253             :          * trim and adjust) any existing csum items in the log for this range.
    4254             :          */
    4255     1129642 :         ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
    4256     1129642 :         if (!ret)
    4257     1129642 :                 ret = btrfs_csum_file_blocks(trans, log_root, sums);
    4258             :         /* Unlock whether or not the delete/insert above succeeded. */
    4259     1129642 :         unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
    4260             :                       &cached_state);
    4261             : 
    4262     1129642 :         return ret;
    4263             : }
    4264             : /*
                     :  * Copy up to @nr items, starting at slot @start_slot of the leaf found in
                     :  * @src_path->nodes[0], into the log tree of @inode's root, and log the
                     :  * checksums of the regular file extent items that get copied.
                     :  *
                     :  * @trans:        transaction handle
                     :  * @inode:        inode being logged
                     :  * @dst_path:     path used for the batched insertion into the log tree; it
                     :  *                is released once the items have been copied
                     :  * @src_path:     path whose nodes[0] holds the source items. It is released
                     :  *                here and nodes[0] is replaced by a private clone of the
                     :  *                leaf, with slots[0] preserved
                     :  * @start_slot:   first source slot to consider
                     :  * @nr:           number of consecutive source slots to consider
                     :  * @inode_only:   compared against LOG_INODE_EXISTS and the result is
                     :  *                passed to fill_inode_item()
                     :  * @logged_isize: passed through to fill_inode_item()
                     :  *
                     :  * The work is done in two passes over the same source slots. The first
                     :  * pass collects the keys and sizes of the items to log (skipping old file
                     :  * extent items) and logs checksums; the second pass copies the item data
                     :  * into the slots reserved by btrfs_insert_empty_items(). Both passes must
                     :  * skip exactly the same items. Inode items are not copied verbatim, they
                     :  * are written by fill_inode_item().
                     :  *
                     :  * Returns 0 on success, -ENOMEM if cloning the leaf or allocating the batch
                     :  * arrays fails, or the error returned by the checksum lookup, the checksum
                     :  * logging or the batched insertion.
                     :  */
    4265      146695 : static noinline int copy_items(struct btrfs_trans_handle *trans,
    4266             :                                struct btrfs_inode *inode,
    4267             :                                struct btrfs_path *dst_path,
    4268             :                                struct btrfs_path *src_path,
    4269             :                                int start_slot, int nr, int inode_only,
    4270             :                                u64 logged_isize)
    4271             : {
    4272      146695 :         struct btrfs_root *log = inode->root->log_root;
    4273      146695 :         struct btrfs_file_extent_item *extent;
    4274      146695 :         struct extent_buffer *src;
    4275      146695 :         int ret = 0;
    4276      146695 :         struct btrfs_key *ins_keys;
    4277      146695 :         u32 *ins_sizes;
    4278      146695 :         struct btrfs_item_batch batch;
    4279      146695 :         char *ins_data;
    4280      146695 :         int i;
    4281      146695 :         int dst_index;
    4282      146695 :         const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
    4283      146695 :         const u64 i_size = i_size_read(&inode->vfs_inode);
    4284             : 
    4285             :         /*
    4286             :          * To keep lockdep happy and avoid deadlocks, clone the source leaf and
    4287             :          * use the clone. This is because otherwise we would be changing the log
    4288             :          * tree, to insert items from the subvolume tree or insert csum items,
    4289             :          * while holding a read lock on a leaf from the subvolume tree, which
    4290             :          * creates a nasty lock dependency when COWing log tree nodes/leaves:
    4291             :          *
    4292             :          * 1) Modifying the log tree triggers an extent buffer allocation while
    4293             :          *    holding a write lock on a parent extent buffer from the log tree.
    4294             :          *    Allocating the pages for an extent buffer, or the extent buffer
    4295             :          *    struct, can trigger inode eviction and finally the inode eviction
    4296             :          *    will trigger a release/remove of a delayed node, which requires
    4297             :          *    taking the delayed node's mutex;
    4298             :          *
    4299             :          * 2) Allocating a metadata extent for a log tree can trigger the async
    4300             :          *    reclaim thread and make us wait for it to release enough space and
    4301             :          *    unblock our reservation ticket. The reclaim thread can start
    4302             :          *    flushing delayed items, and that in turn results in the need to
    4303             :          *    lock delayed node mutexes and in the need to write lock extent
    4304             :          *    buffers of a subvolume tree - all this while holding a write lock
    4305             :          *    on the parent extent buffer in the log tree.
    4306             :          *
    4307             :          * So one task in scenario 1) running in parallel with another task in
    4308             :          * scenario 2) could lead to a deadlock, one wanting to lock a delayed
    4309             :          * node mutex while having a read lock on a leaf from the subvolume,
    4310             :          * while the other is holding the delayed node's mutex and wants to
    4311             :          * write lock the same subvolume leaf for flushing delayed items.
    4312             :          */
    4313      146695 :         src = btrfs_clone_extent_buffer(src_path->nodes[0]);
    4314      146696 :         if (!src)
    4315             :                 return -ENOMEM;
    4316             : 
    4317      146696 :         i = src_path->slots[0];
    4318      146696 :         btrfs_release_path(src_path);
    4319      146696 :         src_path->nodes[0] = src;
    4320      146696 :         src_path->slots[0] = i;
    4321             :         /* One buffer: nr u32 item sizes first, followed by nr keys. */
    4322      146696 :         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
    4323             :                            nr * sizeof(u32), GFP_NOFS);
    4324      146694 :         if (!ins_data)
    4325             :                 return -ENOMEM; /* NOTE(review): the clone stays attached to src_path; presumably the caller releases it - confirm */
    4326             : 
    4327      146694 :         ins_sizes = (u32 *)ins_data;
    4328      146694 :         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
    4329      146694 :         batch.keys = ins_keys;
    4330      146694 :         batch.data_sizes = ins_sizes;
    4331      146694 :         batch.total_data_size = 0;
    4332      146694 :         batch.nr = 0;
    4333             : 
    4334      146694 :         dst_index = 0;
    4335     2797496 :         for (i = 0; i < nr; i++) {
    4336     2650800 :                 const int src_slot = start_slot + i;
    4337     2650800 :                 struct btrfs_root *csum_root;
    4338     2650800 :                 struct btrfs_ordered_sum *sums;
    4339     2650800 :                 struct btrfs_ordered_sum *sums_next;
    4340     2650800 :                 LIST_HEAD(ordered_sums);
    4341     2650800 :                 u64 disk_bytenr;
    4342     2650800 :                 u64 disk_num_bytes;
    4343     2650800 :                 u64 extent_offset;
    4344     2650800 :                 u64 extent_num_bytes;
    4345     2650800 :                 bool is_old_extent;
    4346             : 
    4347     2650800 :                 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
    4348             : 
    4349     2650801 :                 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
    4350      244059 :                         goto add_to_batch;
    4351             : 
    4352     2406742 :                 extent = btrfs_item_ptr(src, src_slot,
    4353             :                                         struct btrfs_file_extent_item);
    4354             : 
    4355     2406743 :                 is_old_extent = (btrfs_file_extent_generation(src, extent) <
    4356     2406741 :                                  trans->transid);
    4357             : 
    4358             :                 /*
    4359             :                  * Don't copy extents from past generations. That would make us
    4360             :                  * log a lot more metadata for common cases like doing only a
    4361             :                  * few random writes into a file and then fsync it for the first
    4362             :                  * time or after the full sync flag is set on the inode. We can
    4363             :                  * get leaves full of extent items, most of which are from past
    4364             :                  * generations, so we can skip them - as long as the inode has
    4365             :                  * not been the target of a reflink operation in this transaction,
    4366             :                  * as in that case it might have had file extent items with old
    4367             :                  * generations copied into it. We also must always log prealloc
    4368             :                  * extents that start at or beyond eof, otherwise we would lose
    4369             :                  * them on log replay.
                     :                  *
                     :                  * A skipped item does not advance dst_index, so the key that
                     :                  * was just read into ins_keys[dst_index] is overwritten by
                     :                  * the next source item.
    4370             :                  */
    4371     2406741 :                 if (is_old_extent &&
    4372      612113 :                     ins_keys[dst_index].offset < i_size &&
    4373      611920 :                     inode->last_reflink_trans < trans->transid)
    4374      603129 :                         continue;
    4375             : 
    4376     1803612 :                 if (skip_csum)
    4377          60 :                         goto add_to_batch;
    4378             : 
    4379             :                 /* Only regular extents have checksums. */
    4380     1803552 :                 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
    4381      400752 :                         goto add_to_batch;
    4382             : 
    4383             :                 /*
    4384             :                  * If it's an extent created in a past transaction, then its
    4385             :                  * checksums are already accessible from the committed csum tree,
    4386             :                  * no need to log them.
    4387             :                  */
    4388     1402800 :                 if (is_old_extent)
    4389        5801 :                         goto add_to_batch;
    4390             : 
    4391     1396999 :                 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
    4392             :                 /* If it's an explicit hole, there are no checksums. */
    4393     1396998 :                 if (disk_bytenr == 0)
    4394           2 :                         goto add_to_batch;
    4395             : 
    4396     1396996 :                 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
    4397             : 
    4398     1396994 :                 if (btrfs_file_extent_compression(src, extent)) { /* compressed: look up csums for the whole on-disk extent */
    4399             :                         extent_offset = 0;
    4400             :                         extent_num_bytes = disk_num_bytes;
    4401             :                 } else {
    4402     1396831 :                         extent_offset = btrfs_file_extent_offset(src, extent);
    4403     1396830 :                         extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
    4404             :                 }
    4405             : 
    4406     1396998 :                 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
    4407     1396999 :                 disk_bytenr += extent_offset;
    4408     1396999 :                 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
    4409     1396999 :                                               disk_bytenr + extent_num_bytes - 1,
    4410             :                                               &ordered_sums, 0, false);
    4411     1396999 :                 if (ret)
    4412           0 :                         goto out;
    4413             : 
    4414     2796851 :                 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
    4415     1399852 :                         if (!ret) /* after a failure, keep looping only to free the list */
    4416     1399852 :                                 ret = log_csums(trans, inode, log, sums);
    4417     1399853 :                         list_del(&sums->list);
    4418     1399853 :                         kfree(sums);
    4419             :                 }
    4420     1396999 :                 if (ret)
    4421           0 :                         goto out;
    4422             : 
    4423     1396999 : add_to_batch:
    4424     2047673 :                 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
    4425     2047673 :                 batch.total_data_size += ins_sizes[dst_index];
    4426     2047673 :                 batch.nr++;
    4427     2047673 :                 dst_index++;
    4428             :         }
    4429             : 
    4430             :         /*
    4431             :          * We have a leaf full of old extent items that don't need to be logged,
    4432             :          * so we don't need to do anything.
    4433             :          */
    4434      146696 :         if (batch.nr == 0)
    4435        3171 :                 goto out;
    4436             : 
    4437      143525 :         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
    4438      143525 :         if (ret)
    4439           0 :                 goto out;
    4440             : 
    4441             :         dst_index = 0; /* second pass: fill the item slots reserved above */
    4442     2203569 :         for (i = 0; i < nr; i++) {
    4443     2061359 :                 const int src_slot = start_slot + i;
    4444     2061359 :                 const int dst_slot = dst_path->slots[0] + dst_index;
    4445     2061359 :                 struct btrfs_key key;
    4446     2061359 :                 unsigned long src_offset;
    4447     2061359 :                 unsigned long dst_offset;
    4448             : 
    4449             :                 /*
    4450             :                  * We're done, all the remaining items in the source leaf
    4451             :                  * correspond to old file extent items.
    4452             :                  */
    4453     2061359 :                 if (dst_index >= batch.nr)
    4454             :                         break;
    4455             : 
    4456     2060044 :                 btrfs_item_key_to_cpu(src, &key, src_slot);
    4457             : 
    4458     2060045 :                 if (key.type != BTRFS_EXTENT_DATA_KEY)
    4459      244062 :                         goto copy_item;
    4460             : 
    4461     1815983 :                 extent = btrfs_item_ptr(src, src_slot,
    4462             :                                         struct btrfs_file_extent_item);
    4463             : 
    4464             :                 /* See the comment in the previous loop, same logic. */
    4465     1815983 :                 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
    4466       21354 :                     key.offset < i_size &&
    4467       21161 :                     inode->last_reflink_trans < trans->transid)
    4468       12370 :                         continue;
    4469             : 
    4470     1803614 : copy_item:
    4471     2047676 :                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
    4472     2047675 :                 src_offset = btrfs_item_ptr_offset(src, src_slot);
    4473             : 
    4474     2047674 :                 if (key.type == BTRFS_INODE_ITEM_KEY) { /* written by fill_inode_item() instead of being copied from the leaf */
    4475      119167 :                         struct btrfs_inode_item *inode_item;
    4476             : 
    4477      119167 :                         inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
    4478             :                                                     struct btrfs_inode_item);
    4479      119166 :                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
    4480             :                                         &inode->vfs_inode,
    4481             :                                         inode_only == LOG_INODE_EXISTS,
    4482             :                                         logged_isize);
    4483             :                 } else {
    4484     1928507 :                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
    4485     1928507 :                                            src_offset, ins_sizes[dst_index]);
    4486             :                 }
    4487             : 
    4488     2047674 :                 dst_index++;
    4489             :         }
    4490             : 
    4491      143525 :         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
    4492      143525 :         btrfs_release_path(dst_path);
    4493      146696 : out:
    4494      146696 :         kfree(ins_data);
    4495             : 
    4496      146696 :         return ret;
    4497             : }
    4498             : /*
                     :  * Compare two extent maps by their ->start, in ascending order.
                     :  *
                     :  * @priv: not used
                     :  * @a:    list node embedded in a struct extent_map as its ->list member
                     :  * @b:    list node embedded in a struct extent_map as its ->list member
                     :  *
                     :  * Returns -1 if @a starts before @b, 1 if it starts after @b, 0 if both
                     :  * start at the same offset.
                     :  *
                     :  * NOTE(review): the signature matches a list_sort() comparison callback;
                     :  * the caller is not visible here - confirm.
                     :  */
    4499      461814 : static int extent_cmp(void *priv, const struct list_head *a,
    4500             :                       const struct list_head *b)
    4501             : {
    4502      461814 :         const struct extent_map *em1, *em2;
    4503             : 
    4504      461814 :         em1 = list_entry(a, struct extent_map, list);
    4505      461814 :         em2 = list_entry(b, struct extent_map, list);
    4506             : 
    4507      461814 :         if (em1->start < em2->start)
    4508             :                 return -1;
    4509      304727 :         else if (em1->start > em2->start)
    4510      304735 :                 return 1;
    4511             :         return 0;
    4512             : }
    4513             : /*
                     :  * Log the checksums for the modified range of the extent map @em.
                     :  *
                     :  * @trans:    transaction handle
                     :  * @inode:    inode being logged
                     :  * @log_root: log tree that receives the checksum items
                     :  * @em:       extent map; the range of interest starts at em->mod_start and
                     :  *            is em->mod_len bytes long
                     :  * @ctx:      log context; ctx->ordered_extents is walked for ordered
                     :  *            extents that overlap the modified range
                     :  *
                     :  * Nothing is logged for inodes with BTRFS_INODE_NODATASUM set, for
                     :  * prealloc extents or for holes. Checksums attached to overlapping ordered
                     :  * extents are logged first, each ordered extent at most once (guarded by
                     :  * BTRFS_ORDERED_LOGGED_CSUM). Whatever part of the modified range is left
                     :  * is then looked up in the csum root and logged from there.
                     :  *
                     :  * Returns 0 on success, or the first error returned by log_csums() or
                     :  * btrfs_lookup_csums_list().
                     :  */
    4514      325169 : static int log_extent_csums(struct btrfs_trans_handle *trans,
    4515             :                             struct btrfs_inode *inode,
    4516             :                             struct btrfs_root *log_root,
    4517             :                             const struct extent_map *em,
    4518             :                             struct btrfs_log_ctx *ctx)
    4519             : {
    4520      325169 :         struct btrfs_ordered_extent *ordered;
    4521      325169 :         struct btrfs_root *csum_root;
    4522      325169 :         u64 csum_offset;
    4523      325169 :         u64 csum_len;
    4524      325169 :         u64 mod_start = em->mod_start;
    4525      325169 :         u64 mod_len = em->mod_len;
    4526      325169 :         LIST_HEAD(ordered_sums);
    4527      325169 :         int ret = 0;
    4528             : 
    4529      650288 :         if (inode->flags & BTRFS_INODE_NODATASUM ||
    4530      325119 :             test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
    4531      299214 :             em->block_start == EXTENT_MAP_HOLE)
    4532             :                 return 0;
    4533             :         /* First log the csums attached to ordered extents overlapping the modified range. */
    4534     1817748 :         list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
    4535     1669550 :                 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
    4536     1669550 :                 const u64 mod_end = mod_start + mod_len;
    4537     1669550 :                 struct btrfs_ordered_sum *sums;
    4538             : 
    4539     1669550 :                 if (mod_len == 0)
    4540             :                         break;
    4541             : 
    4542     1587563 :                 if (ordered_end <= mod_start)
    4543     1345923 :                         continue;
    4544      241640 :                 if (mod_end <= ordered->file_offset) /* NOTE(review): stopping here presumes the list is ordered by file_offset - confirm */
    4545             :                         break;
    4546             : 
    4547             :                 /*
    4548             :                  * We are going to copy all the csums on this ordered extent, so
    4549             :                  * go ahead and adjust mod_start and mod_len in case this ordered
    4550             :                  * extent has already been logged.
    4551             :                  */
    4552      204666 :                 if (ordered->file_offset > mod_start) {
    4553           0 :                         if (ordered_end >= mod_end)
    4554           0 :                                 mod_len = ordered->file_offset - mod_start;
    4555             :                         /*
    4556             :                          * If we have this case
    4557             :                          *
    4558             :                          * |--------- logged extent ---------|
    4559             :                          *       |----- ordered extent ----|
    4560             :                          *
    4561             :                          * Just don't mess with mod_start and mod_len, we'll
    4562             :                          * just end up logging more csums than we need and it
    4563             :                          * will be ok.
    4564             :                          */
    4565             :                 } else {
    4566      204666 :                         if (ordered_end < mod_end) {
    4567           0 :                                 mod_len = mod_end - ordered_end;
    4568           0 :                                 mod_start = ordered_end;
    4569             :                         } else {
    4570             :                                 mod_len = 0;
    4571             :                         }
    4572             :                 }
    4573             : 
    4574             :                 /*
    4575             :                  * To keep us from looping for the above case of an ordered
    4576             :                  * extent that falls inside of the logged extent.
    4577             :                  */
    4578      204666 :                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
    4579          23 :                         continue;
    4580             : 
    4581      409463 :                 list_for_each_entry(sums, &ordered->list, list) {
    4582      204816 :                         ret = log_csums(trans, inode, log_root, sums);
    4583      204767 :                         if (ret)
    4584           0 :                                 return ret;
    4585             :                 }
    4586             :         }
    4587             : 
    4588             :         /* We're done, found all csums in the ordered extents. */
    4589      267159 :         if (mod_len == 0)
    4590             :                 return 0;
    4591             : 
    4592             :         /* If we're compressed we have to save the entire range of csums. */
    4593       62510 :         if (em->compress_type) {
    4594          75 :                 csum_offset = 0;
    4595          75 :                 csum_len = max(em->block_len, em->orig_block_len);
    4596             :         } else {
    4597       62435 :                 csum_offset = mod_start - em->start;
    4598       62435 :                 csum_len = mod_len;
    4599             :         }
    4600             : 
    4601             :         /* block start is already adjusted for the file extent offset. */
    4602       62510 :         csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
    4603       62511 :         ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
    4604       62511 :                                       em->block_start + csum_offset +
    4605             :                                       csum_len - 1, &ordered_sums, 0, false);
    4606       62511 :         if (ret)
    4607             :                 return ret;
    4608             : 
    4609      125372 :         while (!list_empty(&ordered_sums)) {
    4610       62861 :                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
    4611             :                                                    struct btrfs_ordered_sum,
    4612             :                                                    list);
    4613       62861 :                 if (!ret) /* after a failure, keep looping only to free the list */
    4614       62861 :                         ret = log_csums(trans, inode, log_root, sums);
    4615       62861 :                 list_del(&sums->list);
    4616       62861 :                 kfree(sums);
    4617             :         }
    4618             : 
    4619             :         return ret;
    4620             : }
    4621             : 
    4622      325187 : static int log_one_extent(struct btrfs_trans_handle *trans,
    4623             :                           struct btrfs_inode *inode,
    4624             :                           const struct extent_map *em,
    4625             :                           struct btrfs_path *path,
    4626             :                           struct btrfs_log_ctx *ctx)
    4627             : {
    4628      325187 :         struct btrfs_drop_extents_args drop_args = { 0 };
    4629      325187 :         struct btrfs_root *log = inode->root->log_root;
    4630      325187 :         struct btrfs_file_extent_item fi = { 0 };
    4631      325187 :         struct extent_buffer *leaf;
    4632      325187 :         struct btrfs_key key;
    4633      325187 :         u64 extent_offset = em->start - em->orig_start;
    4634      325187 :         u64 block_len;
    4635      325187 :         int ret;
    4636             : 
    4637      325187 :         btrfs_set_stack_file_extent_generation(&fi, trans->transid);
    4638      650374 :         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
    4639       25905 :                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
    4640             :         else
    4641      299282 :                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
    4642             : 
    4643      325187 :         block_len = max(em->block_len, em->orig_block_len);
    4644      325187 :         if (em->compress_type != BTRFS_COMPRESS_NONE) {
    4645         101 :                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
    4646         101 :                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
    4647      325086 :         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
    4648      293005 :                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
    4649             :                                                         extent_offset);
    4650      293005 :                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
    4651             :         }
    4652             : 
    4653      325187 :         btrfs_set_stack_file_extent_offset(&fi, extent_offset);
    4654      325187 :         btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
    4655      325187 :         btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
    4656      325187 :         btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
    4657             : 
    4658      325187 :         ret = log_extent_csums(trans, inode, log, em, ctx);
    4659      325166 :         if (ret)
    4660             :                 return ret;
    4661             : 
    4662             :         /*
    4663             :          * If this is the first time we are logging the inode in the current
    4664             :          * transaction, we can avoid btrfs_drop_extents(), which is expensive
    4665             :          * because it does a deletion search, which always acquires write locks
    4666             :          * for extent buffers at levels 2, 1 and 0. This not only wastes time
    4667             :          * but also adds significant contention in a log tree, since log trees
    4668             :          * are small, with a root at level 2 or 3 at most, due to their short
    4669             :          * life span.
    4670             :          */
    4671      325166 :         if (ctx->logged_before) {
    4672      319955 :                 drop_args.path = path;
    4673      319955 :                 drop_args.start = em->start;
    4674      319955 :                 drop_args.end = em->start + em->len;
    4675      319955 :                 drop_args.replace_extent = true;
    4676      319955 :                 drop_args.extent_item_size = sizeof(fi);
    4677      319955 :                 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
    4678      319976 :                 if (ret)
    4679             :                         return ret;
    4680             :         }
    4681             : 
    4682      325187 :         if (!drop_args.extent_inserted) {
    4683        7656 :                 key.objectid = btrfs_ino(inode);
    4684        7656 :                 key.type = BTRFS_EXTENT_DATA_KEY;
    4685        7656 :                 key.offset = em->start;
    4686             : 
    4687        7656 :                 ret = btrfs_insert_empty_item(trans, log, path, &key,
    4688             :                                               sizeof(fi));
    4689        7656 :                 if (ret)
    4690             :                         return ret;
    4691             :         }
    4692      325187 :         leaf = path->nodes[0];
    4693      650366 :         write_extent_buffer(leaf, &fi,
    4694      325187 :                             btrfs_item_ptr_offset(leaf, path->slots[0]),
    4695             :                             sizeof(fi));
    4696      325195 :         btrfs_mark_buffer_dirty(leaf);
    4697             : 
    4698      325217 :         btrfs_release_path(path);
    4699             : 
    4700      325217 :         return ret;
    4701             : }
    4702             : 
    4703             : /*
    4704             :  * Log all prealloc extents beyond the inode's i_size to make sure we do not
    4705             :  * lose them after doing a full/fast fsync and replaying the log. We scan the
    4706             :  * subvolume's root instead of iterating the inode's extent map tree because
    4707             :  * otherwise we can log incorrect extent items based on extent map conversion.
    4708             :  * That can happen due to the fact that extent maps are merged when they
    4709             :  * are not in the extent map tree's list of modified extents.
    4710             :  */
    4711      251711 : static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
    4712             :                                       struct btrfs_inode *inode,
    4713             :                                       struct btrfs_path *path)
    4714             : {
    4715      251711 :         struct btrfs_root *root = inode->root;
    4716      251711 :         struct btrfs_key key;
    4717      251711 :         const u64 i_size = i_size_read(&inode->vfs_inode);
    4718      251711 :         const u64 ino = btrfs_ino(inode);
    4719      251711 :         struct btrfs_path *dst_path = NULL;
    4720      251711 :         bool dropped_extents = false;
    4721      251711 :         u64 truncate_offset = i_size;
    4722      251711 :         struct extent_buffer *leaf;
    4723      251711 :         int slot;
    4724      251711 :         int ins_nr = 0;
    4725      251711 :         int start_slot;
    4726      251711 :         int ret;
    4727             : 
    4728      251711 :         if (!(inode->flags & BTRFS_INODE_PREALLOC))
    4729             :                 return 0;
    4730             : 
    4731      201645 :         key.objectid = ino;
    4732      201645 :         key.type = BTRFS_EXTENT_DATA_KEY;
    4733      201645 :         key.offset = i_size;
    4734      201645 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    4735      201645 :         if (ret < 0)
    4736           0 :                 goto out;
    4737             : 
    4738             :         /*
    4739             :          * We must check if there is a prealloc extent that starts before the
    4740             :          * i_size and crosses the i_size boundary. This is to ensure later we
    4741             :          * truncate down to the end of that extent and not to the i_size, as
    4742             :          * otherwise we end up losing part of the prealloc extent after a log
    4743             :          * replay and with an implicit hole if there is another prealloc extent
    4744             :          * that starts at an offset beyond i_size.
    4745             :          */
    4746      201645 :         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
    4747      201645 :         if (ret < 0)
    4748           0 :                 goto out;
    4749             : 
    4750      201645 :         if (ret == 0) {
    4751      201166 :                 struct btrfs_file_extent_item *ei;
    4752             : 
    4753      201166 :                 leaf = path->nodes[0];
    4754      201166 :                 slot = path->slots[0];
    4755      201166 :                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    4756             : 
    4757      201166 :                 if (btrfs_file_extent_type(leaf, ei) ==
    4758             :                     BTRFS_FILE_EXTENT_PREALLOC) {
    4759       33135 :                         u64 extent_end;
    4760             : 
    4761       33135 :                         btrfs_item_key_to_cpu(leaf, &key, slot);
    4762       33135 :                         extent_end = key.offset +
    4763             :                                 btrfs_file_extent_num_bytes(leaf, ei);
    4764             : 
    4765       33135 :                         if (extent_end > i_size)
    4766             :                                 truncate_offset = extent_end;
    4767             :                 }
    4768             :         } else {
    4769             :                 ret = 0;
    4770             :         }
    4771             : 
    4772      421513 :         while (true) {
    4773      421513 :                 leaf = path->nodes[0];
    4774      421513 :                 slot = path->slots[0];
    4775             : 
    4776      421513 :                 if (slot >= btrfs_header_nritems(leaf)) {
    4777      113317 :                         if (ins_nr > 0) {
    4778        8307 :                                 ret = copy_items(trans, inode, dst_path, path,
    4779             :                                                  start_slot, ins_nr, 1, 0);
    4780        8307 :                                 if (ret < 0)
    4781           0 :                                         goto out;
    4782             :                                 ins_nr = 0;
    4783             :                         }
    4784      113317 :                         ret = btrfs_next_leaf(root, path);
    4785      113317 :                         if (ret < 0)
    4786           0 :                                 goto out;
    4787      113317 :                         if (ret > 0) {
    4788             :                                 ret = 0;
    4789             :                                 break;
    4790             :                         }
    4791          37 :                         continue;
    4792             :                 }
    4793             : 
    4794      308196 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    4795      308196 :                 if (key.objectid > ino)
    4796             :                         break;
    4797      219831 :                 if (WARN_ON_ONCE(key.objectid < ino) ||
    4798      219831 :                     key.type < BTRFS_EXTENT_DATA_KEY ||
    4799      219352 :                     key.offset < i_size) {
    4800      201648 :                         path->slots[0]++;
    4801      201648 :                         continue;
    4802             :                 }
    4803       18183 :                 if (!dropped_extents) {
    4804             :                         /*
    4805             :                          * Avoid logging extent items logged in past fsync calls
    4806             :                          * and leading to duplicate keys in the log tree.
    4807             :                          */
    4808       16452 :                         ret = truncate_inode_items(trans, root->log_root, inode,
    4809             :                                                    truncate_offset,
    4810             :                                                    BTRFS_EXTENT_DATA_KEY);
    4811       16452 :                         if (ret)
    4812           0 :                                 goto out;
    4813             :                         dropped_extents = true;
    4814             :                 }
    4815       18183 :                 if (ins_nr == 0)
    4816       16453 :                         start_slot = slot;
    4817       18183 :                 ins_nr++;
    4818       18183 :                 path->slots[0]++;
    4819       18183 :                 if (!dst_path) {
    4820       16452 :                         dst_path = btrfs_alloc_path();
    4821       16452 :                         if (!dst_path) {
    4822           0 :                                 ret = -ENOMEM;
    4823           0 :                                 goto out;
    4824             :                         }
    4825             :                 }
    4826             :         }
    4827      201645 :         if (ins_nr > 0)
    4828        8146 :                 ret = copy_items(trans, inode, dst_path, path,
    4829             :                                  start_slot, ins_nr, 1, 0);
    4830      193499 : out:
    4831      201645 :         btrfs_release_path(path);
    4832      201645 :         btrfs_free_path(dst_path);
    4833      201645 :         return ret;
    4834             : }
    4835             : 
    4836      135509 : static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
    4837             :                                      struct btrfs_inode *inode,
    4838             :                                      struct btrfs_path *path,
    4839             :                                      struct btrfs_log_ctx *ctx)
    4840             : {
    4841      135509 :         struct btrfs_ordered_extent *ordered;
    4842      135509 :         struct btrfs_ordered_extent *tmp;
    4843      135509 :         struct extent_map *em, *n;
    4844      135509 :         struct list_head extents;
    4845      135509 :         struct extent_map_tree *tree = &inode->extent_tree;
    4846      135509 :         int ret = 0;
    4847      135509 :         int num = 0;
    4848             : 
    4849      135509 :         INIT_LIST_HEAD(&extents);
    4850             : 
    4851      135509 :         write_lock(&tree->lock);
    4852             : 
    4853      488849 :         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
    4854      353305 :                 list_del_init(&em->list);
    4855             :                 /*
    4856             :                  * Just an arbitrary number, this can be really CPU intensive
    4857             :                  * once we start getting a lot of extents, and really once we
    4858             :                  * have a bunch of extents we just want to commit since it will
    4859             :                  * be faster.
    4860             :                  */
    4861      353287 :                 if (++num > 32768) {
    4862           0 :                         list_del_init(&tree->modified_extents);
    4863           0 :                         ret = -EFBIG;
    4864           0 :                         goto process;
    4865             :                 }
    4866             : 
    4867      353287 :                 if (em->generation < trans->transid)
    4868       26395 :                         continue;
    4869             : 
    4870             :                 /* We log prealloc extents beyond eof later. */
    4871      653784 :                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
    4872       27655 :                     em->start >= i_size_read(&inode->vfs_inode))
    4873        1750 :                         continue;
    4874             : 
    4875             :                 /* Need a ref to keep it from getting evicted from cache */
    4876      325142 :                 refcount_inc(&em->refs);
    4877      325193 :                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
    4878      325182 :                 list_add_tail(&em->list, &extents);
    4879      325174 :                 num++;
    4880             :         }
    4881             : 
    4882      135544 :         list_sort(NULL, &extents, extent_cmp);
    4883             : process:
    4884      460754 :         while (!list_empty(&extents)) {
    4885      325204 :                 em = list_entry(extents.next, struct extent_map, list);
    4886             : 
    4887      325204 :                 list_del_init(&em->list);
    4888             : 
    4889             :                 /*
    4890             :                  * If we had an error we just need to delete everybody from our
    4891             :                  * private list.
    4892             :                  */
    4893      325175 :                 if (ret) {
    4894           0 :                         clear_em_logging(tree, em);
    4895           0 :                         free_extent_map(em);
    4896           0 :                         continue;
    4897             :                 }
    4898             : 
    4899      325175 :                 write_unlock(&tree->lock);
    4900             : 
    4901      325191 :                 ret = log_one_extent(trans, inode, em, path, ctx);
    4902      325219 :                 write_lock(&tree->lock);
    4903      325220 :                 clear_em_logging(tree, em);
    4904      325205 :                 free_extent_map(em);
    4905             :         }
    4906      135550 :         WARN_ON(!list_empty(&extents));
    4907      135550 :         write_unlock(&tree->lock);
    4908             : 
    4909      135540 :         if (!ret)
    4910      135538 :                 ret = btrfs_log_prealloc_extents(trans, inode, path);
    4911      135519 :         if (ret)
    4912             :                 return ret;
    4913             : 
    4914             :         /*
    4915             :          * We have logged all extents successfully, now make sure the commit of
    4916             :          * the current transaction waits for the ordered extents to complete
    4917             :          * before it commits and wipes out the log trees, otherwise we would
    4918             :          * lose data if an ordered extent completes after the transaction
    4919             :          * commits and a power failure happens after the transaction commit.
    4920             :          */
    4921      340568 :         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
    4922      205025 :                 list_del_init(&ordered->log_list);
    4923      205008 :                 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
    4924             : 
    4925      205047 :                 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
    4926        7413 :                         spin_lock_irq(&inode->ordered_tree.lock);
    4927        7409 :                         if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
    4928        7403 :                                 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
    4929        7406 :                                 atomic_inc(&trans->transaction->pending_ordered);
    4930             :                         }
    4931        7412 :                         spin_unlock_irq(&inode->ordered_tree.lock);
    4932             :                 }
    4933      205047 :                 btrfs_put_ordered_extent(ordered);
    4934             :         }
    4935             : 
    4936             :         return 0;
    4937             : }
    4938             : 
    4939         732 : static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
    4940             :                              struct btrfs_path *path, u64 *size_ret)
    4941             : {
    4942         732 :         struct btrfs_key key;
    4943         732 :         int ret;
    4944             : 
    4945         732 :         key.objectid = btrfs_ino(inode);
    4946         732 :         key.type = BTRFS_INODE_ITEM_KEY;
    4947         732 :         key.offset = 0;
    4948             : 
    4949         732 :         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
    4950         732 :         if (ret < 0) {
    4951             :                 return ret;
    4952         732 :         } else if (ret > 0) {
    4953           0 :                 *size_ret = 0;
    4954             :         } else {
    4955         732 :                 struct btrfs_inode_item *item;
    4956             : 
    4957         732 :                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    4958             :                                       struct btrfs_inode_item);
    4959         732 :                 *size_ret = btrfs_inode_size(path->nodes[0], item);
    4960             :                 /*
    4961             :                  * If the in-memory inode's i_size is smaller than the inode
    4962             :                  * size stored in the btree, return the inode's i_size, so
    4963             :                  * that we get a correct inode size after replaying the log
    4964             :                  * when before a power failure we had a shrinking truncate
    4965             :                  * followed by addition of a new name (rename / new hard link).
    4966             :                  * Otherwise return the inode size from the btree, to avoid
    4967             :                  * data loss when replaying a log due to previously doing a
    4968             :                  * write that expands the inode's size and logging a new name
    4969             :                  * immediately after.
    4970             :                  */
    4971         732 :                 if (*size_ret > inode->vfs_inode.i_size)
    4972          16 :                         *size_ret = inode->vfs_inode.i_size;
    4973             :         }
    4974             : 
    4975         732 :         btrfs_release_path(path);
    4976         732 :         return 0;
    4977             : }
    4978             : 
    4979             : /*
    4980             :  * At the moment we always log all xattrs. This is to figure out at log replay
    4981             :  * time which xattrs must have their deletion replayed. If a xattr is missing
    4982             :  * in the log tree and exists in the fs/subvol tree, we delete it. This is
    4983             :  * because if a xattr is deleted, the inode is fsynced and a power failure
    4984             :  * happens, causing the log to be replayed the next time the fs is mounted,
    4985             :  * we want the xattr to not exist anymore (same behaviour as other filesystems
    4986             :  * with a journal, ext3/4, xfs, f2fs, etc).
    4987             :  */
    4988      121086 : static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
    4989             :                                 struct btrfs_inode *inode,
    4990             :                                 struct btrfs_path *path,
    4991             :                                 struct btrfs_path *dst_path)
    4992             : {
    4993      121086 :         struct btrfs_root *root = inode->root;
    4994      121086 :         int ret;
    4995      121086 :         struct btrfs_key key;
    4996      121086 :         const u64 ino = btrfs_ino(inode);
    4997      121086 :         int ins_nr = 0;
    4998      121086 :         int start_slot = 0;
    4999      121086 :         bool found_xattrs = false;
    5000             : 
    5001      242172 :         if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
    5002             :                 return 0;
    5003             : 
    5004        9715 :         key.objectid = ino;
    5005        9715 :         key.type = BTRFS_XATTR_ITEM_KEY;
    5006        9715 :         key.offset = 0;
    5007             : 
    5008        9715 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    5009        9716 :         if (ret < 0)
    5010             :                 return ret;
    5011             : 
    5012       13697 :         while (true) {
    5013       13697 :                 int slot = path->slots[0];
    5014       13697 :                 struct extent_buffer *leaf = path->nodes[0];
    5015       13697 :                 int nritems = btrfs_header_nritems(leaf);
    5016             : 
    5017       13697 :                 if (slot >= nritems) {
    5018         629 :                         if (ins_nr > 0) {
    5019          23 :                                 ret = copy_items(trans, inode, dst_path, path,
    5020             :                                                  start_slot, ins_nr, 1, 0);
    5021          23 :                                 if (ret < 0)
    5022           0 :                                         return ret;
    5023             :                                 ins_nr = 0;
    5024             :                         }
    5025         629 :                         ret = btrfs_next_leaf(root, path);
    5026         629 :                         if (ret < 0)
    5027           0 :                                 return ret;
    5028         629 :                         else if (ret > 0)
    5029             :                                 break;
    5030          71 :                         continue;
    5031             :                 }
    5032             : 
    5033       13068 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    5034       13067 :                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
    5035             :                         break;
    5036             : 
    5037        3910 :                 if (ins_nr == 0)
    5038        1622 :                         start_slot = slot;
    5039        3910 :                 ins_nr++;
    5040        3910 :                 path->slots[0]++;
    5041        3910 :                 found_xattrs = true;
    5042        3910 :                 cond_resched();
    5043             :         }
    5044        9715 :         if (ins_nr > 0) {
    5045        1599 :                 ret = copy_items(trans, inode, dst_path, path,
    5046             :                                  start_slot, ins_nr, 1, 0);
    5047        1599 :                 if (ret < 0)
    5048             :                         return ret;
    5049             :         }
    5050             : 
    5051        9715 :         if (!found_xattrs)
    5052        8104 :                 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
    5053             : 
    5054             :         return 0;
    5055             : }
    5056             : 
    5057             : /*
    5058             :  * When using the NO_HOLES feature if we punched a hole that causes the
    5059             :  * deletion of entire leafs or all the extent items of the first leaf (the one
    5060             :  * that contains the inode item and references) we may end up not processing
    5061             :  * any extents, because there are no leafs with a generation matching the
    5062             :  * current transaction that have extent items for our inode. So we need to find
    5063             :  * if any holes exist and then log them. We also need to log holes after any
    5064             :  * truncate operation that changes the inode's size.
    5065             :  */
    5066      116316 : static int btrfs_log_holes(struct btrfs_trans_handle *trans,
    5067             :                            struct btrfs_inode *inode,
    5068             :                            struct btrfs_path *path)
    5069             : {
    5070      116316 :         struct btrfs_root *root = inode->root;
    5071      116316 :         struct btrfs_fs_info *fs_info = root->fs_info;
    5072      116316 :         struct btrfs_key key;
    5073      116316 :         const u64 ino = btrfs_ino(inode);
    5074      116316 :         const u64 i_size = i_size_read(&inode->vfs_inode);
    5075      116316 :         u64 prev_extent_end = 0;
    5076      116316 :         int ret;
    5077             : 
    5078      116316 :         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
    5079             :                 return 0;
    5080             : 
    5081      114896 :         key.objectid = ino;
    5082      114896 :         key.type = BTRFS_EXTENT_DATA_KEY;
    5083      114896 :         key.offset = 0;
    5084             : 
    5085      114896 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    5086      114896 :         if (ret < 0)
    5087             :                 return ret;
    5088             : 
    5089     3725131 :         while (true) {
    5090     3840027 :                 struct extent_buffer *leaf = path->nodes[0];
    5091             : 
    5092     3840027 :                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
    5093       66015 :                         ret = btrfs_next_leaf(root, path);
    5094       66015 :                         if (ret < 0)
    5095           0 :                                 return ret;
    5096       66015 :                         if (ret > 0) {
    5097             :                                 ret = 0;
    5098             :                                 break;
    5099             :                         }
    5100       10742 :                         leaf = path->nodes[0];
    5101             :                 }
    5102             : 
    5103     3784754 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    5104     3784755 :                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
    5105             :                         break;
    5106             : 
    5107             :                 /* We have a hole, log it. */
    5108     3725132 :                 if (prev_extent_end < key.offset) {
    5109     2591079 :                         const u64 hole_len = key.offset - prev_extent_end;
    5110             : 
    5111             :                         /*
    5112             :                          * Release the path to avoid deadlocks with other code
    5113             :                          * paths that search the root while holding locks on
    5114             :                          * leafs from the log root.
    5115             :                          */
    5116     2591079 :                         btrfs_release_path(path);
    5117     2591079 :                         ret = btrfs_insert_hole_extent(trans, root->log_root,
    5118             :                                                        ino, prev_extent_end,
    5119             :                                                        hole_len);
    5120     2591079 :                         if (ret < 0)
    5121           0 :                                 return ret;
    5122             : 
    5123             :                         /*
    5124             :                          * Search for the same key again in the root. Since it's
    5125             :                          * an extent item and we are holding the inode lock, the
    5126             :                          * key must still exist. If it doesn't, just emit a warning
    5127             :                          * and return an error to fall back to a transaction
    5128             :                          * commit.
    5129             :                          */
    5130     2591079 :                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    5131     2591079 :                         if (ret < 0)
    5132           0 :                                 return ret;
    5133     2591079 :                         if (WARN_ON(ret > 0))
    5134             :                                 return -ENOENT;
    5135             :                         leaf = path->nodes[0];
    5136             :                 }
    5137             : 
    5138     3725132 :                 prev_extent_end = btrfs_file_extent_end(path);
    5139     3725132 :                 path->slots[0]++;
    5140     3725132 :                 cond_resched();
    5141             :         }
    5142             : 
    5143      114896 :         if (prev_extent_end < i_size) {
    5144       17219 :                 u64 hole_len;
    5145             : 
    5146       17219 :                 btrfs_release_path(path);
    5147       17219 :                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
    5148       17219 :                 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
    5149             :                                                prev_extent_end, hole_len);
    5150       17219 :                 if (ret < 0)
    5151             :                         return ret;
    5152             :         }
    5153             : 
    5154             :         return 0;
    5155             : }
    5156             : 
    5157             : /*
    5158             :  * When we are logging a new inode X, check if it doesn't have a reference that
    5159             :  * matches the reference from some other inode Y created in a past transaction
    5160             :  * and that was renamed in the current transaction. If we don't do this, then at
    5161             :  * log replay time we can lose inode Y (and all its files if it's a directory):
    5162             :  *
    5163             :  * mkdir /mnt/x
    5164             :  * echo "hello world" > /mnt/x/foobar
    5165             :  * sync
    5166             :  * mv /mnt/x /mnt/y
    5167             :  * mkdir /mnt/x                 # or touch /mnt/x
    5168             :  * xfs_io -c fsync /mnt/x
    5169             :  * <power fail>
    5170             :  * mount fs, trigger log replay
    5171             :  *
    5172             :  * After the log replay procedure, we would lose the first directory and all its
    5173             :  * files (file foobar).
    5174             :  * For the case where inode Y is not a directory we simply end up losing it:
    5175             :  *
    5176             :  * echo "123" > /mnt/foo
    5177             :  * sync
    5178             :  * mv /mnt/foo /mnt/bar
    5179             :  * echo "abc" > /mnt/foo
    5180             :  * xfs_io -c fsync /mnt/foo
    5181             :  * <power fail>
    5182             :  *
    5183             :  * We also need this for cases where a snapshot entry is replaced by some other
    5184             :  * entry (file or directory) otherwise we end up with an unreplayable log due to
    5185             :  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
    5186             :  * if it were a regular entry:
    5187             :  *
    5188             :  * mkdir /mnt/x
    5189             :  * btrfs subvolume snapshot /mnt /mnt/x/snap
    5190             :  * btrfs subvolume delete /mnt/x/snap
    5191             :  * rmdir /mnt/x
    5192             :  * mkdir /mnt/x
    5193             :  * fsync /mnt/x or fsync some new file inside it
    5194             :  * <power fail>
    5195             :  *
    5196             :  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
    5197             :  * the same transaction.
    5198             :  */
    5199       17199 : static int btrfs_check_ref_name_override(struct extent_buffer *eb,
    5200             :                                          const int slot,
    5201             :                                          const struct btrfs_key *key,
    5202             :                                          struct btrfs_inode *inode,
    5203             :                                          u64 *other_ino, u64 *other_parent)
    5204             : {
                               /*
                                * Walk every name packed in the item at @slot of @eb and look
                                * each one up as a dir item of its parent directory in
                                * inode->root.
                                *
                                * Return values, as set below:
                                *   1        - a name resolves to an inode item whose objectid
                                *              differs from key->objectid; *other_ino and
                                *              *other_parent are filled in.
                                *   0        - no name resolved to a dir item, or the first one
                                *              that did points to key->objectid itself.
                                *   -EAGAIN  - the first dir item found does not point to an
                                *              inode item (di_key.type != BTRFS_INODE_ITEM_KEY).
                                *   -ENOMEM  - path or name buffer allocation failed.
                                *   other <0 - error returned by btrfs_lookup_dir_item().
                                *
                                * The scan stops at the first name for which a dir item exists.
                                */
    5205       17199 :         int ret;
    5206       17199 :         struct btrfs_path *search_path;
    5207       17199 :         char *name = NULL;
    5208       17199 :         u32 name_len = 0;
    5209       17199 :         u32 item_size = btrfs_item_size(eb, slot);
    5210       17199 :         u32 cur_offset = 0;
    5211       17199 :         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
    5212             : 
    5213       17199 :         search_path = btrfs_alloc_path();
    5214       17199 :         if (!search_path)
    5215             :                 return -ENOMEM;
                               /* Look names up in the commit root, without tree locking. */
    5216       17199 :         search_path->search_commit_root = 1;
    5217       17199 :         search_path->skip_locking = 1;
    5218             : 
    5219       30147 :         while (cur_offset < item_size) {
    5220       17355 :                 u64 parent;
    5221       17355 :                 u32 this_name_len;
    5222       17355 :                 u32 this_len;
    5223       17355 :                 unsigned long name_ptr;
    5224       17355 :                 struct btrfs_dir_item *di;
    5225       17355 :                 struct fscrypt_str name_str;
    5226             : 
                                       /*
                                        * For an INODE_REF item the parent directory is the key
                                        * offset; any other key type is parsed as an extref,
                                        * which stores the parent inside each entry.
                                        */
    5227       17355 :                 if (key->type == BTRFS_INODE_REF_KEY) {
    5228       17355 :                         struct btrfs_inode_ref *iref;
    5229             : 
    5230       17355 :                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
    5231       17355 :                         parent = key->offset;
    5232       17355 :                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
    5233       17355 :                         name_ptr = (unsigned long)(iref + 1);
    5234       17355 :                         this_len = sizeof(*iref) + this_name_len;
    5235             :                 } else {
    5236           0 :                         struct btrfs_inode_extref *extref;
    5237             : 
    5238           0 :                         extref = (struct btrfs_inode_extref *)(ptr +
    5239             :                                                                cur_offset);
    5240           0 :                         parent = btrfs_inode_extref_parent(eb, extref);
    5241           0 :                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
    5242           0 :                         name_ptr = (unsigned long)&extref->name;
    5243           0 :                         this_len = sizeof(*extref) + this_name_len;
    5244             :                 }
    5245             : 
                                       /*
                                        * The scratch buffer only ever grows. The krealloc()
                                        * result goes into a temporary so that, on failure, the
                                        * old buffer in @name is still freed at the out label.
                                        */
    5246       17355 :                 if (this_name_len > name_len) {
    5247       17232 :                         char *new_name;
    5248             : 
    5249       17232 :                         new_name = krealloc(name, this_name_len, GFP_NOFS);
    5250       17232 :                         if (!new_name) {
    5251           0 :                                 ret = -ENOMEM;
    5252        4407 :                                 goto out;
    5253             :                         }
    5254             :                         name_len = this_name_len;
    5255             :                         name = new_name;
    5256             :                 }
    5257             : 
    5258       17355 :                 read_extent_buffer(eb, name, name_ptr, this_name_len);
    5259             : 
    5260       17355 :                 name_str.name = name;
    5261       17355 :                 name_str.len = this_name_len;
    5262       17355 :                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
    5263             :                                 parent, &name_str, 0);
    5264       17355 :                 if (di && !IS_ERR(di)) {
    5265        4407 :                         struct btrfs_key di_key;
    5266             : 
    5267        4407 :                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
    5268             :                                                   di, &di_key);
    5269        4407 :                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
    5270        4407 :                                 if (di_key.objectid != key->objectid) {
    5271        4379 :                                         ret = 1;
    5272        4379 :                                         *other_ino = di_key.objectid;
    5273        4379 :                                         *other_parent = parent;
    5274             :                                 } else {
    5275             :                                         ret = 0;
    5276             :                                 }
    5277             :                         } else {
    5278             :                                 ret = -EAGAIN;
    5279             :                         }
    5280        4407 :                         goto out;
    5281       12948 :                 } else if (IS_ERR(di)) {
    5282           0 :                         ret = PTR_ERR(di);
    5283           0 :                         goto out;
    5284             :                 }
                                       /* No dir item for this name: drop the path, try the next. */
    5285       12948 :                 btrfs_release_path(search_path);
    5286             : 
    5287       12948 :                 cur_offset += this_len;
    5288             :         }
    5289             :         ret = 0;
    5290       17199 : out:
    5291       17199 :         btrfs_free_path(search_path);
    5292       17199 :         kfree(name);
    5293       17199 :         return ret;
    5294             : }
    5295             : 
    5296             : /*
    5297             :  * Check if we need to log an inode. This is used in contexts where while
    5298             :  * logging an inode we need to log another inode (either that it exists or in
    5299             :  * full mode). This is used instead of btrfs_inode_in_log() because the latter
    5300             :  * requires the inode to be in the log and have the log transaction committed,
    5301             :  * while here we do not care if the log transaction was already committed - our
    5302             :  * caller will commit the log later - and we want to avoid logging an inode
    5303             :  * multiple times when multiple tasks have joined the same log transaction.
    5304             :  */
    5305       12220 : static bool need_log_inode(const struct btrfs_trans_handle *trans,
    5306             :                            struct btrfs_inode *inode)
    5307             : {
                               /*
                                * Returns false when logging @inode can be skipped for @trans,
                                * true when the caller has to log it.
                                */
    5308             :         /*
    5309             :          * If a directory was not modified, no dentries added or removed, we can
    5310             :          * and should avoid logging it.
    5311             :          */
    5312       12220 :         if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
    5313             :                 return false;
    5314             : 
    5315             :         /*
    5316             :          * If this inode does not have new/updated/deleted xattrs since the last
    5317             :          * time it was logged and is flagged as logged in the current transaction,
    5318             :          * we can skip logging it. As for new/deleted names, those are updated in
    5319             :          * the log by link/unlink/rename operations.
    5320             :          * In case the inode was logged and then evicted and reloaded, its
    5321             :          * logged_trans will be 0, in which case we have to fully log it since
    5322             :          * logged_trans is a transient field, not persisted.
    5323             :          */
                               /*
                                * NOTE(review): inode_logged() is defined outside this view; a
                                * result of 1 presumably means "logged in the current
                                * transaction" - confirm against its definition.
                                */
    5324       22111 :         if (inode_logged(trans, inode, NULL) == 1 &&
    5325        9965 :             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
    5326        9870 :                 return false;
    5327             : 
    5328             :         return true;
    5329             : }
    5330             : 
    5331             : struct btrfs_dir_list {
                               /*
                                * Work-list node used by log_new_dir_dentries(): one entry per
                                * inode whose own new dentries still have to be processed.
                                */
                               /* Inode number to visit later (copied from the dir item key). */
    5332             :         u64 ino;
                               /* Linkage on the local dir_list of log_new_dir_dentries(). */
    5333             :         struct list_head list;
    5334             : };
    5335             : 
    5336             : /*
    5337             :  * Log the inodes of the new dentries of a directory.
    5338             :  * See process_dir_items_leaf() for details about why it is needed.
    5339             :  * This is a recursive operation - if an existing dentry corresponds to a
    5340             :  * directory, that directory's new entries are logged too (same behaviour as
    5341             :  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
    5342             :  * the dentries point to we do not acquire their VFS lock, otherwise lockdep
    5343             :  * complains about the following circular lock dependency / possible deadlock:
    5344             :  *
    5345             :  *        CPU0                                        CPU1
    5346             :  *        ----                                        ----
    5347             :  * lock(&type->i_mutex_dir_key#3/2);
    5348             :  *                                            lock(sb_internal#2);
    5349             :  *                                            lock(&type->i_mutex_dir_key#3/2);
    5350             :  * lock(&sb->s_type->i_mutex_key#14);
    5351             :  *
    5352             :  * Where sb_internal is the lock (a counter that works as a lock) acquired by
    5353             :  * sb_start_intwrite() in btrfs_start_transaction().
    5354             :  * Not acquiring the VFS lock of the inodes is still safe because:
    5355             :  *
    5356             :  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
    5357             :  *    that while logging the inode new references (names) are added or removed
    5358             :  *    from the inode, leaving the logged inode item with a link count that does
    5359             :  *    not match the number of logged inode reference items. This is fine because
    5360             :  *    at log replay time we compute the real number of links and correct the
    5361             :  *    link count in the inode item (see replay_one_buffer() and
    5362             :  *    link_to_fixup_dir());
    5363             :  *
    5364             :  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
    5365             :  *    while logging the inode's items new index items (key type
    5366             :  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
    5367             :  *    has a size that doesn't match the sum of the lengths of all the logged
    5368             :  *    names - this is ok, not a problem, because at log replay time we set the
    5369             :  *    directory's i_size to the correct value (see replay_one_name() and
    5370             :  *    overwrite_item()).
    5371             :  */
    5372          82 : static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
    5373             :                                 struct btrfs_inode *start_inode,
    5374             :                                 struct btrfs_log_ctx *ctx)
    5375             : {
                               /*
                                * Returns 0 on success (including the early exit when
                                * ctx->logging_new_name is set) or a negative errno.
                                *
                                * The recursion is implemented iteratively: inodes whose logging
                                * set ctx->log_new_dentries are appended to dir_list and handled
                                * by later iterations of the outer loop.
                                */
    5376          82 :         struct btrfs_root *root = start_inode->root;
    5377          82 :         struct btrfs_fs_info *fs_info = root->fs_info;
    5378          82 :         struct btrfs_path *path;
    5379          82 :         LIST_HEAD(dir_list);
    5380          82 :         struct btrfs_dir_list *dir_elem;
    5381          82 :         u64 ino = btrfs_ino(start_inode);
    5382          82 :         struct btrfs_inode *curr_inode = start_inode;
    5383          82 :         int ret = 0;
    5384             : 
    5385             :         /*
    5386             :          * If we are logging a new name, as part of a link or rename operation,
    5387             :          * don't bother logging new dentries, as we just want to log the names
    5388             :          * of an inode and that any new parents exist.
    5389             :          */
    5390          82 :         if (ctx->logging_new_name)
    5391             :                 return 0;
    5392             : 
    5393          72 :         path = btrfs_alloc_path();
    5394          72 :         if (!path)
    5395             :                 return -ENOMEM;
    5396             : 
    5397             :         /* Pairs with btrfs_add_delayed_iput below. */
    5398          72 :         ihold(&curr_inode->vfs_inode);
    5399             : 
    5400           4 :         while (true) {
    5401          76 :                 struct inode *vfs_inode;
    5402          76 :                 struct btrfs_key key;
    5403          76 :                 struct btrfs_key found_key;
    5404          76 :                 u64 next_index;
    5405          76 :                 bool continue_curr_inode = true;
    5406          76 :                 int iter_ret;
    5407             : 
                                       /* Scan the log tree's DIR_INDEX items of the current inode. */
    5408          76 :                 key.objectid = ino;
    5409          76 :                 key.type = BTRFS_DIR_INDEX_KEY;
    5410          76 :                 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
    5411          76 :                 next_index = key.offset;
    5412         194 : again:
    5413         194 :                 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
    5414         188 :                         struct extent_buffer *leaf = path->nodes[0];
    5415         188 :                         struct btrfs_dir_item *di;
    5416         188 :                         struct btrfs_key di_key;
    5417         188 :                         struct inode *di_inode;
    5418         188 :                         int log_mode = LOG_INODE_EXISTS;
    5419         188 :                         int type;
    5420             : 
                                               /* Ran past the DIR_INDEX items of this inode. */
    5421         188 :                         if (found_key.objectid != ino ||
    5422         119 :                             found_key.type != BTRFS_DIR_INDEX_KEY) {
    5423             :                                 continue_curr_inode = false;
    5424         187 :                                 break;
    5425             :                         }
    5426             : 
                                               /*
                                                * Index to resume from; stored on curr_inode once
                                                * this directory has been fully scanned.
                                                */
    5427         119 :                         next_index = found_key.offset + 1;
    5428             : 
    5429         119 :                         di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
    5430         119 :                         type = btrfs_dir_ftype(leaf, di);
                                               /* Entry's transid is older than this transaction. */
    5431         119 :                         if (btrfs_dir_transid(leaf, di) < trans->transid)
    5432           0 :                                 continue;
    5433         119 :                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
                                               /* Entries that point to a root item are skipped. */
    5434         119 :                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
    5435           0 :                                 continue;
    5436             : 
                                               /*
                                                * The path is released before btrfs_iget(). From
                                                * here on every exit is a break or goto, so the
                                                * iterator never continues on the released path;
                                                * the search restarts at the again label.
                                                */
    5437         119 :                         btrfs_release_path(path);
    5438         119 :                         di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
    5439         119 :                         if (IS_ERR(di_inode)) {
    5440           0 :                                 ret = PTR_ERR(di_inode);
    5441           1 :                                 goto out;
    5442             :                         }
    5443             : 
    5444         119 :                         if (!need_log_inode(trans, BTRFS_I(di_inode))) {
    5445          19 :                                 btrfs_add_delayed_iput(BTRFS_I(di_inode));
    5446          19 :                                 break;
    5447             :                         }
    5448             : 
                                               /*
                                                * Cleared before the call so that a true value
                                                * afterwards comes from logging this inode.
                                                */
    5449         100 :                         ctx->log_new_dentries = false;
    5450         100 :                         if (type == BTRFS_FT_DIR)
    5451          13 :                                 log_mode = LOG_INODE_ALL;
    5452         100 :                         ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
    5453             :                                               log_mode, ctx);
    5454         100 :                         btrfs_add_delayed_iput(BTRFS_I(di_inode));
    5455         100 :                         if (ret)
    5456           1 :                                 goto out;
                                               /* Queue the inode so its own dentries get visited. */
    5457          99 :                         if (ctx->log_new_dentries) {
    5458           4 :                                 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
    5459           4 :                                 if (!dir_elem) {
    5460           0 :                                         ret = -ENOMEM;
    5461           0 :                                         goto out;
    5462             :                                 }
    5463           4 :                                 dir_elem->ino = di_key.objectid;
    5464           4 :                                 list_add_tail(&dir_elem->list, &dir_list);
    5465             :                         }
    5466             :                         break;
    5467             :                 }
    5468             : 
    5469         193 :                 btrfs_release_path(path);
    5470             : 
                                       /*
                                        * NOTE(review): btrfs_for_each_slot() is defined outside
                                        * this view; iter_ret > 0 presumably means the iteration
                                        * ran out of items, and 0 that the loop was left via
                                        * break - confirm against the macro.
                                        */
    5471         193 :                 if (iter_ret < 0) {
    5472           0 :                         ret = iter_ret;
    5473           0 :                         goto out;
    5474         193 :                 } else if (iter_ret > 0) {
    5475             :                         continue_curr_inode = false;
    5476             :                 } else {
    5477         187 :                         key = found_key;
    5478             :                 }
    5479             : 
                                       /* Restart the search just past the entry handled above. */
    5480         187 :                 if (continue_curr_inode && key.offset < (u64)-1) {
    5481         118 :                         key.offset++;
    5482         118 :                         goto again;
    5483             :                 }
    5484             : 
    5485          75 :                 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
    5486             : 
    5487          75 :                 if (list_empty(&dir_list))
    5488             :                         break;
    5489             : 
                                       /* Move on to the next queued inode. */
    5490           4 :                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
    5491           4 :                 ino = dir_elem->ino;
    5492           4 :                 list_del(&dir_elem->list);
    5493           4 :                 kfree(dir_elem);
    5494             : 
                                       /*
                                        * curr_inode is set to NULL so the out label does not drop
                                        * the old reference a second time if btrfs_iget() fails.
                                        */
    5495           4 :                 btrfs_add_delayed_iput(curr_inode);
    5496           4 :                 curr_inode = NULL;
    5497             : 
    5498           4 :                 vfs_inode = btrfs_iget(fs_info->sb, ino, root);
    5499           4 :                 if (IS_ERR(vfs_inode)) {
    5500           0 :                         ret = PTR_ERR(vfs_inode);
    5501           0 :                         break;
    5502             :                 }
    5503           4 :                 curr_inode = BTRFS_I(vfs_inode);
    5504             :         }
    5505          72 : out:
    5506          72 :         btrfs_free_path(path);
    5507          72 :         if (curr_inode)
    5508          72 :                 btrfs_add_delayed_iput(curr_inode);
    5509             : 
                               /*
                                * On success the loop only ends with an empty dir_list, so
                                * leftover entries can exist only on the error paths.
                                */
    5510          72 :         if (ret) {
    5511           1 :                 struct btrfs_dir_list *next;
    5512             : 
    5513           1 :                 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
    5514           0 :                         kfree(dir_elem);
    5515             :         }
    5516             : 
    5517             :         return ret;
    5518             : }
    5519             : 
    5520             : struct btrfs_ino_list {
                               /*
                                * Entry of ctx->conflict_inodes, allocated by
                                * add_conflicting_inode() and released by
                                * free_conflicting_inodes().
                                */
                               /* Number of the conflicting inode. */
    5521             :         u64 ino;
                               /* The parent value passed to add_conflicting_inode(). */
    5522             :         u64 parent;
                               /* Linkage on ctx->conflict_inodes. */
    5523             :         struct list_head list;
    5524             : };
    5525             : 
    5526         702 : static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
    5527             : {
                               /*
                                * Unlink and free every entry still queued on
                                * ctx->conflict_inodes. ctx->num_conflict_inodes is not
                                * modified here.
                                */
    5528         702 :         struct btrfs_ino_list *curr;
    5529         702 :         struct btrfs_ino_list *next;
    5530             : 
    5531         702 :         list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
    5532           0 :                 list_del(&curr->list);
    5533           0 :                 kfree(curr);
    5534             :         }
    5535         702 : }
    5536             : 
    5537        4288 : static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
    5538             :                                     struct btrfs_path *path)
    5539             : {
                               /*
                                * Look up the inode item of @ino in the commit root of @root and
                                * report whether its mode is a directory.
                                *
                                * Returns 1 if it is a directory, 0 if it is not, -ENOENT (after
                                * a one-time warning) if the inode item is missing, or the
                                * negative error from btrfs_search_slot().
                                *
                                * @path is borrowed from the caller: it is released and its
                                * search_commit_root/skip_locking flags are cleared again
                                * before returning.
                                */
    5540        4288 :         struct btrfs_key key;
    5541        4288 :         int ret;
    5542             : 
    5543        4288 :         key.objectid = ino;
    5544        4288 :         key.type = BTRFS_INODE_ITEM_KEY;
    5545        4288 :         key.offset = 0;
    5546             : 
    5547        4288 :         path->search_commit_root = 1;
    5548        4288 :         path->skip_locking = 1;
    5549             : 
    5550        4288 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    5551        4288 :         if (WARN_ON_ONCE(ret > 0)) {
    5552             :                 /*
    5553             :                  * We have previously found the inode through the commit root
    5554             :                  * so this should not happen. If it does, just error out and
    5555             :                  * fallback to a transaction commit.
    5556             :                  */
    5557             :                 ret = -ENOENT;
    5558        4288 :         } else if (ret == 0) {
    5559        4288 :                 struct btrfs_inode_item *item;
    5560             : 
    5561        4288 :                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    5562             :                                       struct btrfs_inode_item);
    5563        4288 :                 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
    5564          78 :                         ret = 1;
    5565             :         }
    5566             : 
    5567        4288 :         btrfs_release_path(path);
    5568        4288 :         path->search_commit_root = 0;
    5569        4288 :         path->skip_locking = 0;
    5570             : 
    5571        4288 :         return ret;
    5572             : }
    5573             : 
    5574        4372 : static int add_conflicting_inode(struct btrfs_trans_handle *trans,
    5575             :                                  struct btrfs_root *root,
    5576             :                                  struct btrfs_path *path,
    5577             :                                  u64 ino, u64 parent,
    5578             :                                  struct btrfs_log_ctx *ctx)
    5579             : {
                               /*
                                * Decide whether the conflicting inode @ino (found under @parent)
                                * must be remembered and, if so, append an entry for it to
                                * ctx->conflict_inodes.
                                *
                                * Returns 0 when an entry was queued or none is needed,
                                * BTRFS_LOG_FORCE_COMMIT when MAX_CONFLICT_INODES entries are
                                * already queued, -ENOMEM on allocation failure, or another
                                * negative errno from btrfs_iget() or
                                * conflicting_inode_is_dir().
                                */
    5580        4372 :         struct btrfs_ino_list *ino_elem;
    5581        4372 :         struct inode *inode;
    5582             : 
    5583             :         /*
    5584             :          * It's rare to have a lot of conflicting inodes, in practice it is not
    5585             :          * common to have more than 1 or 2. We don't want to collect too many,
    5586             :          * as we could end up logging too many inodes (even if only in
    5587             :          * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
    5588             :          * commits.
    5589             :          */
    5590        4372 :         if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
    5591             :                 return BTRFS_LOG_FORCE_COMMIT;
    5592             : 
    5593        4372 :         inode = btrfs_iget(root->fs_info->sb, ino, root);
    5594             :         /*
    5595             :          * If the other inode that had a conflicting dir entry was deleted in
    5596             :          * the current transaction then we either:
    5597             :          *
    5598             :          * 1) Log the parent directory (later after adding it to the list) if
    5599             :          *    the inode is a directory. This is because it may be a deleted
    5600             :          *    subvolume/snapshot or it may be a regular directory that had
    5601             :          *    deleted subvolumes/snapshots (or subdirectories that had them),
    5602             :          *    and at the moment we can't deal with dropping subvolumes/snapshots
    5603             :          *    during log replay. So we just log the parent, which will result in
    5604             :          *    a fallback to a transaction commit if we are dealing with those
    5605             :          *    cases (last_unlink_trans will match the current transaction);
    5606             :          *
    5607             :          * 2) Do nothing if it's not a directory. During log replay we simply
    5608             :          *    unlink the conflicting dentry from the parent directory and then
    5609             :          *    add the dentry for our inode. Like this we can avoid logging the
    5610             :          *    parent directory (and maybe fallback to a transaction commit in
    5611             :          *    case it has a last_unlink_trans == trans->transid, due to moving
    5612             :          *    some inode from it to some other directory).
    5613             :          */
    5614        4372 :         if (IS_ERR(inode)) {
    5615        4288 :                 int ret = PTR_ERR(inode);
    5616             : 
                                       /* Only -ENOENT is treated as "inode was deleted". */
    5617        4288 :                 if (ret != -ENOENT)
    5618             :                         return ret;
    5619             : 
    5620        4288 :                 ret = conflicting_inode_is_dir(root, ino, path);
    5621             :                 /* Not a directory or we got an error. */
    5622        4288 :                 if (ret <= 0)
    5623             :                         return ret;
    5624             : 
    5625             :                 /* Conflicting inode is a directory, so we'll log its parent. */
    5626          78 :                 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
    5627          78 :                 if (!ino_elem)
    5628             :                         return -ENOMEM;
    5629          78 :                 ino_elem->ino = ino;
    5630          78 :                 ino_elem->parent = parent;
    5631          78 :                 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
    5632          78 :                 ctx->num_conflict_inodes++;
    5633             : 
    5634          78 :                 return 0;
    5635             :         }
    5636             : 
    5637             :         /*
    5638             :          * If the inode was already logged skip it - otherwise we can hit an
    5639             :          * infinite loop. Example:
    5640             :          *
    5641             :          * From the commit root (previous transaction) we have the following
    5642             :          * inodes:
    5643             :          *
    5644             :          * inode 257 a directory
    5645             :          * inode 258 with references "zz" and "zz_link" on inode 257
    5646             :          * inode 259 with reference "a" on inode 257
    5647             :          *
    5648             :          * And in the current (uncommitted) transaction we have:
    5649             :          *
    5650             :          * inode 257 a directory, unchanged
    5651             :          * inode 258 with references "a" and "a2" on inode 257
    5652             :          * inode 259 with reference "zz_link" on inode 257
    5653             :          * inode 261 with reference "zz" on inode 257
    5654             :          *
    5655             :          * When logging inode 261 the following infinite loop could
    5656             :          * happen if we don't skip already logged inodes:
    5657             :          *
    5658             :          * - we detect inode 258 as a conflicting inode, with inode 261
    5659             :          *   on reference "zz", and log it;
    5660             :          *
    5661             :          * - we detect inode 259 as a conflicting inode, with inode 258
    5662             :          *   on reference "a", and log it;
    5663             :          *
    5664             :          * - we detect inode 258 as a conflicting inode, with inode 259
    5665             :          *   on reference "zz_link", and log it - again! After this we
    5666             :          *   repeat the above steps forever.
    5667             :          *
    5668             :          * Here we can use need_log_inode() because we only need to log the
    5669             :          * inode in LOG_INODE_EXISTS mode and rename operations update the log,
    5670             :          * so that the log ends up with the new name and without the old name.
    5671             :          */
    5672          84 :         if (!need_log_inode(trans, BTRFS_I(inode))) {
    5673          28 :                 btrfs_add_delayed_iput(BTRFS_I(inode));
    5674          28 :                 return 0;
    5675             :         }
    5676             : 
                               /*
                                * The inode reference is dropped right away; the list entry
                                * built below records only the inode and parent numbers.
                                */
    5677          56 :         btrfs_add_delayed_iput(BTRFS_I(inode));
    5678             : 
    5679          56 :         ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
    5680          56 :         if (!ino_elem)
    5681             :                 return -ENOMEM;
    5682          56 :         ino_elem->ino = ino;
    5683          56 :         ino_elem->parent = parent;
    5684          56 :         list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
    5685          56 :         ctx->num_conflict_inodes++;
    5686             : 
    5687          56 :         return 0;
    5688             : }
    5689             : 
    5690      255112 : static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
    5691             :                                   struct btrfs_root *root,
    5692             :                                   struct btrfs_log_ctx *ctx)
    5693             : {
    5694      255112 :         struct btrfs_fs_info *fs_info = root->fs_info;
    5695      255112 :         int ret = 0;
    5696             : 
    5697             :         /*
    5698             :          * Conflicting inodes are logged by the first call to btrfs_log_inode(),
    5699             :          * otherwise we could have unbounded recursion of btrfs_log_inode()
    5700             :          * calls. This check guarantees we can have only 1 level of recursion.
    5701             :          */
    5702      255112 :         if (ctx->logging_conflict_inodes)
    5703             :                 return 0;
    5704             : 
    5705      255032 :         ctx->logging_conflict_inodes = true;
    5706             : 
    5707             :         /*
    5708             :          * New conflicting inodes may be found and added to the list while we
    5709             :          * are logging a conflicting inode, so keep iterating while the list is
    5710             :          * not empty.
    5711             :          */
    5712      255112 :         while (!list_empty(&ctx->conflict_inodes)) {
    5713         149 :                 struct btrfs_ino_list *curr;
    5714         149 :                 struct inode *inode;
    5715         149 :                 u64 ino;
    5716         149 :                 u64 parent;
    5717             : 
    5718         149 :                 curr = list_first_entry(&ctx->conflict_inodes,
    5719             :                                         struct btrfs_ino_list, list);
    5720         149 :                 ino = curr->ino;
    5721         149 :                 parent = curr->parent;
    5722         149 :                 list_del(&curr->list);
    5723         134 :                 kfree(curr);
    5724             : 
    5725         134 :                 inode = btrfs_iget(fs_info->sb, ino, root);
    5726             :                 /*
    5727             :                  * If the other inode that had a conflicting dir entry was
    5728             :                  * deleted in the current transaction, we need to log its parent
    5729             :                  * directory. See the comment at add_conflicting_inode().
    5730             :                  */
    5731         134 :                 if (IS_ERR(inode)) {
    5732          78 :                         ret = PTR_ERR(inode);
    5733          78 :                         if (ret != -ENOENT)
    5734             :                                 break;
    5735             : 
    5736          78 :                         inode = btrfs_iget(fs_info->sb, parent, root);
    5737          78 :                         if (IS_ERR(inode)) {
    5738           0 :                                 ret = PTR_ERR(inode);
    5739           0 :                                 break;
    5740             :                         }
    5741             : 
    5742             :                         /*
    5743             :                          * Always log the directory, we cannot make this
    5744             :                          * conditional on need_log_inode() because the directory
    5745             :                          * might have been logged in LOG_INODE_EXISTS mode or
    5746             :                          * the dir index of the conflicting inode is not in a
    5747             :                          * dir index key range logged for the directory. So we
    5748             :                          * must make sure the deletion is recorded.
    5749             :                          */
    5750          78 :                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
    5751             :                                               LOG_INODE_ALL, ctx);
    5752          78 :                         btrfs_add_delayed_iput(BTRFS_I(inode));
    5753          78 :                         if (ret)
    5754             :                                 break;
    5755          24 :                         continue;
    5756             :                 }
    5757             : 
    5758             :                 /*
    5759             :                  * Here we can use need_log_inode() because we only need to log
    5760             :                  * the inode in LOG_INODE_EXISTS mode and rename operations
    5761             :                  * update the log, so that the log ends up with the new name and
    5762             :                  * without the old name.
    5763             :                  *
    5764             :                  * We did this check at add_conflicting_inode(), but here we do
    5765             :                  * it again because if some other task logged the inode after
    5766             :                  * that, we can avoid doing it again.
    5767             :                  */
    5768          56 :                 if (!need_log_inode(trans, BTRFS_I(inode))) {
    5769           0 :                         btrfs_add_delayed_iput(BTRFS_I(inode));
    5770           0 :                         continue;
    5771             :                 }
    5772             : 
    5773             :                 /*
    5774             :                  * We are safe logging the other inode without acquiring its
    5775             :                  * lock as long as we log with the LOG_INODE_EXISTS mode. We
    5776             :                  * are safe against concurrent renames of the other inode as
    5777             :                  * well because during a rename we pin the log and update the
    5778             :                  * log with the new name before we unpin it.
    5779             :                  */
    5780          56 :                 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
    5781          56 :                 btrfs_add_delayed_iput(BTRFS_I(inode));
    5782          56 :                 if (ret)
    5783             :                         break;
    5784             :         }
    5785             : 
    5786      255017 :         ctx->logging_conflict_inodes = false;
    5787      255017 :         if (ret)
    5788          54 :                 free_conflicting_inodes(ctx);
    5789             : 
    5790             :         return ret;
    5791             : }
    5792             : 
    5793      119704 : static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
    5794             :                                    struct btrfs_inode *inode,
    5795             :                                    struct btrfs_key *min_key,
    5796             :                                    const struct btrfs_key *max_key,
    5797             :                                    struct btrfs_path *path,
    5798             :                                    struct btrfs_path *dst_path,
    5799             :                                    const u64 logged_isize,
    5800             :                                    const int inode_only,
    5801             :                                    struct btrfs_log_ctx *ctx,
    5802             :                                    bool *need_log_inode_item)
    5803             : {
    5804      119704 :         const u64 i_size = i_size_read(&inode->vfs_inode);
    5805      119704 :         struct btrfs_root *root = inode->root;
    5806      119704 :         int ins_start_slot = 0;
    5807      119704 :         int ins_nr = 0;
    5808      178892 :         int ret;
    5809             : 
    5810       59188 :         while (1) {
    5811      178892 :                 ret = btrfs_search_forward(root, min_key, path, trans->transid);
    5812      178891 :                 if (ret < 0)
    5813           0 :                         return ret;
    5814      178891 :                 if (ret > 0) {
    5815             :                         ret = 0;
    5816             :                         break;
    5817             :                 }
    5818      128349 : again:
    5819             :                 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
    5820     2699717 :                 if (min_key->objectid != max_key->objectid)
    5821             :                         break;
    5822     2643568 :                 if (min_key->type > max_key->type)
    5823             :                         break;
    5824             : 
    5825     2641512 :                 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
    5826      119164 :                         *need_log_inode_item = false;
    5827     2522348 :                 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
    5828     2399509 :                            min_key->offset >= i_size) {
    5829             :                         /*
    5830             :                          * Extents at and beyond eof are logged with
    5831             :                          * btrfs_log_prealloc_extents().
    5832             :                          * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
    5833             :                          * and no keys greater than that, so bail out.
    5834             :                          */
    5835             :                         break;
    5836     2511394 :                 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
    5837      120979 :                             min_key->type == BTRFS_INODE_EXTREF_KEY) &&
    5838      120979 :                            (inode->generation == trans->transid ||
    5839      116696 :                             ctx->logging_conflict_inodes)) {
    5840       17199 :                         u64 other_ino = 0;
    5841       17199 :                         u64 other_parent = 0;
    5842             : 
    5843       17199 :                         ret = btrfs_check_ref_name_override(path->nodes[0],
    5844             :                                         path->slots[0], min_key, inode,
    5845             :                                         &other_ino, &other_parent);
    5846       17199 :                         if (ret < 0) {
    5847           0 :                                 return ret;
    5848       17199 :                         } else if (ret > 0 &&
    5849        4379 :                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
    5850        4372 :                                 if (ins_nr > 0) {
    5851        4372 :                                         ins_nr++;
    5852             :                                 } else {
    5853           0 :                                         ins_nr = 1;
    5854           0 :                                         ins_start_slot = path->slots[0];
    5855             :                                 }
    5856        4372 :                                 ret = copy_items(trans, inode, dst_path, path,
    5857             :                                                  ins_start_slot, ins_nr,
    5858             :                                                  inode_only, logged_isize);
    5859        4372 :                                 if (ret < 0)
    5860           0 :                                         return ret;
    5861        4372 :                                 ins_nr = 0;
    5862             : 
    5863        4372 :                                 btrfs_release_path(path);
    5864        4372 :                                 ret = add_conflicting_inode(trans, root, path,
    5865             :                                                             other_ino,
    5866             :                                                             other_parent, ctx);
    5867        4372 :                                 if (ret)
    5868           0 :                                         return ret;
    5869        4372 :                                 goto next_key;
    5870             :                         }
    5871     2494195 :                 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
    5872             :                         /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
    5873        1861 :                         if (ins_nr == 0)
    5874         443 :                                 goto next_slot;
    5875        1418 :                         ret = copy_items(trans, inode, dst_path, path,
    5876             :                                          ins_start_slot,
    5877             :                                          ins_nr, inode_only, logged_isize);
    5878        1418 :                         if (ret < 0)
    5879           0 :                                 return ret;
    5880        1418 :                         ins_nr = 0;
    5881        1418 :                         goto next_slot;
    5882             :                 }
    5883             : 
    5884     2624325 :                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
    5885     2495707 :                         ins_nr++;
    5886     2495707 :                         goto next_slot;
    5887      128618 :                 } else if (!ins_nr) {
    5888      128618 :                         ins_start_slot = path->slots[0];
    5889      128618 :                         ins_nr = 1;
    5890      128618 :                         goto next_slot;
    5891             :                 }
    5892             : 
    5893           0 :                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
    5894             :                                  ins_nr, inode_only, logged_isize);
    5895           0 :                 if (ret < 0)
    5896           0 :                         return ret;
    5897           0 :                 ins_nr = 1;
    5898           0 :                 ins_start_slot = path->slots[0];
    5899     2626186 : next_slot:
    5900     2626186 :                 path->slots[0]++;
    5901     2626186 :                 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
    5902     2571369 :                         btrfs_item_key_to_cpu(path->nodes[0], min_key,
    5903             :                                               path->slots[0]);
    5904     2571368 :                         goto again;
    5905             :                 }
    5906       54817 :                 if (ins_nr) {
    5907       54806 :                         ret = copy_items(trans, inode, dst_path, path,
    5908             :                                          ins_start_slot, ins_nr, inode_only,
    5909             :                                          logged_isize);
    5910       54806 :                         if (ret < 0)
    5911           0 :                                 return ret;
    5912             :                         ins_nr = 0;
    5913             :                 }
    5914       54817 :                 btrfs_release_path(path);
    5915       59188 : next_key:
    5916       59188 :                 if (min_key->offset < (u64)-1) {
    5917       59188 :                         min_key->offset++;
    5918           0 :                 } else if (min_key->type < max_key->type) {
    5919           0 :                         min_key->type++;
    5920           0 :                         min_key->offset = 0;
    5921             :                 } else {
    5922             :                         break;
    5923             :                 }
    5924             : 
    5925             :                 /*
    5926             :                  * We may process many leaves full of items for our inode, so
    5927             :                  * avoid monopolizing a cpu for too long by rescheduling while
    5928             :                  * not holding locks on any tree.
    5929             :                  */
    5930       59188 :                 cond_resched();
    5931             :         }
    5932      119701 :         if (ins_nr) {
    5933       68023 :                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
    5934             :                                  ins_nr, inode_only, logged_isize);
    5935       68025 :                 if (ret)
    5936             :                         return ret;
    5937             :         }
    5938             : 
    5939      119703 :         if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
    5940             :                 /*
    5941             :                  * Release the path because otherwise we might attempt to double
    5942             :                  * lock the same leaf with btrfs_log_prealloc_extents() below.
    5943             :                  */
    5944      116184 :                 btrfs_release_path(path);
    5945      116185 :                 ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
    5946             :         }
    5947             : 
    5948             :         return ret;
    5949             : }
    5950             : 
    5951         466 : static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
    5952             :                                       struct btrfs_root *log,
    5953             :                                       struct btrfs_path *path,
    5954             :                                       const struct btrfs_item_batch *batch,
    5955             :                                       const struct btrfs_delayed_item *first_item)
    5956             : {
    5957         466 :         const struct btrfs_delayed_item *curr = first_item;
    5958         466 :         int ret;
    5959             : 
    5960         466 :         ret = btrfs_insert_empty_items(trans, log, path, batch);
    5961         466 :         if (ret)
    5962             :                 return ret;
    5963             : 
    5964        1594 :         for (int i = 0; i < batch->nr; i++) {
    5965        1128 :                 char *data_ptr;
    5966             : 
    5967        1128 :                 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
    5968        1128 :                 write_extent_buffer(path->nodes[0], &curr->data,
    5969        1128 :                                     (unsigned long)data_ptr, curr->data_len);
    5970        1128 :                 curr = list_next_entry(curr, log_list);
    5971        1128 :                 path->slots[0]++;
    5972             :         }
    5973             : 
    5974         466 :         btrfs_release_path(path);
    5975             : 
    5976         466 :         return 0;
    5977             : }
    5978             : 
    5979         799 : static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
    5980             :                                        struct btrfs_inode *inode,
    5981             :                                        struct btrfs_path *path,
    5982             :                                        const struct list_head *delayed_ins_list,
    5983             :                                        struct btrfs_log_ctx *ctx)
    5984             : {
    5985             :         /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
    5986         799 :         const int max_batch_size = 195;
    5987         799 :         const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
    5988         799 :         const u64 ino = btrfs_ino(inode);
    5989         799 :         struct btrfs_root *log = inode->root->log_root;
    5990         799 :         struct btrfs_item_batch batch = {
    5991             :                 .nr = 0,
    5992             :                 .total_data_size = 0,
    5993             :         };
    5994         799 :         const struct btrfs_delayed_item *first = NULL;
    5995         799 :         const struct btrfs_delayed_item *curr;
    5996         799 :         char *ins_data;
    5997         799 :         struct btrfs_key *ins_keys;
    5998         799 :         u32 *ins_sizes;
    5999         799 :         u64 curr_batch_size = 0;
    6000         799 :         int batch_idx = 0;
    6001         799 :         int ret;
    6002             : 
    6003             :         /* We are adding dir index items to the log tree. */
    6004         799 :         lockdep_assert_held(&inode->log_mutex);
    6005             : 
    6006             :         /*
    6007             :          * We collect delayed items before copying index keys from the subvolume
    6008             :          * to the log tree. However just after we collected them, they may have
    6009             :          * been flushed (all of them or just some of them), and therefore we
    6010             :          * could have copied them from the subvolume tree to the log tree.
    6011             :          * So find the first delayed item that was not yet logged (they are
    6012             :          * sorted by index number).
    6013             :          */
    6014         801 :         list_for_each_entry(curr, delayed_ins_list, log_list) {
    6015         468 :                 if (curr->index > inode->last_dir_index_offset) {
    6016             :                         first = curr;
    6017             :                         break;
    6018             :                 }
    6019             :         }
    6020             : 
    6021             :         /* Empty list or all delayed items were already logged. */
    6022         799 :         if (!first)
    6023             :                 return 0;
    6024             : 
    6025         466 :         ins_data = kmalloc(max_batch_size * sizeof(u32) +
    6026             :                            max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
    6027         466 :         if (!ins_data)
    6028             :                 return -ENOMEM;
    6029         466 :         ins_sizes = (u32 *)ins_data;
    6030         466 :         batch.data_sizes = ins_sizes;
    6031         466 :         ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
    6032         466 :         batch.keys = ins_keys;
    6033             : 
    6034         466 :         curr = first;
    6035        1594 :         while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
    6036        1128 :                 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
    6037             : 
    6038        1128 :                 if (curr_batch_size + curr_size > leaf_data_size ||
    6039        1128 :                     batch.nr == max_batch_size) {
    6040           0 :                         ret = insert_delayed_items_batch(trans, log, path,
    6041             :                                                          &batch, first);
    6042           0 :                         if (ret)
    6043           0 :                                 goto out;
    6044           0 :                         batch_idx = 0;
    6045           0 :                         batch.nr = 0;
    6046           0 :                         batch.total_data_size = 0;
    6047           0 :                         curr_batch_size = 0;
    6048           0 :                         first = curr;
    6049             :                 }
    6050             : 
    6051        1128 :                 ins_sizes[batch_idx] = curr->data_len;
    6052        1128 :                 ins_keys[batch_idx].objectid = ino;
    6053        1128 :                 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
    6054        1128 :                 ins_keys[batch_idx].offset = curr->index;
    6055        1128 :                 curr_batch_size += curr_size;
    6056        1128 :                 batch.total_data_size += curr->data_len;
    6057        1128 :                 batch.nr++;
    6058        1128 :                 batch_idx++;
    6059        1128 :                 curr = list_next_entry(curr, log_list);
    6060             :         }
    6061             : 
    6062         466 :         ASSERT(batch.nr >= 1);
    6063         466 :         ret = insert_delayed_items_batch(trans, log, path, &batch, first);
    6064             : 
    6065         466 :         curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
    6066             :                                log_list);
    6067         466 :         inode->last_dir_index_offset = curr->index;
    6068         466 : out:
    6069         466 :         kfree(ins_data);
    6070             : 
    6071         466 :         return ret;
    6072             : }
    6073             : 
    6074         192 : static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
    6075             :                                       struct btrfs_inode *inode,
    6076             :                                       struct btrfs_path *path,
    6077             :                                       const struct list_head *delayed_del_list,
    6078             :                                       struct btrfs_log_ctx *ctx)
    6079             : {
    6080         192 :         const u64 ino = btrfs_ino(inode);
    6081         192 :         const struct btrfs_delayed_item *curr;
    6082             : 
    6083         192 :         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
    6084             :                                 log_list);
    6085             : 
    6086         461 :         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
    6087         269 :                 u64 first_dir_index = curr->index;
    6088         269 :                 u64 last_dir_index;
    6089         269 :                 const struct btrfs_delayed_item *next;
    6090         269 :                 int ret;
    6091             : 
    6092             :                 /*
    6093             :                  * Find a range of consecutive dir index items to delete. Like
    6094             :                  * this we log a single dir range item spanning several contiguous
    6095             :                  * dir items instead of logging one range item per dir index item.
    6096             :                  */
    6097         269 :                 next = list_next_entry(curr, log_list);
    6098         299 :                 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
    6099         107 :                         if (next->index != curr->index + 1)
    6100             :                                 break;
    6101          30 :                         curr = next;
    6102          30 :                         next = list_next_entry(next, log_list);
    6103             :                 }
    6104             : 
    6105         269 :                 last_dir_index = curr->index;
    6106         269 :                 ASSERT(last_dir_index >= first_dir_index);
    6107             : 
    6108         269 :                 ret = insert_dir_log_key(trans, inode->root->log_root, path,
    6109             :                                          ino, first_dir_index, last_dir_index);
    6110         269 :                 if (ret)
    6111           0 :                         return ret;
    6112         269 :                 curr = list_next_entry(curr, log_list);
    6113             :         }
    6114             : 
    6115             :         return 0;
    6116             : }
    6117             : 
    6118           0 : static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
    6119             :                                         struct btrfs_inode *inode,
    6120             :                                         struct btrfs_path *path,
    6121             :                                         struct btrfs_log_ctx *ctx,
    6122             :                                         const struct list_head *delayed_del_list,
    6123             :                                         const struct btrfs_delayed_item *first,
    6124             :                                         const struct btrfs_delayed_item **last_ret)
    6125             : {
    6126           0 :         const struct btrfs_delayed_item *next;
    6127           0 :         struct extent_buffer *leaf = path->nodes[0];
    6128           0 :         const int last_slot = btrfs_header_nritems(leaf) - 1;
    6129           0 :         int slot = path->slots[0] + 1;
    6130           0 :         const u64 ino = btrfs_ino(inode);
    6131             : 
    6132           0 :         next = list_next_entry(first, log_list);
    6133             : 
    6134           0 :         while (slot < last_slot &&
    6135           0 :                !list_entry_is_head(next, delayed_del_list, log_list)) {
    6136           0 :                 struct btrfs_key key;
    6137             : 
    6138           0 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    6139           0 :                 if (key.objectid != ino ||
    6140           0 :                     key.type != BTRFS_DIR_INDEX_KEY ||
    6141           0 :                     key.offset != next->index)
    6142             :                         break;
    6143             : 
    6144           0 :                 slot++;
    6145           0 :                 *last_ret = next;
    6146           0 :                 next = list_next_entry(next, log_list);
    6147             :         }
    6148             : 
    6149           0 :         return btrfs_del_items(trans, inode->root->log_root, path,
    6150           0 :                                path->slots[0], slot - path->slots[0]);
    6151             : }
    6152             : 
    6153           5 : static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
    6154             :                                              struct btrfs_inode *inode,
    6155             :                                              struct btrfs_path *path,
    6156             :                                              const struct list_head *delayed_del_list,
    6157             :                                              struct btrfs_log_ctx *ctx)
    6158             : {
                               /*
                                * Incremental variant: walk the delayed dir index deletions in
                                * list order. For each one, search the log tree for its dir
                                * index key. If the search returns 0, the item (and any directly
                                * following items that match the next list entries) is removed
                                * through batch_delete_dir_index_items(). Otherwise the deleted
                                * index is recorded with a dir log range item.
                                *
                                * Returns 0 once the whole list was processed, or the first
                                * failure reported by btrfs_search_slot() (negative values only),
                                * batch_delete_dir_index_items() or insert_dir_log_key().
                                */
    6159           5 :         struct btrfs_root *log = inode->root->log_root;
    6160           5 :         const struct btrfs_delayed_item *curr;
                               /*
                                * Bounds of the range item inserted by the previous iteration.
                                * An end of 0 doubles as "no range inserted yet".
                                */
    6161           5 :         u64 last_range_start = 0;
    6162           5 :         u64 last_range_end = 0;
    6163           5 :         struct btrfs_key key;
    6164             : 
    6165           5 :         key.objectid = btrfs_ino(inode);
    6166           5 :         key.type = BTRFS_DIR_INDEX_KEY;
    6167           5 :         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
    6168             :                                 log_list);
    6169             : 
    6170          10 :         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
                                       /*
                                        * Last list entry covered by this iteration. It is moved
                                        * forward by batch_delete_dir_index_items() when that
                                        * helper deletes more than one item.
                                        */
    6171           5 :                 const struct btrfs_delayed_item *last = curr;
    6172           5 :                 u64 first_dir_index = curr->index;
    6173           5 :                 u64 last_dir_index;
    6174           5 :                 bool deleted_items = false;
    6175           5 :                 int ret;
    6176             : 
    6177           5 :                 key.offset = curr->index;
    6178           5 :                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
    6179           5 :                 if (ret < 0) {
    6180           0 :                         return ret;
    6181           5 :                 } else if (ret == 0) {
    6182           0 :                         ret = batch_delete_dir_index_items(trans, inode, path, ctx,
    6183             :                                                            delayed_del_list, curr,
    6184             :                                                            &last);
    6185           0 :                         if (ret)
    6186           0 :                                 return ret;
    6187             :                         deleted_items = true;
    6188             :                 }
    6189             : 
    6190           5 :                 btrfs_release_path(path);
    6191             : 
    6192             :                 /*
    6193             :                  * If we deleted items from the leaf, it means we have a range
    6194             :                  * item logging their range, so no need to add one or update an
    6195             :                  * existing one. Otherwise we have to log a dir range item.
    6196             :                  */
    6197           5 :                 if (deleted_items)
    6198           0 :                         goto next_batch;
    6199             : 
    6200           5 :                 last_dir_index = last->index;
    6201           5 :                 ASSERT(last_dir_index >= first_dir_index);
    6202             :                 /*
    6203             :                  * If this range starts right after where the previous one ends,
    6204             :                  * then we want to reuse the previous range item and change its
    6205             :                  * end offset to the end of this range. This is just to minimize
    6206             :                  * leaf space usage, by avoiding adding a new range item.
    6207             :                  */
    6208           5 :                 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
    6209           0 :                         first_dir_index = last_range_start;
    6210             : 
    6211           5 :                 ret = insert_dir_log_key(trans, log, path, key.objectid,
    6212             :                                          first_dir_index, last_dir_index);
    6213           5 :                 if (ret)
    6214           0 :                         return ret;
    6215             : 
    6216             :                 last_range_start = first_dir_index;
    6217             :                 last_range_end = last_dir_index;
    6218           5 : next_batch:
                                       /* Resume right after the last entry handled above. */
    6219           5 :                 curr = list_next_entry(last, log_list);
    6220             :         }
    6221             : 
    6222             :         return 0;
    6223             : }
    6224             : 
    6225         799 : static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
    6226             :                                       struct btrfs_inode *inode,
    6227             :                                       struct btrfs_path *path,
    6228             :                                       const struct list_head *delayed_del_list,
    6229             :                                       struct btrfs_log_ctx *ctx)
    6230             : {
    6231             :         /*
                                * Log the delayed dir index deletions collected in
                                * delayed_del_list. An empty list is a no-op that returns 0.
                                * Otherwise the incremental helper is used when
                                * ctx->logged_before is set and the full helper when it is not,
                                * and the helper's return value is passed through.
                                *
    6232             :          * We are deleting dir index items from the log tree or adding range
    6233             :          * items to it.
    6234             :          */
    6235         799 :         lockdep_assert_held(&inode->log_mutex);
    6236             : 
    6237         799 :         if (list_empty(delayed_del_list))
    6238             :                 return 0;
    6239             : 
    6240         197 :         if (ctx->logged_before)
    6241           5 :                 return log_delayed_deletions_incremental(trans, inode, path,
    6242             :                                                          delayed_del_list, ctx);
    6243             : 
    6244         192 :         return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
    6245             :                                           ctx);
    6246             : }
    6247             : 
    6248             : /*
    6249             :  * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
    6250             :  * items instead of the subvolume tree.
                        *
                        * For every delayed item in @delayed_ins_list, the inode its dir item
                        * points to is looked up with btrfs_iget() and, if need_log_inode() says
                        * so, logged through btrfs_log_inode(): with LOG_INODE_ALL when the dir
                        * item's file type is BTRFS_FT_DIR and with LOG_INODE_EXISTS otherwise.
                        * Items whose location key has type BTRFS_ROOT_ITEM_KEY are skipped.
                        *
                        * ctx->logging_new_delayed_dentries is set for the duration of the call
                        * and ctx->log_new_dentries is restored to the value it had on entry
                        * before returning.
                        *
                        * Returns 0 on success or the first error hit while iterating; iteration
                        * stops at that item.
    6251             :  */
    6252         704 : static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
    6253             :                                     struct btrfs_inode *inode,
    6254             :                                     const struct list_head *delayed_ins_list,
    6255             :                                     struct btrfs_log_ctx *ctx)
    6256             : {
    6257         704 :         const bool orig_log_new_dentries = ctx->log_new_dentries;
    6258         704 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    6259         704 :         struct btrfs_delayed_item *item;
    6260         704 :         int ret = 0;
    6261             : 
    6262             :         /*
    6263             :          * No need for the log mutex, plus to avoid potential deadlocks or
    6264             :          * lockdep annotations due to nesting of delayed inode mutexes and log
    6265             :          * mutexes.
    6266             :          */
    6267         704 :         lockdep_assert_not_held(&inode->log_mutex);
    6268             : 
                               /* Must not be entered recursively; btrfs_log_inode() checks this flag. */
    6269         704 :         ASSERT(!ctx->logging_new_delayed_dentries);
    6270         704 :         ctx->logging_new_delayed_dentries = true;
    6271             : 
    6272        1814 :         list_for_each_entry(item, delayed_ins_list, log_list) {
    6273        1120 :                 struct btrfs_dir_item *dir_item;
    6274        1120 :                 struct inode *di_inode;
    6275        1120 :                 struct btrfs_key key;
    6276        1120 :                 int log_mode = LOG_INODE_EXISTS;
    6277             : 
    6278        1120 :                 dir_item = (struct btrfs_dir_item *)item->data;
    6279        1120 :                 btrfs_disk_key_to_cpu(&key, &dir_item->location);
    6280             : 
    6281        1120 :                 if (key.type == BTRFS_ROOT_ITEM_KEY)
    6282         198 :                         continue;
    6283             : 
    6284        1120 :                 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
    6285        1120 :                 if (IS_ERR(di_inode)) {
    6286           0 :                         ret = PTR_ERR(di_inode);
    6287          10 :                         break;
    6288             :                 }
    6289             : 
    6290        1120 :                 if (!need_log_inode(trans, BTRFS_I(di_inode))) {
    6291         198 :                         btrfs_add_delayed_iput(BTRFS_I(di_inode));
    6292         198 :                         continue;
    6293             :                 }
    6294             : 
    6295         922 :                 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
    6296          92 :                         log_mode = LOG_INODE_ALL;
    6297             : 
                                       /*
                                        * Clear the flag first so that, after logging, a set flag
                                        * can only come from logging this particular inode.
                                        */
    6298         922 :                 ctx->log_new_dentries = false;
    6299         922 :                 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
    6300             : 
    6301         922 :                 if (!ret && ctx->log_new_dentries)
    6302          38 :                         ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
    6303             : 
    6304         922 :                 btrfs_add_delayed_iput(BTRFS_I(di_inode));
    6305             : 
    6306         922 :                 if (ret)
    6307             :                         break;
    6308             :         }
    6309             : 
    6310         704 :         ctx->log_new_dentries = orig_log_new_dentries;
    6311         704 :         ctx->logging_new_delayed_dentries = false;
    6312             : 
    6313         704 :         return ret;
    6314             : }
    6315             : 
    6316             : /* log a single inode in the tree log.
    6317             :  * At least one parent directory for this inode must exist in the tree
    6318             :  * or be logged already.
    6319             :  *
    6320             :  * Any items from this inode changed by the current transaction are copied
    6321             :  * to the log tree.  An extra reference is taken on any extents in this
    6322             :  * file, allowing us to avoid a whole pile of corner cases around logging
    6323             :  * blocks that have been removed from the tree.
    6324             :  *
    6325             :  * See LOG_INODE_ALL and related defines for a description of what inode_only
    6326             :  * does.
    6327             :  *
    6328             :  * This handles both files and directories.
                        *
                        * @trans:      transaction handle the logging is done under
                        * @inode:      the inode to log
                        * @inode_only: one of the LOG_INODE_* modes; forced to LOG_INODE_ALL for
                        *              symlinks
                        * @ctx:        log context; ->logged_before is set here and
                        *              ->logging_new_delayed_dentries is read to limit recursion
                        *
                        * Returns 0 on success, -ENOMEM if a path cannot be allocated,
                        * BTRFS_LOG_FORCE_COMMIT when a full directory log is requested for a
                        * directory whose last_unlink_trans is not older than the id of @trans, or
                        * whatever non-zero value one of the helpers called below returns.
    6329             :  */
    6330      255777 : static int btrfs_log_inode(struct btrfs_trans_handle *trans,
    6331             :                            struct btrfs_inode *inode,
    6332             :                            int inode_only,
    6333             :                            struct btrfs_log_ctx *ctx)
    6334             : {
    6335      255777 :         struct btrfs_path *path;
    6336      255777 :         struct btrfs_path *dst_path;
    6337      255777 :         struct btrfs_key min_key;
    6338      255777 :         struct btrfs_key max_key;
    6339      255777 :         struct btrfs_root *log = inode->root->log_root;
    6340      255777 :         int ret;
    6341      255777 :         bool fast_search = false;
    6342      255777 :         u64 ino = btrfs_ino(inode);
    6343      255777 :         struct extent_map_tree *em_tree = &inode->extent_tree;
    6344      255777 :         u64 logged_isize = 0;
    6345      255777 :         bool need_log_inode_item = true;
    6346      255777 :         bool xattrs_logged = false;
    6347      255777 :         bool inode_item_dropped = true;
    6348      255777 :         bool full_dir_logging = false;
    6349      255777 :         LIST_HEAD(delayed_ins_list);
    6350      255777 :         LIST_HEAD(delayed_del_list);
    6351             : 
    6352      255777 :         path = btrfs_alloc_path();
    6353      255768 :         if (!path)
    6354             :                 return -ENOMEM;
    6355      255768 :         dst_path = btrfs_alloc_path();
    6356      255771 :         if (!dst_path) {
    6357           0 :                 btrfs_free_path(path);
    6358           0 :                 return -ENOMEM;
    6359             :         }
    6360             : 
                               /*
                                * [min_key, max_key] is the key range handed to
                                * copy_inode_items_to_log() below; max_key.type may still be
                                * narrowed further down.
                                */
    6361      255771 :         min_key.objectid = ino;
    6362      255771 :         min_key.type = BTRFS_INODE_ITEM_KEY;
    6363      255771 :         min_key.offset = 0;
    6364             : 
    6365      255771 :         max_key.objectid = ino;
    6366             : 
    6367             : 
    6368             :         /* today the code can only do partial logging of directories */
    6369      255771 :         if (S_ISDIR(inode->vfs_inode.i_mode) ||
    6370      253696 :             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    6371      253696 :                        &inode->runtime_flags) &&
    6372             :              inode_only >= LOG_INODE_EXISTS))
    6373        2429 :                 max_key.type = BTRFS_XATTR_ITEM_KEY;
    6374             :         else
    6375      253342 :                 max_key.type = (u8)-1;
    6376      255771 :         max_key.offset = (u64)-1;
    6377             : 
    6378      255771 :         if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
    6379        1447 :                 full_dir_logging = true;
    6380             : 
    6381             :         /*
    6382             :          * If we are logging a directory while we are logging dentries of the
    6383             :          * delayed items of some other inode, then we need to flush the delayed
    6384             :          * items of this directory and not log the delayed items directly. This
    6385             :          * is to prevent more than one level of recursion into btrfs_log_inode()
    6386             :          * by having something like this:
    6387             :          *
    6388             :          *     $ mkdir -p a/b/c/d/e/f/g/h/...
    6389             :          *     $ xfs_io -c "fsync" a
    6390             :          *
    6391             :          * Where all directories in the path did not exist before and are
    6392             :          * created in the current transaction.
    6393             :          * So in such a case we directly log the delayed items of the main
    6394             :          * directory ("a") without flushing them first, while for each of its
    6395             :          * subdirectories we flush their delayed items before logging them.
    6396             :          * This prevents a potential unbounded recursion like this:
    6397             :          *
    6398             :          * btrfs_log_inode()
    6399             :          *   log_new_delayed_dentries()
    6400             :          *      btrfs_log_inode()
    6401             :          *        log_new_delayed_dentries()
    6402             :          *          btrfs_log_inode()
    6403             :          *            log_new_delayed_dentries()
    6404             :          *              (...)
    6405             :          *
    6406             :          * We have thresholds for the maximum number of delayed items to have in
    6407             :          * memory, and once they are hit, the items are flushed asynchronously.
    6408             :          * However the limit is quite high, so let's prevent deep levels of
    6409             :          * recursion to happen by limiting the maximum depth to be 1.
    6410             :          */
    6411        1447 :         if (full_dir_logging && ctx->logging_new_delayed_dentries) {
    6412         104 :                 ret = btrfs_commit_inode_delayed_items(trans, inode);
    6413         104 :                 if (ret)
    6414           0 :                         goto out;
    6415             :         }
    6416             : 
    6417      255771 :         mutex_lock(&inode->log_mutex);
    6418             : 
    6419             :         /*
    6420             :          * For symlinks, we must always log their content, which is stored in an
    6421             :          * inline extent, otherwise we could end up with an empty symlink after
    6422             :          * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
    6423             :          * one attempts to create an empty symlink).
    6424             :          * We don't need to worry about flushing delalloc, because we create
    6425             :          * the inline extent when the symlink is created (we never have delalloc
    6426             :          * for symlinks).
    6427             :          */
    6428      255780 :         if (S_ISLNK(inode->vfs_inode.i_mode))
    6429         297 :                 inode_only = LOG_INODE_ALL;
    6430             : 
    6431             :         /*
    6432             :          * Before logging the inode item, cache the value returned by
    6433             :          * inode_logged(), because after that we have the need to figure out if
    6434             :          * the inode was previously logged in this transaction.
    6435             :          */
    6436      255780 :         ret = inode_logged(trans, inode, path);
    6437      255768 :         if (ret < 0)
    6438           0 :                 goto out_unlock;
    6439      255768 :         ctx->logged_before = (ret == 1);
    6440      255768 :         ret = 0;
    6441             : 
    6442             :         /*
    6443             :          * This is for cases where logging a directory could result in losing a
    6444             :          * file after replaying the log. For example, if we move a file from a
    6445             :          * directory A to a directory B, then fsync directory A, we have no way
    6446             :          * to know the file was moved from A to B, so logging just A would
    6447             :          * result in losing the file after a log replay.
    6448             :          */
    6449      255768 :         if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
    6450         648 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    6451         648 :                 goto out_unlock;
    6452             :         }
    6453             : 
    6454             :         /*
    6455             :          * a brute force approach to making sure we get the most uptodate
    6456             :          * copies of everything.
    6457             :          */
    6458      255120 :         if (S_ISDIR(inode->vfs_inode.i_mode)) {
    6459        1427 :                 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
    6460        1427 :                 if (ctx->logged_before)
    6461         100 :                         ret = drop_inode_items(trans, log, path, inode,
    6462             :                                                BTRFS_XATTR_ITEM_KEY);
    6463             :         } else {
    6464      253693 :                 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
    6465             :                         /*
    6466             :                          * Make sure the new inode item we write to the log has
    6467             :                          * the same isize as the current one (if it exists).
    6468             :                          * This is necessary to prevent data loss after log
    6469             :                          * replay, and also to prevent doing a wrong expanding
    6470             :                          * truncate - for e.g. create file, write 4K into offset
    6471             :                          * 0, fsync, write 4K into offset 4096, add hard link,
    6472             :                          * fsync some other file (to sync log), power fail - if
    6473             :                          * we use the inode's current i_size, after log replay
    6474             :                          * we get a 8Kb file, with the last 4Kb extent as a hole
    6475             :                          * (zeroes), as if an expanding truncate happened,
    6476             :                          * instead of getting a file of 4Kb only.
    6477             :                          */
    6478         732 :                         ret = logged_inode_size(log, inode, path, &logged_isize);
    6479         732 :                         if (ret)
    6480           0 :                                 goto out_unlock;
    6481             :                 }
    6482      507386 :                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    6483             :                              &inode->runtime_flags)) {
    6484      117852 :                         if (inode_only == LOG_INODE_EXISTS) {
    6485        1537 :                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
    6486        1537 :                                 if (ctx->logged_before)
    6487         451 :                                         ret = drop_inode_items(trans, log, path,
    6488             :                                                                inode, max_key.type);
    6489             :                         } else {
    6490      116315 :                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    6491             :                                           &inode->runtime_flags);
    6492      116316 :                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
    6493             :                                           &inode->runtime_flags);
    6494      116316 :                                 if (ctx->logged_before)
    6495      109340 :                                         ret = truncate_inode_items(trans, log,
    6496             :                                                                    inode, 0, 0);
    6497             :                         }
    6498      135841 :                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
    6499      135663 :                                               &inode->runtime_flags) ||
    6500             :                            inode_only == LOG_INODE_EXISTS) {
    6501         426 :                         if (inode_only == LOG_INODE_ALL)
    6502         126 :                                 fast_search = true;
    6503         426 :                         max_key.type = BTRFS_XATTR_ITEM_KEY;
    6504         426 :                         if (ctx->logged_before)
    6505         302 :                                 ret = drop_inode_items(trans, log, path, inode,
    6506             :                                                        max_key.type);
    6507             :                 } else {
                                               /*
                                                * Neither full-sync nor copy-everything is set and
                                                * this is not an "exists" log: skip the item copy
                                                * below and jump straight to the inode item and
                                                * extent logging.
                                                */
    6508      135423 :                         if (inode_only == LOG_INODE_ALL)
    6509      135423 :                                 fast_search = true;
    6510      135423 :                         inode_item_dropped = false;
    6511      135423 :                         goto log_extents;
    6512             :                 }
    6513             : 
    6514             :         }
    6515      118379 :         if (ret)
    6516           0 :                 goto out_unlock;
    6517             : 
    6518             :         /*
    6519             :          * If we are logging a directory in full mode, collect the delayed items
    6520             :          * before iterating the subvolume tree, so that we don't miss any new
    6521             :          * dir index items in case they get flushed while or right after we are
    6522             :          * iterating the subvolume tree.
    6523             :          */
    6524      119706 :         if (full_dir_logging && !ctx->logging_new_delayed_dentries)
    6525         705 :                 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
    6526             :                                             &delayed_del_list);
    6527             : 
    6528      119706 :         ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
    6529             :                                       path, dst_path, logged_isize,
    6530             :                                       inode_only, ctx,
    6531             :                                       &need_log_inode_item);
    6532      119706 :         if (ret)
    6533           0 :                 goto out_unlock;
    6534             : 
    6535      119706 :         btrfs_release_path(path);
    6536      119706 :         btrfs_release_path(dst_path);
    6537      119706 :         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
    6538      119705 :         if (ret)
    6539           0 :                 goto out_unlock;
    6540      119705 :         xattrs_logged = true;
    6541      119705 :         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
    6542      116315 :                 btrfs_release_path(path);
    6543      116316 :                 btrfs_release_path(dst_path);
    6544      116316 :                 ret = btrfs_log_holes(trans, inode, path);
    6545      116314 :                 if (ret)
    6546           0 :                         goto out_unlock;
    6547             :         }
    6548      119704 : log_extents:
    6549      255127 :         btrfs_release_path(path);
    6550      255128 :         btrfs_release_path(dst_path);
    6551      255095 :         if (need_log_inode_item) {
    6552      135960 :                 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
    6553      135952 :                 if (ret)
    6554           0 :                         goto out_unlock;
    6555             :                 /*
    6556             :                  * If we are doing a fast fsync and the inode was logged before
    6557             :                  * in this transaction, we don't need to log the xattrs because
    6558             :                  * they were logged before. If xattrs were added, changed or
    6559             :                  * deleted since the last time we logged the inode, then we have
    6560             :                  * already logged them because the inode had the runtime flag
    6561             :                  * BTRFS_INODE_COPY_EVERYTHING set.
    6562             :                  */
    6563      135952 :                 if (!xattrs_logged && inode->logged_trans < trans->transid) {
    6564        1380 :                         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
    6565        1380 :                         if (ret)
    6566           0 :                                 goto out_unlock;
    6567        1380 :                         btrfs_release_path(path);
    6568             :                 }
    6569             :         }
    6570      255087 :         if (fast_search) {
    6571      135507 :                 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
    6572      135533 :                 if (ret)
    6573           0 :                         goto out_unlock;
    6574      119580 :         } else if (inode_only == LOG_INODE_ALL) {
    6575      117115 :                 struct extent_map *em, *n;
    6576             : 
                                       /*
                                        * Full log without fast search: unlink every extent map
                                        * from the modified list, under the tree's write lock.
                                        * NOTE(review): presumably because the extents were
                                        * already logged by the item copy above - confirm.
                                        */
    6577      117115 :                 write_lock(&em_tree->lock);
    6578     4310826 :                 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
    6579     4193712 :                         list_del_init(&em->list);
    6580      117114 :                 write_unlock(&em_tree->lock);
    6581             :         }
    6582             : 
    6583      255111 :         if (full_dir_logging) {
    6584         799 :                 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
    6585         799 :                 if (ret)
    6586           0 :                         goto out_unlock;
    6587         799 :                 ret = log_delayed_insertion_items(trans, inode, path,
    6588             :                                                   &delayed_ins_list, ctx);
    6589         799 :                 if (ret)
    6590           0 :                         goto out_unlock;
    6591         799 :                 ret = log_delayed_deletion_items(trans, inode, path,
    6592             :                                                  &delayed_del_list, ctx);
    6593         799 :                 if (ret)
    6594           0 :                         goto out_unlock;
    6595             :         }
    6596             : 
    6597      255111 :         spin_lock(&inode->lock);
    6598      255098 :         inode->logged_trans = trans->transid;
    6599             :         /*
    6600             :          * Don't update last_log_commit if we logged that an inode exists.
    6601             :          * We do this for three reasons:
    6602             :          *
    6603             :          * 1) We might have had buffered writes to this inode that were
    6604             :          *    flushed and had their ordered extents completed in this
    6605             :          *    transaction, but we did not previously log the inode with
    6606             :          *    LOG_INODE_ALL. Later the inode was evicted and after that
    6607             :          *    it was loaded again and this LOG_INODE_EXISTS log operation
    6608             :          *    happened. We must make sure that if an explicit fsync against
    6609             :          *    the inode is performed later, it logs the new extents, an
    6610             :          *    updated inode item, etc, and syncs the log. The same logic
    6611             :          *    applies to direct IO writes instead of buffered writes.
    6612             :          *
    6613             :          * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
    6614             :          *    is logged with an i_size of 0 or whatever value was logged
    6615             :          *    before. If later the i_size of the inode is increased by a
    6616             :          *    truncate operation, the log is synced through an fsync of
    6617             :          *    some other inode and then finally an explicit fsync against
    6618             :          *    this inode is made, we must make sure this fsync logs the
    6619             :          *    inode with the new i_size, the hole between old i_size and
    6620             :          *    the new i_size, and syncs the log.
    6621             :          *
    6622             :          * 3) If we are logging that an ancestor inode exists as part of
    6623             :          *    logging a new name from a link or rename operation, don't update
    6624             :          *    its last_log_commit - otherwise if an explicit fsync is made
    6625             :          *    against an ancestor, the fsync considers the inode in the log
    6626             :          *    and doesn't sync the log, resulting in the ancestor missing after
    6627             :          *    a power failure unless the log was synced as part of an fsync
    6628             :          *    against any other unrelated inode.
    6629             :          */
    6630      255098 :         if (inode_only != LOG_INODE_EXISTS)
    6631      252633 :                 inode->last_log_commit = inode->last_sub_trans;
    6632      255098 :         spin_unlock(&inode->lock);
    6633             : 
    6634             :         /*
    6635             :          * Reset the last_reflink_trans so that the next fsync does not need to
    6636             :          * go through the slower path when logging extents and their checksums.
    6637             :          */
    6638      255118 :         if (inode_only == LOG_INODE_ALL)
    6639      252653 :                 inode->last_reflink_trans = 0;
    6640             : 
    6641        2465 : out_unlock:
    6642      255766 :         mutex_unlock(&inode->log_mutex);
                               /*
                                * "out" is also jumped to directly, without the log mutex held,
                                * when flushing the delayed items failed near the top.
                                */
    6643      255762 : out:
    6644      255762 :         btrfs_free_path(path);
    6645      255763 :         btrfs_free_path(dst_path);
    6646             : 
    6647      255773 :         if (ret)
    6648         648 :                 free_conflicting_inodes(ctx);
    6649             :         else
    6650      255125 :                 ret = log_conflicting_inodes(trans, inode->root, ctx);
    6651             : 
                               /*
                                * This runs after the log mutex was dropped, which
                                * log_new_delayed_dentries() asserts. The put below is also
                                * reached when we bailed out before the delayed items were
                                * collected, in which case both lists are still empty.
                                */
    6652      255710 :         if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
    6653        1343 :                 if (!ret)
    6654         704 :                         ret = log_new_delayed_dentries(trans, inode,
    6655             :                                                        &delayed_ins_list, ctx);
    6656             : 
    6657        1343 :                 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
    6658             :                                             &delayed_del_list);
    6659             :         }
    6660             : 
    6661             :         return ret;
    6662             : }
    6663             : 
    /*
     * Log every directory that is referenced as a parent of @inode by an
     * INODE_REF or INODE_EXTREF item.
     *
     * The path is flagged with skip_locking and search_commit_root before the
     * search, and the walk starts at the first key (@ino, INODE_REF, 0).  For
     * each parent found, the directory inode is loaded with btrfs_iget() and,
     * if need_log_inode() says so, logged with LOG_INODE_ALL; when that sets
     * ctx->log_new_dentries the directory's new dentries are logged as well.
     *
     * @trans: transaction handle, passed through to the logging helpers
     * @inode: inode whose parent directories are to be logged
     * @ctx:   log context; its log_new_dentries flag is reset before each
     *         parent is logged
     *
     * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
     * non-zero value returned by the tree search, btrfs_iget() or the logging
     * helpers.
     */
    6664        1762 : static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
    6665             :                                  struct btrfs_inode *inode,
    6666             :                                  struct btrfs_log_ctx *ctx)
    6667             : {
    6668        1762 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    6669        1762 :         int ret;
    6670        1762 :         struct btrfs_path *path;
    6671        1762 :         struct btrfs_key key;
    6672        1762 :         struct btrfs_root *root = inode->root;
    6673        1762 :         const u64 ino = btrfs_ino(inode);
    6674             : 
    6675        1762 :         path = btrfs_alloc_path();
    6676        1762 :         if (!path)
    6677             :                 return -ENOMEM;
    6678        1762 :         path->skip_locking = 1;
    6679        1762 :         path->search_commit_root = 1;
    6680             : 
    6681        1762 :         key.objectid = ino;
    6682        1762 :         key.type = BTRFS_INODE_REF_KEY;
    6683        1762 :         key.offset = 0;
    6684        1762 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    6685        1762 :         if (ret < 0)
    6686           0 :                 goto out;
    6687             : 
    6688        6896 :         while (true) {
    6689        6896 :                 struct extent_buffer *leaf = path->nodes[0];
    6690        6896 :                 int slot = path->slots[0];
    6691        6896 :                 u32 cur_offset = 0;
    6692        6896 :                 u32 item_size;
    6693        6896 :                 unsigned long ptr;
    6694             : 
    6695        6896 :                 if (slot >= btrfs_header_nritems(leaf)) {
    6696         995 :                         ret = btrfs_next_leaf(root, path);
    6697         995 :                         if (ret < 0)
    6698           0 :                                 goto out;
    6699         995 :                         else if (ret > 0)
    6700             :                                 break;
    6701          32 :                         continue;
    6702             :                 }
    6703             : 
    6704        5901 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    6705             :                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
    6706        5901 :                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
    6707             :                         break;
    6708             : 
    6709        5491 :                 item_size = btrfs_item_size(leaf, slot);
    6710        5491 :                 ptr = btrfs_item_ptr_offset(leaf, slot);
                                       /*
                                        * One pass per parent: an EXTREF item is walked entry
                                        * by entry, a plain REF item is consumed in one step.
                                        */
    6711       10593 :                 while (cur_offset < item_size) {
    6712        5491 :                         struct btrfs_key inode_key;
    6713        5491 :                         struct inode *dir_inode;
    6714             : 
    6715        5491 :                         inode_key.type = BTRFS_INODE_ITEM_KEY;
    6716        5491 :                         inode_key.offset = 0;
    6717             : 
    6718        5491 :                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
    6719        4588 :                                 struct btrfs_inode_extref *extref;
    6720             : 
    6721        4588 :                                 extref = (struct btrfs_inode_extref *)
    6722        4588 :                                         (ptr + cur_offset);
    6723        4588 :                                 inode_key.objectid = btrfs_inode_extref_parent(
    6724             :                                         leaf, extref);
    6725        4588 :                                 cur_offset += sizeof(*extref);
    6726        4588 :                                 cur_offset += btrfs_inode_extref_name_len(leaf,
    6727             :                                         extref);
    6728             :                         } else {
                                                       /*
                                                        * For an INODE_REF item the parent is taken
                                                        * from the key offset, so the item is
                                                        * handled once and then marked as consumed.
                                                        */
    6729         903 :                                 inode_key.objectid = key.offset;
    6730         903 :                                 cur_offset = item_size;
    6731             :                         }
    6732             : 
    6733        5491 :                         dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
    6734             :                                                root);
    6735             :                         /*
    6736             :                          * If the parent inode was deleted, return an error to
    6737             :                          * fallback to a transaction commit. This is to prevent
    6738             :                          * getting an inode that was moved from one parent A to
    6739             :                          * a parent B, got its former parent A deleted and then
    6740             :                          * it got fsync'ed, from existing at both parents after
    6741             :                          * a log replay (and the old parent still existing).
    6742             :                          * Example:
    6743             :                          *
    6744             :                          * mkdir /mnt/A
    6745             :                          * mkdir /mnt/B
    6746             :                          * touch /mnt/B/bar
    6747             :                          * sync
    6748             :                          * mv /mnt/B/bar /mnt/A/bar
    6749             :                          * mv -T /mnt/A /mnt/B
    6750             :                          * fsync /mnt/B/bar
    6751             :                          * <power fail>
    6752             :                          *
    6753             :                          * If we ignore the old parent B which got deleted,
    6754             :                          * after a log replay we would have file bar linked
    6755             :                          * at both parents and the old parent B would still
    6756             :                          * exist.
    6757             :                          */
    6758        5491 :                         if (IS_ERR(dir_inode)) {
    6759           1 :                                 ret = PTR_ERR(dir_inode);
    6760           1 :                                 goto out;
    6761             :                         }
    6762             : 
                                               /* Nothing to log for this parent: move on to the next entry. */
    6763        5490 :                         if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
    6764        4927 :                                 btrfs_add_delayed_iput(BTRFS_I(dir_inode));
    6765        4927 :                                 continue;
    6766             :                         }
    6767             : 
    6768         563 :                         ctx->log_new_dentries = false;
    6769         563 :                         ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
    6770             :                                               LOG_INODE_ALL, ctx);
    6771         563 :                         if (!ret && ctx->log_new_dentries)
    6772           9 :                                 ret = log_new_dir_dentries(trans,
    6773             :                                                    BTRFS_I(dir_inode), ctx);
    6774         563 :                         btrfs_add_delayed_iput(BTRFS_I(dir_inode));
    6775         563 :                         if (ret)
    6776         388 :                                 goto out;
    6777             :                 }
    6778        5102 :                 path->slots[0]++;
    6779             :         }
                               /*
                                * Reaching this point means the loop ended with a break; ret may
                                * still hold a positive value from btrfs_search_slot() or
                                * btrfs_next_leaf(), which is not an error.
                                */
    6780             :         ret = 0;
    6781        1762 : out:
    6782        1762 :         btrfs_free_path(path);
    6783        1762 :         return ret;
    6784             : }
    6785             : 
    /*
     * Walk up the chain of parent directories, starting from the key found at
     * the current slot of @path, and log each ancestor that needs it.
     *
     * On every iteration the offset of the current key is used as the inode
     * number of the next ancestor.  That inode is loaded with btrfs_iget() and,
     * when its generation is >= trans->transid and need_log_inode() agrees, it
     * is logged with LOG_INODE_EXISTS.  The walk ends once the ancestor just
     * handled is BTRFS_FIRST_FREE_OBJECTID; otherwise the ancestor's own
     * INODE_REF item is looked up to continue upwards.
     *
     * @trans: transaction handle
     * @root:  tree that is searched for the INODE_REF items
     * @path:  positioned by the caller on the starting key; it is released at
     *         the top of every iteration and may still reference a leaf when
     *         the function returns
     * @ctx:   log context passed to btrfs_log_inode()
     *
     * Returns 0 on success, -ENOENT if no INODE_REF item is found for an
     * ancestor, or the non-zero value returned by btrfs_iget(),
     * btrfs_log_inode(), btrfs_search_slot() or btrfs_next_leaf().
     */
    6786         588 : static int log_new_ancestors(struct btrfs_trans_handle *trans,
    6787             :                              struct btrfs_root *root,
    6788             :                              struct btrfs_path *path,
    6789             :                              struct btrfs_log_ctx *ctx)
    6790             : {
    6791         588 :         struct btrfs_key found_key;
    6792             : 
    6793         588 :         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
    6794             : 
    6795        2045 :         while (true) {
    6796        2633 :                 struct btrfs_fs_info *fs_info = root->fs_info;
                                       /*
                                        * NOTE(review): the initial values of leaf and slot are
                                        * never read; both are reassigned after the search below.
                                        */
    6797        2633 :                 struct extent_buffer *leaf = path->nodes[0];
    6798        2633 :                 int slot = path->slots[0];
    6799        2633 :                 struct btrfs_key search_key;
    6800        2633 :                 struct inode *inode;
    6801        2633 :                 u64 ino;
    6802        2633 :                 int ret = 0;
    6803             : 
    6804        2633 :                 btrfs_release_path(path);
    6805             : 
                                       /* The offset of the current key names the next ancestor. */
    6806        2633 :                 ino = found_key.offset;
    6807             : 
    6808        2633 :                 search_key.objectid = found_key.offset;
    6809        2633 :                 search_key.type = BTRFS_INODE_ITEM_KEY;
    6810        2633 :                 search_key.offset = 0;
    6811        2633 :                 inode = btrfs_iget(fs_info->sb, ino, root);
    6812        2633 :                 if (IS_ERR(inode))
    6813           2 :                         return PTR_ERR(inode);
    6814             : 
    6815        3199 :                 if (BTRFS_I(inode)->generation >= trans->transid &&
    6816         566 :                     need_log_inode(trans, BTRFS_I(inode)))
    6817          87 :                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
    6818             :                                               LOG_INODE_EXISTS, ctx);
    6819        2633 :                 btrfs_add_delayed_iput(BTRFS_I(inode));
    6820        2633 :                 if (ret)
    6821           2 :                         return ret;
    6822             : 
    6823        2631 :                 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
    6824             :                         break;
    6825             : 
                                       /*
                                        * Look for this ancestor's own INODE_REF item, searching
                                        * from offset 0, to find the next ancestor up.
                                        */
    6826        2045 :                 search_key.type = BTRFS_INODE_REF_KEY;
    6827        2045 :                 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
    6828        2045 :                 if (ret < 0)
    6829           0 :                         return ret;
    6830             : 
    6831        2045 :                 leaf = path->nodes[0];
    6832        2045 :                 slot = path->slots[0];
    6833        2045 :                 if (slot >= btrfs_header_nritems(leaf)) {
    6834          15 :                         ret = btrfs_next_leaf(root, path);
    6835          15 :                         if (ret < 0)
    6836           0 :                                 return ret;
    6837          15 :                         else if (ret > 0)
    6838             :                                 return -ENOENT;
    6839          15 :                         leaf = path->nodes[0];
    6840          15 :                         slot = path->slots[0];
    6841             :                 }
    6842             : 
    6843        2045 :                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
    6844        2045 :                 if (found_key.objectid != search_key.objectid ||
    6845        2045 :                     found_key.type != BTRFS_INODE_REF_KEY)
    6846             :                         return -ENOENT;
    6847             :         }
    6848         586 :         return 0;
    6849             : }
    6850             : 
    /*
     * Log new ancestors by walking up the dentry hierarchy instead of searching
     * the tree.
     *
     * Starting at @parent, each ancestor inode whose generation is >=
     * trans->transid and for which need_log_inode() returns true is logged
     * with LOG_INODE_EXISTS.  The walk stops when @parent is NULL or negative,
     * belongs to a different super block, its inode lives in a different root
     * than @inode, logging fails, or the dentry is a root dentry (IS_ROOT()).
     *
     * @trans:  transaction handle
     * @inode:  inode whose ancestors are logged; the parameter is reused as
     *          the cursor for the ancestor currently being examined
     * @parent: dentry of the first ancestor to look at
     * @ctx:    log context passed to btrfs_log_inode()
     *
     * Returns 0 on success or the non-zero value returned by btrfs_log_inode().
     */
    6851       47877 : static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
    6852             :                                   struct btrfs_inode *inode,
    6853             :                                   struct dentry *parent,
    6854             :                                   struct btrfs_log_ctx *ctx)
    6855             : {
    6856       47877 :         struct btrfs_root *root = inode->root;
    6857       47877 :         struct dentry *old_parent = NULL;
    6858       47877 :         struct super_block *sb = inode->vfs_inode.i_sb;
    6859       47877 :         int ret = 0;
    6860             : 
    6861       73043 :         while (true) {
    6862       60460 :                 if (!parent || d_really_is_negative(parent) ||
    6863       60460 :                     sb != parent->d_sb)
    6864             :                         break;
    6865             : 
    6866       60460 :                 inode = BTRFS_I(d_inode(parent));
    6867       60460 :                 if (root != inode->root)
    6868             :                         break;
    6869             : 
    6870       65167 :                 if (inode->generation >= trans->transid &&
    6871        4785 :                     need_log_inode(trans, inode)) {
    6872         492 :                         ret = btrfs_log_inode(trans, inode,
    6873             :                                               LOG_INODE_EXISTS, ctx);
    6874         492 :                         if (ret)
    6875             :                                 break;
    6876             :                 }
    6877       60331 :                 if (IS_ROOT(parent))
    6878             :                         break;
    6879             : 
                                       /*
                                        * old_parent tracks only dentries obtained here through
                                        * dget_parent(); the dentry passed in by the caller is
                                        * never handed to dput() by this function.
                                        */
    6880       12583 :                 parent = dget_parent(parent);
    6881       12583 :                 dput(old_parent);
    6882       12583 :                 old_parent = parent;
    6883             :         }
    6884       47877 :         dput(old_parent);
    6885             : 
    6886       47877 :         return ret;
    6887             : }
    6888             : 
    /*
     * Log the new ancestors of @inode along every one of its hard links.
     *
     * With a link count below 2 the work is delegated to
     * log_new_ancestors_fast(), which follows @parent.  Otherwise the
     * INODE_REF items of the inode are iterated and log_new_ancestors() is run
     * for each of them.  Because that helper releases the path, the key of the
     * item being processed is saved in search_key and the search is restarted
     * from it afterwards.  An INODE_EXTREF item aborts the walk with -EMLINK.
     *
     * @trans:  transaction handle
     * @inode:  inode whose ancestors are to be logged
     * @parent: parent dentry, only used by the single-link fast path
     * @ctx:    log context
     *
     * Returns 0 on success, -ENOMEM if the path cannot be allocated, -EMLINK
     * when an extended reference is found, or the non-zero value returned by
     * the tree search or the helpers.
     */
    6889       48239 : static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
    6890             :                                  struct btrfs_inode *inode,
    6891             :                                  struct dentry *parent,
    6892             :                                  struct btrfs_log_ctx *ctx)
    6893             : {
    6894       48239 :         struct btrfs_root *root = inode->root;
    6895       48239 :         const u64 ino = btrfs_ino(inode);
    6896       48239 :         struct btrfs_path *path;
    6897       48239 :         struct btrfs_key search_key;
    6898       48239 :         int ret;
    6899             : 
    6900             :         /*
    6901             :          * For a single hard link case, go through a fast path that does not
    6902             :          * need to iterate the fs/subvolume tree.
    6903             :          */
    6904       48239 :         if (inode->vfs_inode.i_nlink < 2)
    6905       47877 :                 return log_new_ancestors_fast(trans, inode, parent, ctx);
    6906             : 
    6907         362 :         path = btrfs_alloc_path();
    6908         362 :         if (!path)
    6909             :                 return -ENOMEM;
    6910             : 
    6911         362 :         search_key.objectid = ino;
    6912         362 :         search_key.type = BTRFS_INODE_REF_KEY;
    6913         362 :         search_key.offset = 0;
    6914         948 : again:
    6915         948 :         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
    6916         948 :         if (ret < 0)
    6917           0 :                 goto out;
                               /*
                                * An exact match means the slot holds the key that was searched
                                * for.  After the first pass that is the ref already processed
                                * (search_key was copied from it), so step past it.
                                */
    6918         948 :         if (ret == 0)
    6919         586 :                 path->slots[0]++;
    6920             : 
    6921         955 :         while (true) {
    6922         955 :                 struct extent_buffer *leaf = path->nodes[0];
    6923         955 :                 int slot = path->slots[0];
    6924         955 :                 struct btrfs_key found_key;
    6925             : 
    6926         955 :                 if (slot >= btrfs_header_nritems(leaf)) {
    6927          26 :                         ret = btrfs_next_leaf(root, path);
    6928          26 :                         if (ret < 0)
    6929           4 :                                 goto out;
    6930          26 :                         else if (ret > 0)
    6931             :                                 break;
    6932           7 :                         continue;
    6933             :                 }
    6934             : 
    6935         929 :                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
    6936         929 :                 if (found_key.objectid != ino ||
    6937         850 :                     found_key.type > BTRFS_INODE_EXTREF_KEY)
    6938             :                         break;
    6939             : 
    6940             :                 /*
    6941             :                  * Don't deal with extended references because they are rare
    6942             :                  * cases and too complex to deal with (we would need to keep
    6943             :                  * track of which subitem we are processing for each item in
    6944             :                  * this loop, etc). So just return some error to fallback to
    6945             :                  * a transaction commit.
    6946             :                  */
    6947         590 :                 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
    6948           2 :                         ret = -EMLINK;
    6949           2 :                         goto out;
    6950             :                 }
    6951             : 
    6952             :                 /*
    6953             :                  * Logging ancestors needs to do more searches on the fs/subvol
    6954             :                  * tree, so it releases the path as needed to avoid deadlocks.
    6955             :                  * Keep track of the last inode ref key and resume from that key
    6956             :                  * after logging all new ancestors for the current hard link.
    6957             :                  */
    6958         588 :                 memcpy(&search_key, &found_key, sizeof(search_key));
    6959             : 
    6960         588 :                 ret = log_new_ancestors(trans, root, path, ctx);
    6961         588 :                 if (ret)
    6962           2 :                         goto out;
    6963         586 :                 btrfs_release_path(path);
    6964         586 :                 goto again;
    6965             :         }
                               /* The loop was left with a break, so no error occurred. */
    6966         358 :         ret = 0;
    6967         362 : out:
    6968         362 :         btrfs_free_path(path);
    6969         362 :         return ret;
    6970             : }
    6971             : 
    6972             : /*
    6973             :  * helper function around btrfs_log_inode to make sure newly created
    6974             :  * parent directories also end up in the log.  A minimal inode and backref
    6975             :  * only logging is done of any parent directories that are newer than
    6976             :  * the last committed transaction (generation >= trans->transid).
                        *
                        * @trans:      transaction handle
                        * @inode:      inode to log
                        * @parent:     parent dentry, handed to log_all_new_ancestors()
                        * @inode_only: logging mode passed to btrfs_log_inode()
                        * @ctx:        log context
                        *
                        * Return values, as far as this function determines them:
                        *  - BTRFS_LOG_FORCE_COMMIT if the NOTREELOG option is set, the root has
                        *    no references, or any step after start_log_trans() returned a
                        *    negative value (the log is then also flagged for a full commit);
                        *  - BTRFS_NO_LOG_SYNC if the inode is already in the log for this
                        *    transaction and ctx has no ordered extents, or its link count is 0;
                        *  - the non-zero result of start_log_trans(), returned unchanged;
                        *  - otherwise the (non-negative) result of the last logging step, 0 on
                        *    success.
    6977             :  */
    6978      253739 : static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
    6979             :                                   struct btrfs_inode *inode,
    6980             :                                   struct dentry *parent,
    6981             :                                   int inode_only,
    6982             :                                   struct btrfs_log_ctx *ctx)
    6983             : {
    6984      253739 :         struct btrfs_root *root = inode->root;
    6985      253739 :         struct btrfs_fs_info *fs_info = root->fs_info;
    6986      253739 :         int ret = 0;
    6987      253739 :         bool log_dentries = false;
    6988             : 
    6989      253739 :         if (btrfs_test_opt(fs_info, NOTREELOG)) {
    6990           0 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    6991           0 :                 goto end_no_trans;
    6992             :         }
    6993             : 
    6994      253739 :         if (btrfs_root_refs(&root->root_item) == 0) {
    6995           0 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    6996           0 :                 goto end_no_trans;
    6997             :         }
    6998             : 
    6999             :         /*
    7000             :          * Skip already logged inodes or inodes corresponding to tmpfiles
    7001             :          * (since logging them is pointless, a link count of 0 means they
    7002             :          * will never be accessible).
    7003             :          */
    7004      253739 :         if ((btrfs_inode_in_log(inode, trans->transid) &&
    7005          99 :              list_empty(&ctx->ordered_extents)) ||
    7006      253745 :             inode->vfs_inode.i_nlink == 0) {
    7007           1 :                 ret = BTRFS_NO_LOG_SYNC;
    7008           1 :                 goto end_no_trans;
    7009             :         }
    7010             : 
    7011      253744 :         ret = start_log_trans(trans, root, ctx);
    7012      253745 :         if (ret)
    7013         260 :                 goto end_no_trans;
    7014             : 
    7015      253485 :         ret = btrfs_log_inode(trans, inode, inode_only, ctx);
    7016      253404 :         if (ret)
    7017         206 :                 goto end_trans;
    7018             : 
    7019             :         /*
    7020             :          * for regular files, if its inode is already on disk, we don't
    7021             :          * have to worry about the parents at all.  This is because
    7022             :          * we can use the last_unlink_trans field to record renames
    7023             :          * and other fun in this file.
    7024             :          */
    7025      253198 :         if (S_ISREG(inode->vfs_inode.i_mode) &&
    7026      252149 :             inode->generation < trans->transid &&
    7027      205289 :             inode->last_unlink_trans < trans->transid) {
    7028      204570 :                 ret = 0;
    7029      204570 :                 goto end_trans;
    7030             :         }
    7031             : 
                               /*
                                * Remember the flag now: the calls below also receive ctx and
                                * btrfs_log_all_parents() resets ctx->log_new_dentries.
                                */
    7032       48628 :         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
    7033          35 :                 log_dentries = true;
    7034             : 
    7035             :         /*
    7036             :          * On unlink we must make sure all our current and old parent directory
    7037             :          * inodes are fully logged. This is to prevent leaving dangling
    7038             :          * directory index entries in directories that were our parents but are
    7039             :          * not anymore. Not doing this results in old parent directory being
    7040             :          * impossible to delete after log replay (rmdir will always fail with
    7041             :          * error -ENOTEMPTY).
    7042             :          *
    7043             :          * Example 1:
    7044             :          *
    7045             :          * mkdir testdir
    7046             :          * touch testdir/foo
    7047             :          * ln testdir/foo testdir/bar
    7048             :          * sync
    7049             :          * unlink testdir/bar
    7050             :          * xfs_io -c fsync testdir/foo
    7051             :          * <power failure>
    7052             :          * mount fs, triggers log replay
    7053             :          *
    7054             :          * If we don't log the parent directory (testdir), after log replay the
    7055             :          * directory still has an entry pointing to the file inode using the bar
    7056             :          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
    7057             :          * the file inode has a link count of 1.
    7058             :          *
    7059             :          * Example 2:
    7060             :          *
    7061             :          * mkdir testdir
    7062             :          * touch foo
    7063             :          * ln foo testdir/foo2
    7064             :          * ln foo testdir/foo3
    7065             :          * sync
    7066             :          * unlink testdir/foo3
    7067             :          * xfs_io -c fsync foo
    7068             :          * <power failure>
    7069             :          * mount fs, triggers log replay
    7070             :          *
    7071             :          * Similar as the first example, after log replay the parent directory
    7072             :          * testdir still has an entry pointing to the inode file with name foo3
    7073             :          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
    7074             :          * and has a link count of 2.
    7075             :          */
    7076       48628 :         if (inode->last_unlink_trans >= trans->transid) {
    7077        1762 :                 ret = btrfs_log_all_parents(trans, inode, ctx);
    7078        1762 :                 if (ret)
    7079         389 :                         goto end_trans;
    7080             :         }
    7081             : 
    7082       48239 :         ret = log_all_new_ancestors(trans, inode, parent, ctx);
    7083       48239 :         if (ret)
    7084          55 :                 goto end_trans;
    7085             : 
    7086       48184 :         if (log_dentries)
    7087          35 :                 ret = log_new_dir_dentries(trans, inode, ctx);
    7088             :         else
    7089             :                 ret = 0;
    7090         685 : end_trans:
                               /*
                                * Any negative result is turned into a request for a full
                                * transaction commit; positive values are passed through.
                                */
    7091      205255 :         if (ret < 0) {
    7092         651 :                 btrfs_set_log_full_commit(trans);
    7093         651 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    7094             :         }
    7095             : 
    7096      253404 :         if (ret)
    7097         651 :                 btrfs_remove_log_ctx(root, ctx);
    7098      253404 :         btrfs_end_log_trans(root);
    7099      253710 : end_no_trans:
    7100      253710 :         return ret;
    7101             : }
    7102             : 
    7103             : /*
    7104             :  * it is not safe to log dentry if the chunk root has added new
    7105             :  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
    7106             :  * If this returns 1, you must commit the transaction to safely get your
    7107             :  * data on disk.
                        *
                        * Logs the inode behind @dentry with LOG_INODE_ALL through
                        * btrfs_log_inode_parent(), holding a reference on the parent dentry
                        * (dget_parent()/dput()) for the duration of the call.
                        *
                        * NOTE(review): the value returned is whatever btrfs_log_inode_parent()
                        * returns, which includes BTRFS_NO_LOG_SYNC and BTRFS_LOG_FORCE_COMMIT;
                        * the "1" mentioned above presumably corresponds to one of those
                        * constants - confirm against their definitions.
    7108             :  */
    7109      252454 : int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
    7110             :                           struct dentry *dentry,
    7111             :                           struct btrfs_log_ctx *ctx)
    7112             : {
    7113      252454 :         struct dentry *parent = dget_parent(dentry);
    7114      252453 :         int ret;
    7115             : 
    7116      252453 :         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
    7117             :                                      LOG_INODE_ALL, ctx);
    7118      252428 :         dput(parent);
    7119             : 
    7120      252419 :         return ret;
    7121             : }
    7122             : 
    7123             : /*
    7124             :  * should be called during mount to recover and replay any log trees
    7125             :  * from the FS
    7126             :  */
    7127         283 : int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
    7128             : {
    7129         283 :         int ret;
    7130         283 :         struct btrfs_path *path;
    7131         283 :         struct btrfs_trans_handle *trans;
    7132         283 :         struct btrfs_key key;
    7133         283 :         struct btrfs_key found_key;
    7134         283 :         struct btrfs_root *log;
    7135         283 :         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
    7136         283 :         struct walk_control wc = {
    7137             :                 .process_func = process_one_buffer,
    7138             :                 .stage = LOG_WALK_PIN_ONLY,
    7139             :         };
    7140             : 
    7141         283 :         path = btrfs_alloc_path();
    7142         283 :         if (!path)
    7143             :                 return -ENOMEM;
    7144             : 
    7145         283 :         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
    7146             : 
    7147         283 :         trans = btrfs_start_transaction(fs_info->tree_root, 0);
    7148         283 :         if (IS_ERR(trans)) {
    7149           0 :                 ret = PTR_ERR(trans);
    7150           0 :                 goto error;
    7151             :         }
    7152             : 
    7153         283 :         wc.trans = trans;
    7154         283 :         wc.pin = 1;
    7155             : 
    7156         283 :         ret = walk_log_tree(trans, log_root_tree, &wc);
    7157         283 :         if (ret) {
    7158           0 :                 btrfs_abort_transaction(trans, ret);
    7159           0 :                 goto error;
    7160             :         }
    7161             : 
    7162         283 : again:
    7163        1132 :         key.objectid = BTRFS_TREE_LOG_OBJECTID;
    7164        1132 :         key.offset = (u64)-1;
    7165        1132 :         key.type = BTRFS_ROOT_ITEM_KEY;
    7166             : 
    7167       36716 :         while (1) {
    7168       18924 :                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
    7169             : 
    7170       18924 :                 if (ret < 0) {
    7171           0 :                         btrfs_abort_transaction(trans, ret);
    7172           0 :                         goto error;
    7173             :                 }
    7174       18924 :                 if (ret > 0) {
    7175        2264 :                         if (path->slots[0] == 0)
    7176             :                                 break;
    7177        1132 :                         path->slots[0]--;
    7178             :                 }
    7179       17792 :                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
    7180             :                                       path->slots[0]);
    7181       17792 :                 btrfs_release_path(path);
    7182       17792 :                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
    7183             :                         break;
    7184             : 
    7185       17792 :                 log = btrfs_read_tree_root(log_root_tree, &found_key);
    7186       17792 :                 if (IS_ERR(log)) {
    7187           0 :                         ret = PTR_ERR(log);
    7188           0 :                         btrfs_abort_transaction(trans, ret);
    7189           0 :                         goto error;
    7190             :                 }
    7191             : 
    7192       17792 :                 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
    7193             :                                                    true);
    7194       17792 :                 if (IS_ERR(wc.replay_dest)) {
    7195           0 :                         ret = PTR_ERR(wc.replay_dest);
    7196             : 
    7197             :                         /*
    7198             :                          * We didn't find the subvol, likely because it was
    7199             :                          * deleted.  This is ok, simply skip this log and go to
    7200             :                          * the next one.
    7201             :                          *
    7202             :                          * We need to exclude the root because we can't have
    7203             :                          * other log replays overwriting this log as we'll read
    7204             :                          * it back in a few more times.  This will keep our
    7205             :                          * block from being modified, and we'll just bail for
    7206             :                          * each subsequent pass.
    7207             :                          */
    7208           0 :                         if (ret == -ENOENT)
    7209           0 :                                 ret = btrfs_pin_extent_for_log_replay(trans,
    7210             :                                                         log->node->start,
    7211           0 :                                                         log->node->len);
    7212           0 :                         btrfs_put_root(log);
    7213             : 
    7214           0 :                         if (!ret)
    7215           0 :                                 goto next;
    7216           0 :                         btrfs_abort_transaction(trans, ret);
    7217           0 :                         goto error;
    7218             :                 }
    7219             : 
    7220       17792 :                 wc.replay_dest->log_root = log;
    7221       17792 :                 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
    7222       17792 :                 if (ret)
    7223             :                         /* The loop needs to continue due to the root refs */
    7224           0 :                         btrfs_abort_transaction(trans, ret);
    7225             :                 else
    7226       17792 :                         ret = walk_log_tree(trans, log, &wc);
    7227             : 
    7228       17792 :                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
    7229        4448 :                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
    7230             :                                                       path);
    7231        4448 :                         if (ret)
    7232           0 :                                 btrfs_abort_transaction(trans, ret);
    7233             :                 }
    7234             : 
    7235       17792 :                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
    7236        4448 :                         struct btrfs_root *root = wc.replay_dest;
    7237             : 
    7238        4448 :                         btrfs_release_path(path);
    7239             : 
    7240             :                         /*
    7241             :                          * We have just replayed everything, and the highest
    7242             :                          * objectid of fs roots probably has changed in case
    7243             :                          * some inode_item's got replayed.
    7244             :                          *
    7245             :                          * root->objectid_mutex is not acquired as log replay
    7246             :                          * could only happen during mount.
    7247             :                          */
    7248        4448 :                         ret = btrfs_init_root_free_objectid(root);
    7249        4448 :                         if (ret)
    7250           0 :                                 btrfs_abort_transaction(trans, ret);
    7251             :                 }
    7252             : 
    7253       17792 :                 wc.replay_dest->log_root = NULL;
    7254       17792 :                 btrfs_put_root(wc.replay_dest);
    7255       17792 :                 btrfs_put_root(log);
    7256             : 
    7257       17792 :                 if (ret)
    7258           0 :                         goto error;
    7259       17792 : next:
    7260       17792 :                 if (found_key.offset == 0)
    7261             :                         break;
    7262       17792 :                 key.offset = found_key.offset - 1;
    7263             :         }
    7264        1132 :         btrfs_release_path(path);
    7265             : 
    7266             :         /* step one is to pin it all, step two is to replay just inodes */
    7267        1132 :         if (wc.pin) {
    7268         283 :                 wc.pin = 0;
    7269         283 :                 wc.process_func = replay_one_buffer;
    7270         283 :                 wc.stage = LOG_WALK_REPLAY_INODES;
    7271         283 :                 goto again;
    7272             :         }
    7273             :         /* step three is to replay everything */
    7274         849 :         if (wc.stage < LOG_WALK_REPLAY_ALL) {
    7275         566 :                 wc.stage++;
    7276         566 :                 goto again;
    7277             :         }
    7278             : 
    7279         283 :         btrfs_free_path(path);
    7280             : 
    7281             :         /* step 4: commit the transaction, which also unpins the blocks */
    7282         283 :         ret = btrfs_commit_transaction(trans);
    7283         283 :         if (ret)
    7284             :                 return ret;
    7285             : 
    7286         283 :         log_root_tree->log_root = NULL;
    7287         283 :         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
    7288         283 :         btrfs_put_root(log_root_tree);
    7289             : 
    7290         283 :         return 0;
    7291           0 : error:
    7292           0 :         if (wc.trans)
    7293           0 :                 btrfs_end_transaction(wc.trans);
    7294           0 :         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
    7295           0 :         btrfs_free_path(path);
    7296           0 :         return ret;
    7297             : }
    7298             : 
    7299             : /*
    7300             :  * There are some corner cases where we want to force a full
    7301             :  * commit instead of allowing a directory to be logged.
    7302             :  *
    7303             :  * They revolve around files that were unlinked from the directory, and
    7304             :  * this function updates the parent directory so that a full commit is
    7305             :  * properly done if it is fsync'd later after the unlinks are done.
    7306             :  *
    7307             :  * Must be called before the unlink operations (updates to the subvolume tree,
    7308             :  * inodes, etc) are done.
    7309             :  */
    7310     1497800 : void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
    7311             :                              struct btrfs_inode *dir, struct btrfs_inode *inode,
    7312             :                              bool for_rename)
    7313             : {
    7314             :         /*
    7315             :          * When we're logging a file, if it hasn't been renamed
    7316             :          * or unlinked, and its inode is fully committed on disk,
    7317             :          * we don't have to worry about walking up the directory chain
    7318             :          * to log its parents.
    7319             :          *
    7320             :          * So, we record the current transid in the inode's
    7321             :          * last_unlink_trans field.  When the file is logged we check it and
    7322             :          * don't log the parents if the file is fully on disk.
    7323             :          */
    7324     1497800 :         mutex_lock(&inode->log_mutex);
    7325     1497808 :         inode->last_unlink_trans = trans->transid;
    7326     1497808 :         mutex_unlock(&inode->log_mutex);
    7327             : 
    7328     1497812 :         if (!for_rename)
    7329             :                 return;
    7330             : 
    7331             :         /*
    7332             :          * If this directory was already logged, any new names will be logged
    7333             :          * with btrfs_log_new_name() and old names will be deleted from the log
    7334             :          * tree with btrfs_del_dir_entries_in_log() or with
    7335             :          * btrfs_del_inode_ref_in_log().
    7336             :          */
    7337      158148 :         if (inode_logged(trans, dir, NULL) == 1)
    7338             :                 return;
    7339             : 
    7340             :         /*
    7341             :          * If the inode we're about to unlink was logged before, the log will be
    7342             :          * properly updated with the new name with btrfs_log_new_name() and the
    7343             :          * old name removed with btrfs_del_dir_entries_in_log() or with
    7344             :          * btrfs_del_inode_ref_in_log().
    7345             :          */
    7346      157674 :         if (inode_logged(trans, inode, NULL) == 1)
    7347             :                 return;
    7348             : 
    7349             :         /*
    7350             :          * When renaming files across directories, if the directory
    7351             :          * that we're unlinking from gets fsync'd later on, there's
    7352             :          * no way to find the destination directory later and fsync it
    7353             :          * properly.  So, we have to be conservative and force commits
    7354             :          * so the new name gets discovered.
    7355             :          */
    7356      157496 :         mutex_lock(&dir->log_mutex);
    7357      157496 :         dir->last_unlink_trans = trans->transid;
    7358      157496 :         mutex_unlock(&dir->log_mutex);
    7359             : }
    7360             : 
    7361             : /*
    7362             :  * Make sure that if someone attempts to fsync the parent directory of a deleted
    7363             :  * snapshot, it ends up triggering a transaction commit. This is to guarantee
    7364             :  * that after replaying the log tree of the parent directory's root we will not
    7365             :  * see the snapshot anymore and at log replay time we will not see any log tree
    7366             :  * corresponding to the deleted snapshot's root, which could lead to replaying
    7367             :  * it after replaying the log tree of the parent directory (which would replay
    7368             :  * the snapshot delete operation).
    7369             :  *
    7370             :  * Must be called before the actual snapshot destroy operation (updates to the
    7371             :  * parent root, the tree of tree roots, etc) is done.
    7372             :  */
    7373         168 : void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
    7374             :                                    struct btrfs_inode *dir)
    7375             : {
    7376         168 :         mutex_lock(&dir->log_mutex);
    7377         168 :         dir->last_unlink_trans = trans->transid;
    7378         168 :         mutex_unlock(&dir->log_mutex);
    7379         168 : }
    7380             : 
    7381             : /*
    7382             :  * Update the log after adding a new name for an inode.
    7383             :  *
    7384             :  * @trans:              Transaction handle.
    7385             :  * @old_dentry:         The dentry associated with the old name and the old
    7386             :  *                      parent directory.
    7387             :  * @old_dir:            The inode of the previous parent directory for the case
    7388             :  *                      of a rename. For a link operation, it must be NULL.
    7389             :  * @old_dir_index:      The index number associated with the old name, meaningful
    7390             :  *                      only for rename operations (when @old_dir is not NULL).
    7391             :  *                      Ignored for link operations.
    7392             :  * @parent:             The dentry associated with the directory under which the
    7393             :  *                      new name is located.
    7394             :  *
    7395             :  * Call this after adding a new name for an inode, as a result of a link or rename
    7396             :  * operation, to update the log. On failure the log is marked for a full commit.
    7397             :  */
    7398      526317 : void btrfs_log_new_name(struct btrfs_trans_handle *trans,
    7399             :                         struct dentry *old_dentry, struct btrfs_inode *old_dir,
    7400             :                         u64 old_dir_index, struct dentry *parent)
    7401             : {
    7402      526317 :         struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
    7403      526317 :         struct btrfs_root *root = inode->root;
    7404      526317 :         struct btrfs_log_ctx ctx;
    7405      526317 :         bool log_pinned = false;
    7406      526317 :         int ret;
    7407             : 
    7408             :         /*
    7409             :          * For non-directory inodes, this will force the logging code to
    7410             :          * walk the dentry chain up for the file.
    7411             :          */
    7412      526317 :         if (!S_ISDIR(inode->vfs_inode.i_mode))
    7413      460548 :                 inode->last_unlink_trans = trans->transid;
    7414             : 
    7415             :         /*
    7416             :          * If this inode hasn't been logged and the directory we're renaming
    7417             :          * it from hasn't been logged either, we don't need to log it.
    7418             :          */
    7419      526317 :         ret = inode_logged(trans, inode, NULL);
    7420      526317 :         if (ret < 0) {
    7421           0 :                 goto out;
    7422      526317 :         } else if (ret == 0) {
    7423      525514 :                 if (!old_dir)
    7424      525027 :                         return;
    7425             :                 /*
    7426             :                  * If the inode was not logged and we are doing a rename (old_dir is not
    7427             :                  * NULL), check if old_dir was logged - if it was not we can return and
    7428             :                  * do nothing.
    7429             :                  */
    7430      462151 :                 ret = inode_logged(trans, old_dir, NULL);
    7431      462151 :                 if (ret < 0)
    7432           0 :                         goto out;
    7433      462151 :                 else if (ret == 0)
    7434             :                         return;
    7435             :         }
    7436        1290 :         ret = 0;
    7437             : 
    7438             :         /*
    7439             :          * If we are doing a rename (old_dir is not NULL) from a directory that
    7440             :          * was previously logged, make sure that on log replay we get the old
    7441             :          * dir entry deleted. This is needed because we will also log the new
    7442             :          * name of the renamed inode, so we need to make sure that after log
    7443             :          * replay we don't end up with both the new and old dir entries existing.
    7444             :          */
    7445        1290 :         if (old_dir && old_dir->logged_trans == trans->transid) {
    7446         924 :                 struct btrfs_root *log = old_dir->root->log_root;
    7447         924 :                 struct btrfs_path *path;
    7448         924 :                 struct fscrypt_name fname;
    7449             : 
    7450         924 :                 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
    7451             : 
    7452         924 :                 ret = fscrypt_setup_filename(&old_dir->vfs_inode,
    7453         924 :                                              &old_dentry->d_name, 0, &fname);
    7454         924 :                 if (ret)
    7455           0 :                         goto out;
    7456             :                 /*
    7457             :                  * We have two inodes to update in the log, the old directory and
    7458             :                  * the inode that got renamed, so we must pin the log to prevent
    7459             :                  * anyone from syncing the log until we have updated both inodes
    7460             :                  * in the log.
    7461             :                  */
    7462         924 :                 ret = join_running_log_trans(root);
    7463             :                 /*
    7464             :                  * At least one of the inodes was logged before, so this should
    7465             :                  * not fail, but if it does, it's not serious, just bail out and
    7466             :                  * mark the log for a full commit.
    7467             :                  */
    7468         924 :                 if (WARN_ON_ONCE(ret < 0)) {
    7469           0 :                         fscrypt_free_filename(&fname);
    7470           0 :                         goto out;
    7471             :                 }
    7472             : 
    7473         924 :                 log_pinned = true;
    7474             : 
    7475         924 :                 path = btrfs_alloc_path();
    7476         924 :                 if (!path) {
    7477           0 :                         ret = -ENOMEM;
    7478           0 :                         fscrypt_free_filename(&fname);
    7479           0 :                         goto out;
    7480             :                 }
    7481             : 
    7482             :                 /*
    7483             :                  * Another concurrent task might be logging the old directory,
    7484             :                  * as it can be triggered when logging another inode that had or
    7485             :                  * still has a dentry in the old directory. We lock the old
    7486             :                  * directory's log_mutex to ensure the deletion of the old
    7487             :                  * name is persisted, because during directory logging we
    7488             :                  * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
    7489             :                  * the old name's dir index item is in the delayed items, so
    7490             :                  * it could be missed by an in-progress directory logging.
    7491             :                  */
    7492         924 :                 mutex_lock(&old_dir->log_mutex);
    7493         924 :                 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
    7494             :                                         &fname.disk_name, old_dir_index);
    7495         924 :                 if (ret > 0) {
    7496             :                         /*
    7497             :                          * The dentry does not exist in the log, so record its
    7498             :                          * deletion.
    7499             :                          */
    7500         756 :                         btrfs_release_path(path);
    7501         756 :                         ret = insert_dir_log_key(trans, log, path,
    7502             :                                                  btrfs_ino(old_dir),
    7503             :                                                  old_dir_index, old_dir_index);
    7504             :                 }
    7505         924 :                 mutex_unlock(&old_dir->log_mutex);
    7506             : 
    7507         924 :                 btrfs_free_path(path);
    7508         924 :                 fscrypt_free_filename(&fname);
    7509         924 :                 if (ret < 0)
    7510           0 :                         goto out;
    7511             :         }
    7512             : 
    7513        1290 :         btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
    7514        1290 :         ctx.logging_new_name = true;
    7515             :         /*
    7516             :          * We don't care about the return value. If we fail to log the new name
    7517             :          * then we know the next attempt to sync the log will fall back to a full
    7518             :          * transaction commit (due to a call to btrfs_set_log_full_commit()), so
    7519             :          * we don't need to worry about getting a log committed that has an
    7520             :          * inconsistent state after a rename operation.
    7521             :          */
    7522        1290 :         btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
    7523        1290 :         ASSERT(list_empty(&ctx.conflict_inodes));
    7524        1290 : out:
    7525             :         /*
    7526             :          * If an error happened, mark the log for a full commit because it's not
    7527             :          * consistent and up to date or we couldn't find out if one of the
    7528             :          * inodes was logged before in this transaction. Do it before unpinning
    7529             :          * the log, to avoid any races with someone else trying to commit it.
    7530             :          */
    7531        1290 :         if (ret < 0)
    7532           0 :                 btrfs_set_log_full_commit(trans);
    7533        1290 :         if (log_pinned)
    7534         924 :                 btrfs_end_log_trans(root);
    7535             : }
    7536             : 

Generated by: LCOV version 1.14