LCOV - code coverage report
Current view: top level - fs/btrfs - inode.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023 Lines: 4212 5399 78.0 %
Date: 2023-07-31 20:08:22 Functions: 162 177 91.5 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2007 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <crypto/hash.h>
       7             : #include <linux/kernel.h>
       8             : #include <linux/bio.h>
       9             : #include <linux/blk-cgroup.h>
      10             : #include <linux/file.h>
      11             : #include <linux/fs.h>
      12             : #include <linux/pagemap.h>
      13             : #include <linux/highmem.h>
      14             : #include <linux/time.h>
      15             : #include <linux/init.h>
      16             : #include <linux/string.h>
      17             : #include <linux/backing-dev.h>
      18             : #include <linux/writeback.h>
      19             : #include <linux/compat.h>
      20             : #include <linux/xattr.h>
      21             : #include <linux/posix_acl.h>
      22             : #include <linux/falloc.h>
      23             : #include <linux/slab.h>
      24             : #include <linux/ratelimit.h>
      25             : #include <linux/btrfs.h>
      26             : #include <linux/blkdev.h>
      27             : #include <linux/posix_acl_xattr.h>
      28             : #include <linux/uio.h>
      29             : #include <linux/magic.h>
      30             : #include <linux/iversion.h>
      31             : #include <linux/swap.h>
      32             : #include <linux/migrate.h>
      33             : #include <linux/sched/mm.h>
      34             : #include <linux/iomap.h>
      35             : #include <asm/unaligned.h>
      36             : #include <linux/fsverity.h>
      37             : #include "misc.h"
      38             : #include "ctree.h"
      39             : #include "disk-io.h"
      40             : #include "transaction.h"
      41             : #include "btrfs_inode.h"
      42             : #include "print-tree.h"
      43             : #include "ordered-data.h"
      44             : #include "xattr.h"
      45             : #include "tree-log.h"
      46             : #include "bio.h"
      47             : #include "compression.h"
      48             : #include "locking.h"
      49             : #include "free-space-cache.h"
      50             : #include "props.h"
      51             : #include "qgroup.h"
      52             : #include "delalloc-space.h"
      53             : #include "block-group.h"
      54             : #include "space-info.h"
      55             : #include "zoned.h"
      56             : #include "subpage.h"
      57             : #include "inode-item.h"
      58             : #include "fs.h"
      59             : #include "accessors.h"
      60             : #include "extent-tree.h"
      61             : #include "root-tree.h"
      62             : #include "defrag.h"
      63             : #include "dir-item.h"
      64             : #include "file-item.h"
      65             : #include "uuid-tree.h"
      66             : #include "ioctl.h"
      67             : #include "file.h"
      68             : #include "acl.h"
      69             : #include "relocation.h"
      70             : #include "verity.h"
      71             : #include "super.h"
      72             : #include "orphan.h"
      73             : #include "backref.h"
      74             : 
      75             : struct btrfs_iget_args {
      76             :         u64 ino;
      77             :         struct btrfs_root *root;
      78             : };
      79             : 
      80             : struct btrfs_dio_data {
      81             :         ssize_t submitted;
      82             :         struct extent_changeset *data_reserved;
      83             :         struct btrfs_ordered_extent *ordered;
      84             :         bool data_space_reserved;
      85             :         bool nocow_done;
      86             : };
      87             : 
      88             : struct btrfs_dio_private {
      89             :         /* Range of I/O */
      90             :         u64 file_offset;
      91             :         u32 bytes;
      92             : 
      93             :         /* This must be last */
      94             :         struct btrfs_bio bbio;
      95             : };
      96             : 
      97             : static struct bio_set btrfs_dio_bioset;
      98             : 
      99             : struct btrfs_rename_ctx {
     100             :         /* Output field. Stores the index number of the old directory entry. */
     101             :         u64 index;
     102             : };
     103             : 
     104             : /*
     105             :  * Used by data_reloc_print_warning_inode() to pass needed info for filename
     106             :  * resolution and output of error message.
     107             :  */
     108             : struct data_reloc_warn {
     109             :         struct btrfs_path path;
     110             :         struct btrfs_fs_info *fs_info;
     111             :         u64 extent_item_size;
     112             :         u64 logical;
     113             :         int mirror_num;
     114             : };
     115             : 
     116             : static const struct inode_operations btrfs_dir_inode_operations;
     117             : static const struct inode_operations btrfs_symlink_inode_operations;
     118             : static const struct inode_operations btrfs_special_inode_operations;
     119             : static const struct inode_operations btrfs_file_inode_operations;
     120             : static const struct address_space_operations btrfs_aops;
     121             : static const struct file_operations btrfs_dir_file_operations;
     122             : 
     123             : static struct kmem_cache *btrfs_inode_cachep;
     124             : 
     125             : static int btrfs_setsize(struct inode *inode, struct iattr *attr);
     126             : static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
     127             : static noinline int cow_file_range(struct btrfs_inode *inode,
     128             :                                    struct page *locked_page,
     129             :                                    u64 start, u64 end, int *page_started,
     130             :                                    unsigned long *nr_written, int unlock,
     131             :                                    u64 *done_offset);
     132             : static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
     133             :                                        u64 len, u64 orig_start, u64 block_start,
     134             :                                        u64 block_len, u64 orig_block_len,
     135             :                                        u64 ram_bytes, int compress_type,
     136             :                                        int type);
     137             : 
     138           0 : static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
     139             :                                           u64 root, void *warn_ctx)
     140             : {
     141           0 :         struct data_reloc_warn *warn = warn_ctx;
     142           0 :         struct btrfs_fs_info *fs_info = warn->fs_info;
     143           0 :         struct extent_buffer *eb;
     144           0 :         struct btrfs_inode_item *inode_item;
     145           0 :         struct inode_fs_paths *ipath = NULL;
     146           0 :         struct btrfs_root *local_root;
     147           0 :         struct btrfs_key key;
     148           0 :         unsigned int nofs_flag;
     149           0 :         u32 nlink;
     150           0 :         int ret;
     151             : 
     152           0 :         local_root = btrfs_get_fs_root(fs_info, root, true);
     153           0 :         if (IS_ERR(local_root)) {
     154           0 :                 ret = PTR_ERR(local_root);
     155           0 :                 goto err;
     156             :         }
     157             : 
     158             :         /* This makes the path point to (inum INODE_ITEM 0). */
     159           0 :         key.objectid = inum;
     160           0 :         key.type = BTRFS_INODE_ITEM_KEY;
     161           0 :         key.offset = 0;
     162             : 
     163           0 :         ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
     164           0 :         if (ret) {
     165           0 :                 btrfs_put_root(local_root);
     166           0 :                 btrfs_release_path(&warn->path);
     167           0 :                 goto err;
     168             :         }
     169             : 
     170           0 :         eb = warn->path.nodes[0];
     171           0 :         inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
     172           0 :         nlink = btrfs_inode_nlink(eb, inode_item);
     173           0 :         btrfs_release_path(&warn->path);
     174             : 
     175           0 :         nofs_flag = memalloc_nofs_save();
     176           0 :         ipath = init_ipath(4096, local_root, &warn->path);
     177           0 :         memalloc_nofs_restore(nofs_flag);
     178           0 :         if (IS_ERR(ipath)) {
     179           0 :                 btrfs_put_root(local_root);
     180           0 :                 ret = PTR_ERR(ipath);
     181           0 :                 ipath = NULL;
     182             :                 /*
     183             :                  * -ENOMEM, not a critical error, just output a generic error
     184             :                  * without filename.
     185             :                  */
     186           0 :                 btrfs_warn(fs_info,
     187             : "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
     188             :                            warn->logical, warn->mirror_num, root, inum, offset);
     189           0 :                 return ret;
     190             :         }
     191           0 :         ret = paths_from_inode(inum, ipath);
     192           0 :         if (ret < 0)
     193           0 :                 goto err;
     194             : 
     195             :         /*
     196             :          * We deliberately ignore the fact that ipath might have been too
     197             :          * small to hold all of the paths here.
     198             :          */
     199           0 :         for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
     200           0 :                 btrfs_warn(fs_info,
     201             : "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
     202             :                            warn->logical, warn->mirror_num, root, inum, offset,
     203             :                            fs_info->sectorsize, nlink,
     204             :                            (char *)(unsigned long)ipath->fspath->val[i]);
     205             :         }
     206             : 
     207           0 :         btrfs_put_root(local_root);
     208           0 :         free_ipath(ipath);
     209           0 :         return 0;
     210             : 
     211           0 : err:
     212           0 :         btrfs_warn(fs_info,
     213             : "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
     214             :                    warn->logical, warn->mirror_num, root, inum, offset, ret);
     215             : 
     216           0 :         free_ipath(ipath);
     217           0 :         return ret;
     218             : }
     219             : 
     220             : /*
     221             :  * Do extra user-friendly error output (e.g. lookup all the affected files).
     222             :  *
     223             :  * This function returns nothing.  If the extent lookup fails, only the generic
     224             :  * "csum failed" message and the lookup error are printed.
     225             :  */
     226           0 : static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
     227             :                                    const u8 *csum, const u8 *csum_expected,
     228             :                                    int mirror_num)
     229             : {
     230           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     231           0 :         struct btrfs_path path = { 0 };
     232           0 :         struct btrfs_key found_key = { 0 };
     233           0 :         struct extent_buffer *eb;
     234           0 :         struct btrfs_extent_item *ei;
     235           0 :         const u32 csum_size = fs_info->csum_size;
     236           0 :         u64 logical;
     237           0 :         u64 flags;
     238           0 :         u32 item_size;
     239           0 :         int ret;
     240             : 
     241           0 :         mutex_lock(&fs_info->reloc_mutex);
     242           0 :         logical = btrfs_get_reloc_bg_bytenr(fs_info);
     243           0 :         mutex_unlock(&fs_info->reloc_mutex);
     244             : 
     245           0 :         if (logical == U64_MAX) {
     246           0 :                 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
     247           0 :                 btrfs_warn_rl(fs_info,
     248             : "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
     249             :                         inode->root->root_key.objectid, btrfs_ino(inode), file_off,
     250             :                         CSUM_FMT_VALUE(csum_size, csum),
     251             :                         CSUM_FMT_VALUE(csum_size, csum_expected),
     252             :                         mirror_num);
     253           0 :                 return;
     254             :         }
     255             : 
     256           0 :         logical += file_off;
     257           0 :         btrfs_warn_rl(fs_info,
     258             : "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
     259             :                         inode->root->root_key.objectid,
     260             :                         btrfs_ino(inode), file_off, logical,
     261             :                         CSUM_FMT_VALUE(csum_size, csum),
     262             :                         CSUM_FMT_VALUE(csum_size, csum_expected),
     263             :                         mirror_num);
     264             : 
     265           0 :         ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
     266           0 :         if (ret < 0) {
     267           0 :                 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
     268             :                              logical, ret);
     269           0 :                 return;
     270             :         }
     271           0 :         eb = path.nodes[0];
     272           0 :         ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
     273           0 :         item_size = btrfs_item_size(eb, path.slots[0]);
     274           0 :         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
     275           0 :                 unsigned long ptr = 0;
     276           0 :                 u64 ref_root;
     277           0 :                 u8 ref_level;
     278             : 
     279           0 :                 while (true) {
     280           0 :                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
     281             :                                                       item_size, &ref_root,
     282             :                                                       &ref_level);
     283           0 :                         if (ret < 0) {
     284           0 :                                 btrfs_warn_rl(fs_info,
     285             :                                 "failed to resolve tree backref for logical %llu: %d",
     286             :                                               logical, ret);
     287             :                                 break;
     288             :                         }
     289           0 :                         if (ret > 0)
     290             :                                 break;
     291             : 
     292           0 :                         btrfs_warn_rl(fs_info,
     293             : "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
     294             :                                 logical, mirror_num,
     295             :                                 (ref_level ? "node" : "leaf"),
     296             :                                 ref_level, ref_root);
     297             :                 }
     298           0 :                 btrfs_release_path(&path);
     299             :         } else {
     300           0 :                 struct btrfs_backref_walk_ctx ctx = { 0 };
     301           0 :                 struct data_reloc_warn reloc_warn = { 0 };
     302             : 
     303           0 :                 btrfs_release_path(&path);
     304             : 
     305           0 :                 ctx.bytenr = found_key.objectid;
     306           0 :                 ctx.extent_item_pos = logical - found_key.objectid;
     307           0 :                 ctx.fs_info = fs_info;
     308             : 
     309           0 :                 reloc_warn.logical = logical;
     310           0 :                 reloc_warn.extent_item_size = found_key.offset;
     311           0 :                 reloc_warn.mirror_num = mirror_num;
     312           0 :                 reloc_warn.fs_info = fs_info;
     313             : 
     314           0 :                 iterate_extent_inodes(&ctx, true,
     315             :                                       data_reloc_print_warning_inode, &reloc_warn);
     316             :         }
     317             : }
     318             : 
     319           6 : static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
     320             :                 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
     321             : {
     322           6 :         struct btrfs_root *root = inode->root;
     323           6 :         const u32 csum_size = root->fs_info->csum_size;
     324             : 
     325             :         /* For data reloc tree, it's better to do a backref lookup instead. */
     326           6 :         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
     327           0 :                 return print_data_reloc_error(inode, logical_start, csum,
     328             :                                               csum_expected, mirror_num);
     329             : 
     330             :         /* Print the objectid as a signed value (%lld), which is more meaningful */
     331           6 :         if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
     332           0 :                 btrfs_warn_rl(root->fs_info,
     333             : "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
     334             :                         root->root_key.objectid, btrfs_ino(inode),
     335             :                         logical_start,
     336             :                         CSUM_FMT_VALUE(csum_size, csum),
     337             :                         CSUM_FMT_VALUE(csum_size, csum_expected),
     338             :                         mirror_num);
     339             :         } else {
     340           6 :                 btrfs_warn_rl(root->fs_info,
     341             : "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
     342             :                         root->root_key.objectid, btrfs_ino(inode),
     343             :                         logical_start,
     344             :                         CSUM_FMT_VALUE(csum_size, csum),
     345             :                         CSUM_FMT_VALUE(csum_size, csum_expected),
     346             :                         mirror_num);
     347             :         }
     348             : }
     349             : 
     350             : /*
     351             :  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
     352             :  *
     353             :  * ilock_flags can have the following bit set:
     354             :  *
     355             :  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
     356             :  * BTRFS_ILOCK_TRY - try to acquire the lock; if that fails on the first
     357             :  *                   attempt, return -EAGAIN
     358             :  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
     359             :  */
     360    32957751 : int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
     361             : {
     362    32957751 :         if (ilock_flags & BTRFS_ILOCK_SHARED) {
     363     2135619 :                 if (ilock_flags & BTRFS_ILOCK_TRY) {
     364           5 :                         if (!inode_trylock_shared(&inode->vfs_inode))
     365             :                                 return -EAGAIN;
     366             :                         else
     367           5 :                                 return 0;
     368             :                 }
     369     2135614 :                 inode_lock_shared(&inode->vfs_inode);
     370             :         } else {
     371    30822132 :                 if (ilock_flags & BTRFS_ILOCK_TRY) {
     372           2 :                         if (!inode_trylock(&inode->vfs_inode))
     373             :                                 return -EAGAIN;
     374             :                         else
     375           2 :                                 return 0;
     376             :                 }
     377    30822130 :                 inode_lock(&inode->vfs_inode);
     378             :         }
     379    32938902 :         if (ilock_flags & BTRFS_ILOCK_MMAP)
     380     3912327 :                 down_write(&inode->i_mmap_lock);
     381             :         return 0;
     382             : }
     383             : 
     384             : /*
     385             :  * btrfs_inode_unlock - unlock inode i_rwsem
     386             :  *
     387             :  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
     388             :  * to decide whether the lock acquired is shared or exclusive.
     389             :  */
     390    32959963 : void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
     391             : {
     392    32959963 :         if (ilock_flags & BTRFS_ILOCK_MMAP)
     393     3912325 :                 up_write(&inode->i_mmap_lock);
     394    32959945 :         if (ilock_flags & BTRFS_ILOCK_SHARED)
     395     2624256 :                 inode_unlock_shared(&inode->vfs_inode);
     396             :         else
     397    30335689 :                 inode_unlock(&inode->vfs_inode);
     398           0 : }
     399             : 
     400             : /*
     401             :  * Cleanup all submitted ordered extents in specified range to handle errors
     402             :  * from the btrfs_run_delalloc_range() callback.
     403             :  *
     404             :  * NOTE: caller must ensure that when an error happens, it can not call
     405             :  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
     406             :  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
     407             :  * to be released, which we want to happen only when finishing the ordered
     408             :  * extent (btrfs_finish_ordered_io()).
     409             :  */
     410           0 : static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
     411             :                                                  struct page *locked_page,
     412             :                                                  u64 offset, u64 bytes)
     413             : {
     414           0 :         unsigned long index = offset >> PAGE_SHIFT;
     415           0 :         unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
     416           0 :         u64 page_start = 0, page_end = 0;
     417           0 :         struct page *page;
     418             : 
     419           0 :         if (locked_page) {
     420           0 :                 page_start = page_offset(locked_page);
     421           0 :                 page_end = page_start + PAGE_SIZE - 1;
     422             :         }
     423             : 
     424           0 :         while (index <= end_index) {
     425             :                 /*
     426             :                  * For locked page, we will call end_extent_writepage() on it
     427             :                  * in run_delalloc_range() for the error handling.  That
     428             :                  * end_extent_writepage() function will call
     429             :                  * btrfs_mark_ordered_io_finished() to clear page Ordered and
     430             :                  * run the ordered extent accounting.
     431             :                  *
     432             :                  * Here we can't just clear the Ordered bit, or
     433             :                  * btrfs_mark_ordered_io_finished() would skip the accounting
     434             :                  * for the page range, and the ordered extent will never finish.
     435             :                  */
     436           0 :                 if (locked_page && index == (page_start >> PAGE_SHIFT)) {
     437           0 :                         index++;
     438           0 :                         continue;
     439             :                 }
     440           0 :                 page = find_get_page(inode->vfs_inode.i_mapping, index);
     441           0 :                 index++;
     442           0 :                 if (!page)
     443           0 :                         continue;
     444             : 
     445             :                 /*
     446             :                  * Here we just clear all Ordered bits for every page in the
     447             :                  * range, then btrfs_mark_ordered_io_finished() will handle
     448             :                  * the ordered extent accounting for the range.
     449             :                  */
     450           0 :                 btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
     451             :                                                offset, bytes);
     452           0 :                 put_page(page);
     453             :         }
     454             : 
     455           0 :         if (locked_page) {
     456             :                 /* The locked page covers the full range, nothing needs to be done */
     457           0 :                 if (bytes + offset <= page_start + PAGE_SIZE)
     458             :                         return;
     459             :                 /*
     460             :                  * In case this page belongs to the delalloc range being
     461             :                  * instantiated then skip it, since the first page of a range is
     462             :                  * going to be properly cleaned up by the caller of
     463             :                  * run_delalloc_range
     464             :                  */
     465           0 :                 if (page_start >= offset && page_end <= (offset + bytes - 1)) {
     466           0 :                         bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
     467           0 :                         offset = page_offset(locked_page) + PAGE_SIZE;
     468             :                 }
     469             :         }
     470             : 
     471           0 :         return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
     472             : }
     473             : 
     474             : static int btrfs_dirty_inode(struct btrfs_inode *inode);
     475             : 
     476     3254921 : static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
     477             :                                      struct btrfs_new_inode_args *args)
     478             : {
     479     3254921 :         int err;
     480             : 
     481     3254921 :         if (args->default_acl) {
     482        3051 :                 err = __btrfs_set_acl(trans, args->inode, args->default_acl,
     483             :                                       ACL_TYPE_DEFAULT);
     484        3051 :                 if (err)
     485             :                         return err;
     486             :         }
     487     3254921 :         if (args->acl) {
     488       31620 :                 err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
     489       31620 :                 if (err)
     490             :                         return err;
     491             :         }
     492     3254921 :         if (!args->default_acl && !args->acl)
     493     3223299 :                 cache_no_acl(args->inode);
     494     3254921 :         return btrfs_xattr_security_init(trans, args->inode, args->dir,
     495     3254921 :                                          &args->dentry->d_name);
     496             : }
     497             : 
     498             : /*
     499             :  * This does all the hard work for inserting an inline extent into
     500             :  * the btree.  The caller should have done a btrfs_drop_extents() so that
     501             :  * no overlapping inline items exist in the btree.
     502             :  */
     503     1077546 : static int insert_inline_extent(struct btrfs_trans_handle *trans,
     504             :                                 struct btrfs_path *path,
     505             :                                 struct btrfs_inode *inode, bool extent_inserted,
     506             :                                 size_t size, size_t compressed_size,
     507             :                                 int compress_type,
     508             :                                 struct page **compressed_pages,
     509             :                                 bool update_i_size)
     510             : {
     511     1077546 :         struct btrfs_root *root = inode->root;
     512     1077546 :         struct extent_buffer *leaf;
     513     1077546 :         struct page *page = NULL;
     514     1077546 :         char *kaddr;
     515     1077546 :         unsigned long ptr;
     516     1077546 :         struct btrfs_file_extent_item *ei;
     517     1077546 :         int ret;
     518     1077546 :         size_t cur_size = size;
     519     1077546 :         u64 i_size;
     520             : 
     521     1077546 :         ASSERT((compressed_size > 0 && compressed_pages) ||
     522             :                (compressed_size == 0 && !compressed_pages));
     523             : 
     524     1077546 :         if (compressed_size && compressed_pages)
     525          23 :                 cur_size = compressed_size;
     526             : 
     527     1077546 :         if (!extent_inserted) {
     528      216690 :                 struct btrfs_key key;
     529      216690 :                 size_t datasize;
     530             : 
     531      216690 :                 key.objectid = btrfs_ino(inode);
     532      216690 :                 key.offset = 0;
     533      216690 :                 key.type = BTRFS_EXTENT_DATA_KEY;
     534             : 
     535      216690 :                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
     536      216690 :                 ret = btrfs_insert_empty_item(trans, root, path, &key,
     537             :                                               datasize);
     538      216690 :                 if (ret)
     539           0 :                         goto fail;
     540             :         }
     541     1077546 :         leaf = path->nodes[0];
     542     1077546 :         ei = btrfs_item_ptr(leaf, path->slots[0],
     543             :                             struct btrfs_file_extent_item);
     544     1077546 :         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
     545     1077545 :         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
     546     1077545 :         btrfs_set_file_extent_encryption(leaf, ei, 0);
     547     1077546 :         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
     548     1077545 :         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
     549     1077545 :         ptr = btrfs_file_extent_inline_start(ei);
     550             : 
     551     1077545 :         if (compress_type != BTRFS_COMPRESS_NONE) {
     552             :                 struct page *cpage;
     553             :                 int i = 0;
     554          46 :                 while (compressed_size > 0) {
     555          23 :                         cpage = compressed_pages[i];
     556          23 :                         cur_size = min_t(unsigned long, compressed_size,
     557             :                                        PAGE_SIZE);
     558             : 
     559          23 :                         kaddr = kmap_local_page(cpage);
     560          23 :                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
     561          23 :                         kunmap_local(kaddr);
     562             : 
     563          23 :                         i++;
     564          23 :                         ptr += cur_size;
     565          23 :                         compressed_size -= cur_size;
     566             :                 }
     567          23 :                 btrfs_set_file_extent_compression(leaf, ei,
     568             :                                                   compress_type);
     569             :         } else {
     570     1077522 :                 page = find_get_page(inode->vfs_inode.i_mapping, 0);
     571     1077522 :                 btrfs_set_file_extent_compression(leaf, ei, 0);
     572     1077522 :                 kaddr = kmap_local_page(page);
     573     1077522 :                 write_extent_buffer(leaf, kaddr, ptr, size);
     574     1077523 :                 kunmap_local(kaddr);
     575     1077523 :                 put_page(page);
     576             :         }
     577     1077546 :         btrfs_mark_buffer_dirty(leaf);
     578     1077546 :         btrfs_release_path(path);
     579             : 
     580             :         /*
     581             :          * We align size to sectorsize for inline extents just for simplicity
     582             :          * sake.
     583             :          */
     584     1077546 :         ret = btrfs_inode_set_file_extent_range(inode, 0,
     585     1077546 :                                         ALIGN(size, root->fs_info->sectorsize));
     586     1077545 :         if (ret)
     587           0 :                 goto fail;
     588             : 
     589             :         /*
     590             :          * We're an inline extent, so nobody can extend the file past i_size
     591             :          * without locking a page we already have locked.
     592             :          *
     593             :          * We must do any i_size and inode updates before we unlock the pages.
     594             :          * Otherwise we could end up racing with unlink.
     595             :          */
     596     1077545 :         i_size = i_size_read(&inode->vfs_inode);
     597     1077545 :         if (update_i_size && size > i_size) {
     598           0 :                 i_size_write(&inode->vfs_inode, size);
     599           0 :                 i_size = size;
     600             :         }
     601     1077545 :         inode->disk_i_size = i_size;
     602             : 
     603     1077545 : fail:
     604     1077545 :         return ret;
     605             : }
     606             : 
     607             : 
     608             : /*
     609             :  * Conditionally insert an inline extent into the file.  This
     610             :  * does the checks required to make sure the data is small enough
     611             :  * to fit as an inline extent.
                     :  *
                     :  * @inode:            inode to create the inline extent for
                     :  * @size:             uncompressed length of the data
                     :  * @compressed_size:  compressed length, 0 when the data is not compressed
                     :  * @compress_type:    compression type passed on to insert_inline_extent()
                     :  * @compressed_pages: pages with the compressed data, NULL when uncompressed
                     :  * @update_i_size:    passed on to insert_inline_extent()
                     :  *
                     :  * The work is done in a joined transaction using the inode's block reserve.
                     :  * Existing extents in [0, sectorsize) are dropped first, with replace_extent
                     :  * set so that the result of the drop (extent_inserted) tells
                     :  * insert_inline_extent() whether an item slot already exists.
                     :  *
                     :  * Return:
                     :  *   0        the inline extent was created
                     :  *   1        the data cannot be inlined: a size check failed, or
                     :  *            insert_inline_extent() / btrfs_update_inode() returned -ENOSPC
                     :  *   -ENOMEM  the path could not be allocated
                     :  *   < 0      PTR_ERR() of a failed btrfs_join_transaction(), or any other
                     :  *            error, in which case the transaction has been aborted
     612             :  */
     613     1724011 : static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
     614             :                                           size_t compressed_size,
     615             :                                           int compress_type,
     616             :                                           struct page **compressed_pages,
     617             :                                           bool update_i_size)
     618             : {
     619     1724011 :         struct btrfs_drop_extents_args drop_args = { 0 };
     620     1724011 :         struct btrfs_root *root = inode->root;
     621     1724011 :         struct btrfs_fs_info *fs_info = root->fs_info;
     622     1724011 :         struct btrfs_trans_handle *trans;
     623     1724011 :         u64 data_len = (compressed_size ?: size); /* bytes that will actually be stored in the item */
     624     1724011 :         int ret;
     625     1724011 :         struct btrfs_path *path;
     626             : 
     627             :         /*
     628             :          * We can create an inline extent if it ends at or beyond the current
     629             :          * i_size, is no larger than a sector (decompressed), and the (possibly
     630             :          * compressed) data fits in a leaf and the configured maximum inline
     631             :          * size.
     632             :          */
     633     1724011 :         if (size < i_size_read(&inode->vfs_inode) ||
     634     1709872 :             size > fs_info->sectorsize ||
     635     1283985 :             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
     636     1257528 :             data_len > fs_info->max_inline)
     637             :                 return 1;
     638             : 
     639     1077202 :         path = btrfs_alloc_path();
     640     1077437 :         if (!path)
     641             :                 return -ENOMEM;
     642             : 
     643     1077437 :         trans = btrfs_join_transaction(root);
     644     1077524 :         if (IS_ERR(trans)) {
     645           0 :                 btrfs_free_path(path);
     646           0 :                 return PTR_ERR(trans);
     647             :         }
     648     1077524 :         trans->block_rsv = &inode->block_rsv;
     649             : 
     650     1077524 :         drop_args.path = path;
     651     1077524 :         drop_args.start = 0;
     652     1077524 :         drop_args.end = fs_info->sectorsize;
     653     1077524 :         drop_args.drop_cache = true;
     654     1077524 :         drop_args.replace_extent = true;
     655     1077524 :         drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
     656     1077524 :         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
     657     1077546 :         if (ret) {
     658           0 :                 btrfs_abort_transaction(trans, ret);
     659           0 :                 goto out;
     660             :         }
     661             : 
     662     1077546 :         ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
     663             :                                    size, compressed_size, compress_type,
     664             :                                    compressed_pages, update_i_size);
     665     1077545 :         if (ret && ret != -ENOSPC) {
     666           0 :                 btrfs_abort_transaction(trans, ret);
     667           0 :                 goto out;
     668     1077545 :         } else if (ret == -ENOSPC) { /* no room for the item: report "not inlined" instead of failing */
     669           0 :                 ret = 1;
     670           0 :                 goto out;
     671             :         }
     672             : 
     673     1077545 :         btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
     674     1077546 :         ret = btrfs_update_inode(trans, root, inode);
     675     1077541 :         if (ret && ret != -ENOSPC) {
     676           0 :                 btrfs_abort_transaction(trans, ret);
     677           0 :                 goto out;
     678     1077541 :         } else if (ret == -ENOSPC) { /* same fallback as above */
     679           0 :                 ret = 1;
     680           0 :                 goto out;
     681             :         }
     682             : 
     683     1077541 :         btrfs_set_inode_full_sync(inode);
     684     1077517 : out:
     685             :         /*
     686             :          * Don't forget to free the reserved space, as for inlined extent
     687             :          * it won't count as data extent, free them directly here.
     688             :          * And at reserve time, it's always aligned to page size, so
     689             :          * just free one page here.
                     :          *
                     :          * NOTE(review): this also runs when ret == 1, i.e. when the caller
                     :          * is expected to fall back to a regular (non-inline) extent.
                     :          * Confirm that the reservation should not be kept in that case.
     690             :          */
     691     1077517 :         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
     692     1077449 :         btrfs_free_path(path);
     693     1077498 :         btrfs_end_transaction(trans);
     694     1077498 :         return ret;
     695             : }
     696             : 
     697             : struct async_extent { /* one extent queued on async_chunk::extents by add_async_extent() */
     698             :         u64 start; /* start offset of the extent, as passed to add_async_extent() */
     699             :         u64 ram_size; /* uncompressed size, as passed to add_async_extent() */
     700             :         u64 compressed_size; /* compressed size, as passed to add_async_extent() */
     701             :         struct page **pages; /* page array handed over by the caller of add_async_extent() */
     702             :         unsigned long nr_pages; /* page count supplied together with @pages */
     703             :         int compress_type; /* compression type recorded for this extent */
     704             :         struct list_head list; /* link in async_chunk::extents */
     705             : };
     706             : 
     707             : struct async_chunk { /* one unit of work handed to compress_file_range() */
     708             :         struct btrfs_inode *inode; /* inode whose range is being compressed */
     709             :         struct page *locked_page;
     710             :         u64 start; /* first byte of the range */
     711             :         u64 end; /* last byte of the range (callers use end + 1 as the exclusive bound) */
     712             :         blk_opf_t write_flags;
     713             :         struct list_head extents; /* list of struct async_extent, appended by add_async_extent() */
     714             :         struct cgroup_subsys_state *blkcg_css;
     715             :         struct btrfs_work work;
     716             :         struct async_cow *async_cow; /* presumably the async_cow whose chunks[] contains this chunk - TODO confirm */
     717             : };
     718             : 
     719             : struct async_cow { /* container for a variable number of async_chunk entries */
     720             :         atomic_t num_chunks; /* presumably the number of chunks still outstanding - TODO confirm against the users */
     721             :         struct async_chunk chunks[]; /* flexible array member; length is fixed by whoever allocates the struct */
     722             : };
     723             : /*
                     :  * Allocate a struct async_extent, fill it from the arguments and append it
                     :  * to the tail of @cow->extents.
                     :  *
                     :  * @cow:             chunk whose extent list receives the new entry
                     :  * @start:           start offset of the extent
                     :  * @ram_size:        uncompressed size of the extent
                     :  * @compressed_size: compressed size of the extent
                     :  * @pages:           page array stored in the entry (the pointer is stored
                     :  *                   as is, nothing is copied)
                     :  * @nr_pages:        number of entries in @pages
                     :  * @compress_type:   compression type to record
                     :  *
                     :  * The allocation uses GFP_NOFS.  A failed allocation hits BUG_ON() instead
                     :  * of being reported, so the only value ever returned is 0.
                     :  */
     724      158354 : static noinline int add_async_extent(struct async_chunk *cow,
     725             :                                      u64 start, u64 ram_size,
     726             :                                      u64 compressed_size,
     727             :                                      struct page **pages,
     728             :                                      unsigned long nr_pages,
     729             :                                      int compress_type)
     730             : {
     731      158354 :         struct async_extent *async_extent;
     732             : 
     733      158354 :         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
     734      158383 :         BUG_ON(!async_extent); /* -ENOMEM */
     735      158383 :         async_extent->start = start;
     736      158383 :         async_extent->ram_size = ram_size;
     737      158383 :         async_extent->compressed_size = compressed_size;
     738      158383 :         async_extent->pages = pages;
     739      158383 :         async_extent->nr_pages = nr_pages;
     740      158383 :         async_extent->compress_type = compress_type;
     741      158383 :         list_add_tail(&async_extent->list, &cow->extents);
     742      158414 :         return 0;
     743             : }
     744             : 
     745             : /*
     746             :  * Check if the inode needs to be submitted to compression, based on mount
     747             :  * options, defragmentation, properties or heuristics.
                     :  *
                     :  * @inode: inode to check
                     :  * @start: first byte of the range
                     :  * @end:   last byte of the range (inclusive; alignment is checked on
                     :  *         @end + 1)
                     :  *
                     :  * The checks are applied in this order and the first match wins:
                     :  *  - btrfs_inode_can_compress() fails: 0 (plus a WARN when
                     :  *    CONFIG_BTRFS_DEBUG is enabled)
                     :  *  - sectorsize < PAGE_SIZE and the range is not page aligned: 0
                     :  *  - FORCE_COMPRESS mount option: 1
                     :  *  - inode->defrag_compress is set: 1
                     :  *  - BTRFS_INODE_NOCOMPRESS inode flag: 0
                     :  *  - COMPRESS mount option, BTRFS_INODE_COMPRESS inode flag or
                     :  *    inode->prop_compress: the result of btrfs_compress_heuristic()
                     :  *  - otherwise: 0
                     :  *
                     :  * Return: non-zero if the range should be compressed, 0 if not.
     748             :  */
     749     2412894 : static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
     750             :                                       u64 end)
     751             : {
     752     2412894 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     753             : 
     754     2412894 :         if (!btrfs_inode_can_compress(inode)) {
     755             :                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
     756             :                         KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
     757             :                         btrfs_ino(inode));
     758             :                 return 0;
     759             :         }
     760             :         /*
     761             :          * Special check for subpage.
     762             :          *
     763             :          * We lock the full page then run each delalloc range in the page, thus
     764             :          * for the following case, we will hit some subpage specific corner case:
     765             :          *
     766             :          * 0            32K             64K
     767             :          * |    |///////|       |///////|
     768             :          *              \- A            \- B
     769             :          *
     770             :          * In above case, both range A and range B will try to unlock the full
     771             :          * page [0, 64K), causing the one finished later will have page
     772             :          * unlocked already, triggering various page lock requirement BUG_ON()s.
     773             :          *
     774             :          * So here we add an artificial limit that subpage compression can only
     775             :          * happen if the range is fully page aligned.
     776             :          *
     777             :          * In theory we only need to ensure the first page is fully covered, but
     778             :          * the trailing partial page will be locked until the full compression
     779             :          * finishes, delaying the write of other range.
     780             :          *
     781             :          * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
     782             :          * first to prevent any submitted async extent to unlock the full page.
     783             :          * By this, we can ensure for subpage case that only the last async_cow
     784             :          * will unlock the full page.
     785             :          */
     786     2412894 :         if (fs_info->sectorsize < PAGE_SIZE) {
     787           0 :                 if (!PAGE_ALIGNED(start) ||
     788           0 :                     !PAGE_ALIGNED(end + 1))
     789             :                         return 0;
     790             :         }
     791             : 
     792             :         /* force compress */
     793     2412894 :         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
     794             :                 return 1;
     795             :         /* defrag ioctl */
     796     2412834 :         if (inode->defrag_compress)
     797             :                 return 1;
     798             :         /* bad compression ratios */
     799     2412814 :         if (inode->flags & BTRFS_INODE_NOCOMPRESS)
     800             :                 return 0;
     801     2412814 :         if (btrfs_test_opt(fs_info, COMPRESS) ||
     802     2278908 :             inode->flags & BTRFS_INODE_COMPRESS ||
     803     2254122 :             inode->prop_compress)
     804      158692 :                 return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
     805             :         return 0;
     806             : }
     807             : /*
                     :  * Queue @inode for defragmentation when the write looks like a small write
                     :  * that does not start the file or does not reach the on-disk i_size.
                     :  *
                     :  * @inode:       inode that was written to
                     :  * @start:       first byte of the written range
                     :  * @end:         last byte of the written range (inclusive)
                     :  * @num_bytes:   length of the written range
                     :  * @small_write: threshold in bytes; writes shorter than this count as small.
                     :  *               It is also passed on to btrfs_add_inode_defrag().
                     :  */
     808     3379854 : static inline void inode_should_defrag(struct btrfs_inode *inode,
     809             :                 u64 start, u64 end, u64 num_bytes, u32 small_write)
     810             : {
     811             :         /* If this is a small write inside eof, kick off a defrag */
     812     3379854 :         if (num_bytes < small_write &&
     813     1622285 :             (start > 0 || end + 1 < inode->disk_i_size))
     814     1419325 :                 btrfs_add_inode_defrag(NULL, inode, small_write);
     815     3379832 : }
     816             : 
     817             : /*
     818             :  * we create compressed extents in two phases.  The first
     819             :  * phase compresses a range of pages that have already been
     820             :  * locked (both pages and state bits are locked).
     821             :  *
     822             :  * This is done inside an ordered work queue, and the compression
     823             :  * is spread across many cpus.  The actual IO submission is step
     824             :  * two, and the ordered work queue takes care of making sure that
     825             :  * happens in the same order things were put onto the queue by
     826             :  * writepages and friends.
     827             :  *
     828             :  * If this code finds it can't get good compression, it puts an
     829             :  * entry onto the work queue to write the uncompressed bytes.  This
     830             :  * makes sure that both compressed inodes and uncompressed inodes
     831             :  * are written in the same order that the flusher thread sent them
     832             :  * down.
     833             :  */
     834       39979 : static noinline int compress_file_range(struct async_chunk *async_chunk)
     835             : {
     836       39979 :         struct btrfs_inode *inode = async_chunk->inode;
     837       39979 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     838       39979 :         struct address_space *mapping = inode->vfs_inode.i_mapping;
     839       39979 :         u64 blocksize = fs_info->sectorsize;
     840       39979 :         u64 start = async_chunk->start;
     841       39979 :         u64 end = async_chunk->end;
     842       39979 :         u64 actual_end;
     843       39979 :         u64 i_size;
     844       39979 :         int ret = 0;
     845       39979 :         struct page **pages = NULL;
     846       39979 :         unsigned long nr_pages;
     847       39979 :         unsigned long total_compressed = 0;
     848       39979 :         unsigned long total_in = 0;
     849       39979 :         int i;
     850       39979 :         int will_compress;
     851       39979 :         int compress_type = fs_info->compress_type;
     852       39979 :         int compressed_extents = 0;
     853       39979 :         int redirty = 0;
     854             : 
     855       39979 :         inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
     856             : 
     857             :         /*
     858             :          * We need to save i_size before now because it could change in between
     859             :          * us evaluating the size and assigning it.  This is because we lock and
     860             :          * unlock the page in truncate and fallocate, and then modify the i_size
     861             :          * later on.
     862             :          *
     863             :          * The barriers are to emulate READ_ONCE, remove that once i_size_read
     864             :          * does that for us.
     865             :          */
     866       39981 :         barrier();
     867       39981 :         i_size = i_size_read(&inode->vfs_inode);
     868       39981 :         barrier();
     869       39981 :         actual_end = min_t(u64, i_size, end + 1);
     870      158448 : again:
     871      158448 :         will_compress = 0;
     872      158448 :         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
     873      158448 :         nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
     874             : 
     875             :         /*
     876             :          * we don't want to send crud past the end of i_size through
     877             :          * compression, that's just a waste of CPU time.  So, if the
     878             :          * end of the file is before the start of our current
     879             :          * requested range of bytes, we bail out to the uncompressed
     880             :          * cleanup code that can deal with all of this.
     881             :          *
     882             :          * It isn't really the fastest way to fix things, but this is a
     883             :          * very uncommon corner.
     884             :          */
     885      158448 :         if (actual_end <= start)
     886           0 :                 goto cleanup_and_bail_uncompressed;
     887             : 
     888      158448 :         total_compressed = actual_end - start;
     889             : 
     890             :         /*
     891             :          * Skip compression for a small file range(<=blocksize) that
     892             :          * isn't an inline extent, since it doesn't save disk space at all.
     893             :          */
     894      158448 :         if (total_compressed <= blocksize &&
     895          29 :            (start > 0 || end + 1 < inode->disk_i_size))
     896         213 :                 goto cleanup_and_bail_uncompressed;
     897             : 
     898             :         /*
     899             :          * For subpage case, we require full page alignment for the sector
     900             :          * aligned range.
     901             :          * Thus we must also check against @actual_end, not just @end.
     902             :          */
     903      158235 :         if (blocksize < PAGE_SIZE) {
     904           0 :                 if (!PAGE_ALIGNED(start) ||
     905           0 :                     !PAGE_ALIGNED(round_up(actual_end, blocksize)))
     906           0 :                         goto cleanup_and_bail_uncompressed;
     907             :         }
     908             : 
     909      158235 :         total_compressed = min_t(unsigned long, total_compressed,
     910             :                         BTRFS_MAX_UNCOMPRESSED);
     911      158235 :         total_in = 0;
     912      158235 :         ret = 0;
     913             : 
     914             :         /*
     915             :          * we do compression for mount -o compress and when the
     916             :          * inode has not been flagged as nocompress.  This flag can
     917             :          * change at any time if we discover bad compression ratios.
     918             :          */
     919      158235 :         if (inode_need_compress(inode, start, end)) {
     920      158307 :                 WARN_ON(pages);
     921      158307 :                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
     922      158277 :                 if (!pages) {
     923             :                         /* just bail out to the uncompressed code */
     924           0 :                         nr_pages = 0;
     925           0 :                         goto cont;
     926             :                 }
     927             : 
     928      158277 :                 if (inode->defrag_compress)
     929          15 :                         compress_type = inode->defrag_compress;
     930      158262 :                 else if (inode->prop_compress)
     931       24611 :                         compress_type = inode->prop_compress;
     932             : 
     933             :                 /*
     934             :                  * we need to call clear_page_dirty_for_io on each
     935             :                  * page in the range.  Otherwise applications with the file
     936             :                  * mmap'd can wander in and change the page contents while
     937             :                  * we are compressing them.
     938             :                  *
     939             :                  * If the compression fails for any reason, we set the pages
     940             :                  * dirty again later on.
     941             :                  *
     942             :                  * Note that the remaining part is redirtied, the start pointer
     943             :                  * has moved, the end is the original one.
     944             :                  */
     945      158277 :                 if (!redirty) {
     946       39760 :                         extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
     947       39760 :                         redirty = 1;
     948             :                 }
     949             : 
     950             :                 /* Compression level is applied here and only here */
     951      158285 :                 ret = btrfs_compress_pages(
     952      158285 :                         compress_type | (fs_info->compress_level << 4),
     953             :                                            mapping, start,
     954             :                                            pages,
     955             :                                            &nr_pages,
     956             :                                            &total_in,
     957             :                                            &total_compressed);
     958             : 
     959      158292 :                 if (!ret) {
     960      158289 :                         unsigned long offset = offset_in_page(total_compressed);
     961      158289 :                         struct page *page = pages[nr_pages - 1];
     962             : 
     963             :                         /* zero the tail end of the last page, we might be
     964             :                          * sending it down to disk
     965             :                          */
     966      158289 :                         if (offset)
     967      158287 :                                 memzero_page(page, offset, PAGE_SIZE - offset);
     968             :                         will_compress = 1;
     969             :                 }
     970             :         }
     971           3 : cont:
     972             :         /*
     973             :          * Check cow_file_range() for why we don't even try to create inline
     974             :          * extent for subpage case.
     975             :          */
     976      158252 :         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
     977             :                 /* lets try to make an inline extent */
     978          64 :                 if (ret || total_in < actual_end) {
     979             :                         /* we didn't compress the entire range, try
     980             :                          * to make an uncompressed inline extent.
     981             :                          */
     982          20 :                         ret = cow_file_range_inline(inode, actual_end,
     983             :                                                     0, BTRFS_COMPRESS_NONE,
     984             :                                                     NULL, false);
     985             :                 } else {
     986             :                         /* try making a compressed inline extent */
     987          44 :                         ret = cow_file_range_inline(inode, actual_end,
     988             :                                                     total_compressed,
     989             :                                                     compress_type, pages,
     990             :                                                     false);
     991             :                 }
     992          64 :                 if (ret <= 0) {
     993          23 :                         unsigned long clear_flags = EXTENT_DELALLOC |
     994             :                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
     995             :                                 EXTENT_DO_ACCOUNTING;
     996             : 
     997          23 :                         if (ret < 0)
     998           0 :                                 mapping_set_error(mapping, -EIO);
     999             : 
    1000             :                         /*
    1001             :                          * inline extent creation worked or returned error,
    1002             :                          * we don't need to create any more async work items.
    1003             :                          * Unlock and free up our temp pages.
    1004             :                          *
    1005             :                          * We use DO_ACCOUNTING here because we need the
    1006             :                          * delalloc_release_metadata to be done _after_ we drop
    1007             :                          * our outstanding extent for clearing delalloc for this
    1008             :                          * range.
    1009             :                          */
    1010          23 :                         extent_clear_unlock_delalloc(inode, start, end,
    1011             :                                                      NULL,
    1012             :                                                      clear_flags,
    1013             :                                                      PAGE_UNLOCK |
    1014             :                                                      PAGE_START_WRITEBACK |
    1015             :                                                      PAGE_END_WRITEBACK);
    1016             : 
    1017             :                         /*
    1018             :                          * Ensure we only free the compressed pages if we have
    1019             :                          * them allocated, as we can still reach here with
    1020             :                          * inode_need_compress() == false.
    1021             :                          */
    1022          23 :                         if (pages) {
    1023          46 :                                 for (i = 0; i < nr_pages; i++) {
    1024          23 :                                         WARN_ON(pages[i]->mapping);
    1025          23 :                                         put_page(pages[i]);
    1026             :                                 }
    1027          23 :                                 kfree(pages);
    1028             :                         }
    1029          23 :                         return 0;
    1030             :                 }
    1031             :         }
    1032             : 
    1033      158229 :         if (will_compress) {
    1034             :                 /*
    1035             :                  * we aren't doing an inline extent round the compressed size
    1036             :                  * up to a block size boundary so the allocator does sane
    1037             :                  * things
    1038             :                  */
    1039      158226 :                 total_compressed = ALIGN(total_compressed, blocksize);
    1040             : 
    1041             :                 /*
    1042             :                  * one last check to make sure the compression is really a
    1043             :                  * win, compare the page count read with the blocks on disk,
    1044             :                  * compression must free at least one sector size
    1045             :                  */
    1046      158226 :                 total_in = round_up(total_in, fs_info->sectorsize);
    1047      158226 :                 if (total_compressed + blocksize <= total_in) {
    1048      158226 :                         compressed_extents++;
    1049             : 
    1050             :                         /*
    1051             :                          * The async work queues will take care of doing actual
    1052             :                          * allocation on disk for these compressed pages, and
    1053             :                          * will submit them to the elevator.
    1054             :                          */
    1055      158226 :                         add_async_extent(async_chunk, start, total_in,
    1056             :                                         total_compressed, pages, nr_pages,
    1057             :                                         compress_type);
    1058             : 
    1059      158223 :                         if (start + total_in < end) {
    1060      118490 :                                 start += total_in;
    1061      118490 :                                 pages = NULL;
    1062      118490 :                                 cond_resched();
    1063      118467 :                                 goto again;
    1064             :                         }
    1065       39733 :                         return compressed_extents;
    1066             :                 }
    1067             :         }
    1068           3 :         if (pages) {
    1069             :                 /*
    1070             :                  * the compression code ran but failed to make things smaller,
    1071             :                  * free any pages it allocated and our page pointer array
    1072             :                  */
    1073           6 :                 for (i = 0; i < nr_pages; i++) {
    1074           3 :                         WARN_ON(pages[i]->mapping);
    1075           3 :                         put_page(pages[i]);
    1076             :                 }
    1077           3 :                 kfree(pages);
    1078           3 :                 pages = NULL;
    1079           3 :                 total_compressed = 0;
    1080           3 :                 nr_pages = 0;
    1081             : 
    1082             :                 /* flag the file so we don't compress in the future */
    1083           3 :                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
    1084           0 :                     !(inode->prop_compress)) {
    1085           0 :                         inode->flags |= BTRFS_INODE_NOCOMPRESS;
    1086             :                 }
    1087             :         }
    1088           3 : cleanup_and_bail_uncompressed:
    1089             :         /*
    1090             :          * No compression, but we still need to write the pages in the file
    1091             :          * we've been given so far.  redirty the locked page if it corresponds
    1092             :          * to our extent and set things up for the async work queue to run
    1093             :          * cow_file_range to do the normal delalloc dance.
    1094             :          */
    1095         216 :         if (async_chunk->locked_page &&
    1096         222 :             (page_offset(async_chunk->locked_page) >= start &&
    1097             :              page_offset(async_chunk->locked_page)) <= end) {
    1098         216 :                 __set_page_dirty_nobuffers(async_chunk->locked_page);
    1099             :                 /* unlocked later on in the async handlers */
    1100             :         }
    1101             : 
    1102         216 :         if (redirty)
    1103           5 :                 extent_range_redirty_for_io(&inode->vfs_inode, start, end);
    1104         216 :         add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
    1105             :                          BTRFS_COMPRESS_NONE);
    1106         216 :         compressed_extents++;
    1107             : 
    1108         216 :         return compressed_extents;
    1109             : }
    1110             : 
    1111           0 : static void free_async_extent_pages(struct async_extent *async_extent)
    1112             : {
    1113           0 :         int i;
    1114             : 
    1115           0 :         if (!async_extent->pages)
    1116             :                 return;
    1117             : 
    1118           0 :         for (i = 0; i < async_extent->nr_pages; i++) {
    1119           0 :                 WARN_ON(async_extent->pages[i]->mapping);
    1120           0 :                 put_page(async_extent->pages[i]);
    1121             :         }
    1122           0 :         kfree(async_extent->pages);
    1123           0 :         async_extent->nr_pages = 0;
    1124           0 :         async_extent->pages = NULL;
    1125             : }
    1126             : 
    1127         216 : static int submit_uncompressed_range(struct btrfs_inode *inode,
    1128             :                                      struct async_extent *async_extent,
    1129             :                                      struct page *locked_page)
    1130             : {
    1131         216 :         u64 start = async_extent->start;
    1132         216 :         u64 end = async_extent->start + async_extent->ram_size - 1;
    1133         216 :         unsigned long nr_written = 0;
    1134         216 :         int page_started = 0;
    1135         216 :         int ret;
    1136         216 :         struct writeback_control wbc = {
    1137             :                 .sync_mode              = WB_SYNC_ALL,
    1138             :                 .range_start            = start,
    1139             :                 .range_end              = end,
    1140             :                 .no_cgroup_owner        = 1,
    1141             :         };
    1142             : 
    1143             :         /*
    1144             :          * Call cow_file_range() to run the delalloc range directly, since we
    1145             :          * won't go to NOCOW or async path again.
    1146             :          *
    1147             :          * Also we call cow_file_range() with @unlock_page == 0, so that we
    1148             :          * can directly submit them without interruption.
    1149             :          */
    1150         216 :         ret = cow_file_range(inode, locked_page, start, end, &page_started,
    1151             :                              &nr_written, 0, NULL);
    1152             :         /* Inline extent inserted, page gets unlocked and everything is done */
    1153         216 :         if (page_started)
    1154             :                 return 0;
    1155             : 
    1156         216 :         if (ret < 0) {
    1157           0 :                 btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
    1158           0 :                 if (locked_page) {
    1159           0 :                         const u64 page_start = page_offset(locked_page);
    1160           0 :                         const u64 page_end = page_start + PAGE_SIZE - 1;
    1161             : 
    1162           0 :                         set_page_writeback(locked_page);
    1163           0 :                         end_page_writeback(locked_page);
    1164           0 :                         end_extent_writepage(locked_page, ret, page_start, page_end);
    1165           0 :                         unlock_page(locked_page);
    1166             :                 }
    1167           0 :                 return ret;
    1168             :         }
    1169             : 
    1170             :         /* All pages will be unlocked, including @locked_page */
    1171         216 :         wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
    1172         216 :         ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc);
    1173         216 :         wbc_detach_inode(&wbc);
    1174         216 :         return ret;
    1175             : }
    1176             : 
    1177      158498 : static int submit_one_async_extent(struct btrfs_inode *inode,
    1178             :                                    struct async_chunk *async_chunk,
    1179             :                                    struct async_extent *async_extent,
    1180             :                                    u64 *alloc_hint)
    1181             : {
    1182      158498 :         struct extent_io_tree *io_tree = &inode->io_tree;
    1183      158498 :         struct btrfs_root *root = inode->root;
    1184      158498 :         struct btrfs_fs_info *fs_info = root->fs_info;
    1185      158498 :         struct btrfs_ordered_extent *ordered;
    1186      158498 :         struct btrfs_key ins;
    1187      158498 :         struct page *locked_page = NULL;
    1188      158498 :         struct extent_map *em;
    1189      158498 :         int ret = 0;
    1190      158498 :         u64 start = async_extent->start;
    1191      158498 :         u64 end = async_extent->start + async_extent->ram_size - 1;
    1192             : 
    1193      158498 :         if (async_chunk->blkcg_css)
    1194      158498 :                 kthread_associate_blkcg(async_chunk->blkcg_css);
    1195             : 
    1196             :         /*
    1197             :          * If async_chunk->locked_page is in the async_extent range, we need to
    1198             :          * handle it.
    1199             :          */
    1200      158498 :         if (async_chunk->locked_page) {
    1201        1184 :                 u64 locked_page_start = page_offset(async_chunk->locked_page);
    1202        1184 :                 u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
    1203             : 
    1204        1184 :                 if (!(start >= locked_page_end || end <= locked_page_start))
    1205         624 :                         locked_page = async_chunk->locked_page;
    1206             :         }
    1207      158498 :         lock_extent(io_tree, start, end, NULL);
    1208             : 
    1209             :         /* We have fall back to uncompressed write */
    1210      158498 :         if (!async_extent->pages) {
    1211         216 :                 ret = submit_uncompressed_range(inode, async_extent, locked_page);
    1212         216 :                 goto done;
    1213             :         }
    1214             : 
    1215      158282 :         ret = btrfs_reserve_extent(root, async_extent->ram_size,
    1216             :                                    async_extent->compressed_size,
    1217             :                                    async_extent->compressed_size,
    1218             :                                    0, *alloc_hint, &ins, 1, 1);
    1219      158282 :         if (ret) {
    1220           0 :                 free_async_extent_pages(async_extent);
    1221             :                 /*
    1222             :                  * Here we used to try again by going back to non-compressed
    1223             :                  * path for ENOSPC.  But we can't reserve space even for
    1224             :                  * compressed size, how could it work for uncompressed size
    1225             :                  * which requires larger size?  So here we directly go error
    1226             :                  * path.
    1227             :                  */
    1228           0 :                 goto out_free;
    1229             :         }
    1230             : 
    1231             :         /* Here we're doing allocation and writeback of the compressed pages */
    1232      158282 :         em = create_io_em(inode, start,
    1233             :                           async_extent->ram_size,    /* len */
    1234             :                           start,                        /* orig_start */
    1235             :                           ins.objectid,                 /* block_start */
    1236             :                           ins.offset,                   /* block_len */
    1237             :                           ins.offset,                   /* orig_block_len */
    1238             :                           async_extent->ram_size,    /* ram_bytes */
    1239             :                           async_extent->compress_type,
    1240             :                           BTRFS_ORDERED_COMPRESSED);
    1241      158282 :         if (IS_ERR(em)) {
    1242           0 :                 ret = PTR_ERR(em);
    1243           0 :                 goto out_free_reserve;
    1244             :         }
    1245      158282 :         free_extent_map(em);
    1246             : 
    1247      158282 :         ordered = btrfs_alloc_ordered_extent(inode, start,      /* file_offset */
    1248             :                                        async_extent->ram_size,       /* num_bytes */
    1249             :                                        async_extent->ram_size,       /* ram_bytes */
    1250             :                                        ins.objectid,            /* disk_bytenr */
    1251             :                                        ins.offset,              /* disk_num_bytes */
    1252             :                                        0,                       /* offset */
    1253             :                                        1 << BTRFS_ORDERED_COMPRESSED,
    1254             :                                        async_extent->compress_type);
    1255      158282 :         if (IS_ERR(ordered)) {
    1256           0 :                 btrfs_drop_extent_map_range(inode, start, end, false);
    1257           0 :                 ret = PTR_ERR(ordered);
    1258           0 :                 goto out_free_reserve;
    1259             :         }
    1260      158282 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
    1261             : 
    1262             :         /* Clear dirty, set writeback and unlock the pages. */
    1263      158282 :         extent_clear_unlock_delalloc(inode, start, end,
    1264             :                         NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
    1265             :                         PAGE_UNLOCK | PAGE_START_WRITEBACK);
    1266      158282 :         btrfs_submit_compressed_write(ordered,
    1267             :                             async_extent->pages,     /* compressed_pages */
    1268      158282 :                             async_extent->nr_pages,
    1269             :                             async_chunk->write_flags, true);
    1270      158282 :         *alloc_hint = ins.objectid + ins.offset;
    1271      158498 : done:
    1272      158498 :         if (async_chunk->blkcg_css)
    1273      158498 :                 kthread_associate_blkcg(NULL);
    1274      158498 :         kfree(async_extent);
    1275      158498 :         return ret;
    1276             : 
    1277           0 : out_free_reserve:
    1278           0 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
    1279           0 :         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
    1280           0 : out_free:
    1281           0 :         mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
    1282           0 :         extent_clear_unlock_delalloc(inode, start, end,
    1283             :                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
    1284             :                                      EXTENT_DELALLOC_NEW |
    1285             :                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
    1286             :                                      PAGE_UNLOCK | PAGE_START_WRITEBACK |
    1287             :                                      PAGE_END_WRITEBACK);
    1288           0 :         free_async_extent_pages(async_extent);
    1289           0 :         goto done;
    1290             : }
    1291             : 
    1292             : /*
    1293             :  * Phase two of compressed writeback.  This is the ordered portion of the code,
    1294             :  * which only gets called in the order the work was queued.  We walk all the
    1295             :  * async extents created by compress_file_range and send them down to the disk.
    1296             :  */
    1297       39958 : static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
    1298             : {
    1299       39958 :         struct btrfs_inode *inode = async_chunk->inode;
    1300       39958 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1301       39958 :         struct async_extent *async_extent;
    1302       39958 :         u64 alloc_hint = 0;
    1303       39958 :         int ret = 0;
    1304             : 
    1305      198456 :         while (!list_empty(&async_chunk->extents)) {
    1306      158498 :                 u64 extent_start;
    1307      158498 :                 u64 ram_size;
    1308             : 
    1309      158498 :                 async_extent = list_entry(async_chunk->extents.next,
    1310             :                                           struct async_extent, list);
    1311      158498 :                 list_del(&async_extent->list);
    1312      158498 :                 extent_start = async_extent->start;
    1313      158498 :                 ram_size = async_extent->ram_size;
    1314             : 
    1315      158498 :                 ret = submit_one_async_extent(inode, async_chunk, async_extent,
    1316             :                                               &alloc_hint);
    1317      158498 :                 btrfs_debug(fs_info,
    1318             : "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
    1319             :                             inode->root->root_key.objectid,
    1320             :                             btrfs_ino(inode), extent_start, ram_size, ret);
    1321             :         }
    1322       39958 : }
    1323             : 
    1324     3044755 : static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
    1325             :                                       u64 num_bytes)
    1326             : {
    1327     3044755 :         struct extent_map_tree *em_tree = &inode->extent_tree;
    1328     3044755 :         struct extent_map *em;
    1329     3044755 :         u64 alloc_hint = 0;
    1330             : 
    1331     3044755 :         read_lock(&em_tree->lock);
    1332     3044895 :         em = search_extent_mapping(em_tree, start, num_bytes);
    1333     3044939 :         if (em) {
    1334             :                 /*
    1335             :                  * if block start isn't an actual block number then find the
    1336             :                  * first block in this inode and use that as a hint.  If that
    1337             :                  * block is also bogus then just don't worry about it.
    1338             :                  */
    1339     2433437 :                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
    1340     1122762 :                         free_extent_map(em);
    1341     1122750 :                         em = search_extent_mapping(em_tree, 0, 0);
    1342     1122752 :                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
    1343      655315 :                                 alloc_hint = em->block_start;
    1344     1122752 :                         if (em)
    1345     1122751 :                                 free_extent_map(em);
    1346             :                 } else {
    1347     1310675 :                         alloc_hint = em->block_start;
    1348     1310675 :                         free_extent_map(em);
    1349             :                 }
    1350             :         }
    1351     3044931 :         read_unlock(&em_tree->lock);
    1352             : 
    1353     3044918 :         return alloc_hint;
    1354             : }
    1355             : 
    1356             : /*
    1357             :  * when extent_io.c finds a delayed allocation range in the file,
    1358             :  * the call backs end up in this code.  The basic idea is to
    1359             :  * allocate extents on disk for the range, and create ordered data structs
    1360             :  * in ram to track those extents.
    1361             :  *
    1362             :  * locked_page is the page that writepage had locked already.  We use
    1363             :  * it to make sure we don't do extra locks or unlocks.
    1364             :  *
    1365             :  * *page_started is set to one if we unlock locked_page and do everything
    1366             :  * required to start IO on it.  It may be clean and already done with
    1367             :  * IO when we return.
    1368             :  *
    1369             :  * When unlock == 1, we unlock the pages in successfully allocated regions.
    1370             :  * When unlock == 0, we leave them locked for writing them out.
    1371             :  *
    1372             :  * However, we unlock all the pages except @locked_page in case of failure.
    1373             :  *
    1374             :  * In summary, page locking state will be as follows:
    1375             :  *
    1376             :  * - page_started == 1 (return value)
    1377             :  *     - All the pages are unlocked. IO is started.
    1378             :  *     - Note that this can happen only on success
    1379             :  * - unlock == 1
    1380             :  *     - All the pages except @locked_page are unlocked in any case
    1381             :  * - unlock == 0
    1382             :  *     - On success, all the pages are locked for writing them out
    1383             :  *     - On failure, all the pages except @locked_page are unlocked
    1384             :  *
    1385             :  * When a failure happens in the second or later iteration of the
    1386             :  * while-loop, the ordered extents created in previous iterations are kept
    1387             :  * intact. So, the caller must clean them up by calling
    1388             :  * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
    1389             :  * example.
    1390             :  */
    1391     3339951 : static noinline int cow_file_range(struct btrfs_inode *inode,
    1392             :                                    struct page *locked_page,
    1393             :                                    u64 start, u64 end, int *page_started,
    1394             :                                    unsigned long *nr_written, int unlock,
    1395             :                                    u64 *done_offset)
    1396             : {
    1397     3339951 :         struct btrfs_root *root = inode->root;
    1398     3339951 :         struct btrfs_fs_info *fs_info = root->fs_info;
    1399     3339951 :         u64 alloc_hint = 0;
    1400     3339951 :         u64 orig_start = start;
    1401     3339951 :         u64 num_bytes;
    1402     3339951 :         unsigned long ram_size;
    1403     3339951 :         u64 cur_alloc_size = 0;
    1404     3339951 :         u64 min_alloc_size;
    1405     3339951 :         u64 blocksize = fs_info->sectorsize;
    1406     3339951 :         struct btrfs_key ins;
    1407     3339951 :         struct extent_map *em;
    1408     3339951 :         unsigned clear_bits;
    1409     3339951 :         unsigned long page_ops;
    1410     3339951 :         bool extent_reserved = false;
    1411     3339951 :         int ret = 0;
    1412             : 
    1413     6679902 :         if (btrfs_is_free_space_inode(inode)) {
    1414           0 :                 ret = -EINVAL;
    1415           0 :                 goto out_unlock;
    1416             :         }
    1417             : 
    1418     3339951 :         num_bytes = ALIGN(end - start + 1, blocksize);
    1419     3339951 :         num_bytes = max(blocksize,  num_bytes);
    1420     3339951 :         ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
    1421             : 
    1422     3339951 :         inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
    1423             : 
    1424             :         /*
    1425             :          * Due to the page size limit, for subpage we can only trigger the
    1426             :          * writeback for the dirty sectors of page, that means data writeback
    1427             :          * is doing more writeback than what we want.
    1428             :          *
    1429             :          * This is especially unexpected for some call sites like fallocate,
    1430             :          * where we only increase i_size after everything is done.
    1431             :          * This means we can trigger inline extent even if we didn't want to.
    1432             :          * So here we skip inline extent creation completely.
    1433             :          */
    1434     3339854 :         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
    1435     1724059 :                 u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
    1436             :                                        end + 1);
    1437             : 
    1438             :                 /* lets try to make an inline extent */
    1439     1724059 :                 ret = cow_file_range_inline(inode, actual_end, 0,
    1440             :                                             BTRFS_COMPRESS_NONE, NULL, false);
    1441     1724199 :                 if (ret == 0) {
    1442             :                         /*
    1443             :                          * We use DO_ACCOUNTING here because we need the
    1444             :                          * delalloc_release_metadata to be run _after_ we drop
    1445             :                          * our outstanding extent for clearing delalloc for this
    1446             :                          * range.
    1447             :                          */
    1448     1077485 :                         extent_clear_unlock_delalloc(inode, start, end,
    1449             :                                      locked_page,
    1450             :                                      EXTENT_LOCKED | EXTENT_DELALLOC |
    1451             :                                      EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
    1452             :                                      EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
    1453             :                                      PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
    1454     1077505 :                         *nr_written = *nr_written +
    1455     1077505 :                              (end - start + PAGE_SIZE) / PAGE_SIZE;
    1456     1077505 :                         *page_started = 1;
    1457             :                         /*
    1458             :                          * locked_page is locked by the caller of
    1459             :                          * writepage_delalloc(), not locked by
    1460             :                          * __process_pages_contig().
    1461             :                          *
    1462             :                          * We can't let __process_pages_contig() to unlock it,
    1463             :                          * as it doesn't have any subpage::writers recorded.
    1464             :                          *
    1465             :                          * Here we manually unlock the page, since the caller
    1466             :                          * can't use page_started to determine if it's an
    1467             :                          * inline extent or a compressed extent.
    1468             :                          */
    1469     1077505 :                         unlock_page(locked_page);
    1470     1077506 :                         goto out;
    1471      646714 :                 } else if (ret < 0) {
    1472           0 :                         goto out_unlock;
    1473             :                 }
    1474             :         }
    1475             : 
    1476     2262509 :         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
    1477             : 
    1478             :         /*
    1479             :          * Relocation relies on the relocated extents to have exactly the same
    1480             :          * size as the original extents. Normally writeback for relocation data
    1481             :          * extents follows a NOCOW path because relocation preallocates the
    1482             :          * extents. However, due to an operation such as scrub turning a block
    1483             :          * group to RO mode, it may fallback to COW mode, so we must make sure
    1484             :          * an extent allocated during COW has exactly the requested size and can
    1485             :          * not be split into smaller extents, otherwise relocation breaks and
    1486             :          * fails during the stage where it updates the bytenr of file extent
    1487             :          * items.
    1488             :          */
    1489     2262764 :         if (btrfs_is_data_reloc_root(root))
    1490             :                 min_alloc_size = num_bytes;
    1491             :         else
    1492     2262764 :                 min_alloc_size = fs_info->sectorsize;
    1493             : 
    1494     4532243 :         while (num_bytes > 0) {
    1495     2269562 :                 struct btrfs_ordered_extent *ordered;
    1496             : 
    1497     2269562 :                 cur_alloc_size = num_bytes;
    1498     2269562 :                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
    1499             :                                            min_alloc_size, 0, alloc_hint,
    1500             :                                            &ins, 1, 1);
    1501     2269460 :                 if (ret < 0)
    1502           0 :                         goto out_unlock;
    1503     2269460 :                 cur_alloc_size = ins.offset;
    1504     2269460 :                 extent_reserved = true;
    1505             : 
    1506     2269460 :                 ram_size = ins.offset;
    1507     2269460 :                 em = create_io_em(inode, start, ins.offset, /* len */
    1508             :                                   start, /* orig_start */
    1509             :                                   ins.objectid, /* block_start */
    1510             :                                   ins.offset, /* block_len */
    1511             :                                   ins.offset, /* orig_block_len */
    1512             :                                   ram_size, /* ram_bytes */
    1513             :                                   BTRFS_COMPRESS_NONE, /* compress_type */
    1514             :                                   BTRFS_ORDERED_REGULAR /* type */);
    1515     2269490 :                 if (IS_ERR(em)) {
    1516           0 :                         ret = PTR_ERR(em);
    1517           0 :                         goto out_reserve;
    1518             :                 }
    1519     2269490 :                 free_extent_map(em);
    1520             : 
    1521     2269530 :                 ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
    1522             :                                         ram_size, ins.objectid, cur_alloc_size,
    1523             :                                         0, 1 << BTRFS_ORDERED_REGULAR,
    1524             :                                         BTRFS_COMPRESS_NONE);
    1525     2269554 :                 if (IS_ERR(ordered)) {
    1526           0 :                         ret = PTR_ERR(ordered);
    1527           0 :                         goto out_drop_extent_cache;
    1528             :                 }
    1529             : 
    1530     2269554 :                 if (btrfs_is_data_reloc_root(root)) {
    1531           0 :                         ret = btrfs_reloc_clone_csums(ordered);
    1532             : 
    1533             :                         /*
    1534             :                          * Only drop cache here, and process as normal.
    1535             :                          *
    1536             :                          * We must not allow extent_clear_unlock_delalloc()
    1537             :                          * at out_unlock label to free meta of this ordered
    1538             :                          * extent, as its meta should be freed by
    1539             :                          * btrfs_finish_ordered_io().
    1540             :                          *
    1541             :                          * So we must continue until @start is increased to
    1542             :                          * skip current ordered extent.
    1543             :                          */
    1544           0 :                         if (ret)
    1545           0 :                                 btrfs_drop_extent_map_range(inode, start,
    1546           0 :                                                             start + ram_size - 1,
    1547             :                                                             false);
    1548             :                 }
    1549     2269554 :                 btrfs_put_ordered_extent(ordered);
    1550             : 
    1551     2269529 :                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
    1552             : 
    1553             :                 /*
    1554             :                  * We're not doing compressed IO, don't unlock the first page
    1555             :                  * (which the caller expects to stay locked), don't clear any
    1556             :                  * dirty bits and don't set any writeback bits
    1557             :                  *
    1558             :                  * Do set the Ordered (Private2) bit so we know this page was
    1559             :                  * properly setup for writepage.
    1560             :                  */
    1561     2269551 :                 page_ops = unlock ? PAGE_UNLOCK : 0;
    1562     2269551 :                 page_ops |= PAGE_SET_ORDERED;
    1563             : 
    1564     2269551 :                 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
    1565             :                                              locked_page,
    1566             :                                              EXTENT_LOCKED | EXTENT_DELALLOC,
    1567             :                                              page_ops);
    1568     2269479 :                 if (num_bytes < cur_alloc_size)
    1569             :                         num_bytes = 0;
    1570             :                 else
    1571     2269475 :                         num_bytes -= cur_alloc_size;
    1572     2269479 :                 alloc_hint = ins.objectid + ins.offset;
    1573     2269479 :                 start += cur_alloc_size;
    1574     2269479 :                 extent_reserved = false;
    1575             : 
    1576             :                 /*
    1577             :                  * btrfs_reloc_clone_csums() error, since start is increased
    1578             :                  * extent_clear_unlock_delalloc() at out_unlock label won't
    1579             :                  * free metadata of current ordered extent, we're OK to exit.
    1580             :                  */
    1581     2269479 :                 if (ret)
    1582           0 :                         goto out_unlock;
    1583             :         }
    1584     2262681 : out:
    1585             :         return ret;
    1586             : 
    1587             : out_drop_extent_cache:
    1588           0 :         btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
    1589           0 : out_reserve:
    1590           0 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
    1591           0 :         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
    1592           0 : out_unlock:
    1593             :         /*
    1594             :          * If done_offset is non-NULL and ret == -EAGAIN, we expect the
    1595             :          * caller to write out the successfully allocated region and retry.
    1596             :          */
    1597           0 :         if (done_offset && ret == -EAGAIN) {
    1598           0 :                 if (orig_start < start)
    1599           0 :                         *done_offset = start - 1;
    1600             :                 else
    1601           0 :                         *done_offset = start;
    1602           0 :                 return ret;
    1603           0 :         } else if (ret == -EAGAIN) {
    1604             :                 /* Convert to -ENOSPC since the caller cannot retry. */
    1605           0 :                 ret = -ENOSPC;
    1606             :         }
    1607             : 
    1608             :         /*
    1609             :          * Now, we have three regions to clean up:
    1610             :          *
    1611             :          * |-------(1)----|---(2)---|-------------(3)----------|
    1612             :          * `- orig_start  `- start  `- start + cur_alloc_size  `- end
    1613             :          *
    1614             :          * We process each region below.
    1615             :          */
    1616             : 
    1617           0 :         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
    1618             :                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
    1619           0 :         page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
    1620             : 
    1621             :         /*
    1622             :          * For the range (1). We have already instantiated the ordered extents
    1623             :          * for this region. They are cleaned up by
    1624             :          * btrfs_cleanup_ordered_extents() in e.g,
    1625             :          * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
    1626             :          * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
    1627             :          * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
    1628             :          * function.
    1629             :          *
    1630             :          * However, in case of unlock == 0, we still need to unlock the pages
    1631             :          * (except @locked_page) to ensure all the pages are unlocked.
    1632             :          */
    1633           0 :         if (!unlock && orig_start < start) {
    1634           0 :                 if (!locked_page)
    1635           0 :                         mapping_set_error(inode->vfs_inode.i_mapping, ret);
    1636           0 :                 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
    1637             :                                              locked_page, 0, page_ops);
    1638             :         }
    1639             : 
    1640             :         /*
    1641             :          * For the range (2). If we reserved an extent for our delalloc range
    1642             :          * (or a subrange) and failed to create the respective ordered extent,
    1643             :          * then it means that when we reserved the extent we decremented the
    1644             :          * extent's size from the data space_info's bytes_may_use counter and
    1645             :          * incremented the space_info's bytes_reserved counter by the same
    1646             :          * amount. We must make sure extent_clear_unlock_delalloc() does not try
    1647             :          * to decrement again the data space_info's bytes_may_use counter,
    1648             :          * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
    1649             :          */
    1650           0 :         if (extent_reserved) {
    1651           0 :                 extent_clear_unlock_delalloc(inode, start,
    1652           0 :                                              start + cur_alloc_size - 1,
    1653             :                                              locked_page,
    1654             :                                              clear_bits,
    1655             :                                              page_ops);
    1656           0 :                 start += cur_alloc_size;
    1657           0 :                 if (start >= end)
    1658             :                         return ret;
    1659             :         }
    1660             : 
    1661             :         /*
    1662             :          * For the range (3). We never touched the region. In addition to the
    1663             :          * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
    1664             :          * space_info's bytes_may_use counter, reserved in
    1665             :          * btrfs_check_data_free_space().
    1666             :          */
    1667           0 :         extent_clear_unlock_delalloc(inode, start, end, locked_page,
    1668             :                                      clear_bits | EXTENT_CLEAR_DATA_RESV,
    1669             :                                      page_ops);
    1670           0 :         return ret;
    1671             : }
    1672             : 
    1673             : /*
    1674             :  * Work queue callback that starts compression of a file range.
                     :  *
                     :  * @work is embedded in a struct async_chunk, which is recovered with
                     :  * container_of() and handed to compress_file_range().  When that call
                     :  * reports zero compressed extents, the inode reference carried by the
                     :  * chunk is dropped with btrfs_add_delayed_iput() and ->inode is set to
                     :  * NULL; async_cow_submit() and async_cow_free() both test ->inode
                     :  * before using it.
    1675             :  */
    1676       39978 : static noinline void async_cow_start(struct btrfs_work *work)
    1677             : {
    1678       39978 :         struct async_chunk *async_chunk;
    1679       39978 :         int compressed_extents;
    1680             : 
    1681       39978 :         async_chunk = container_of(work, struct async_chunk, work);
    1682             : 
    1683       39978 :         compressed_extents = compress_file_range(async_chunk);
    1684       39963 :         if (compressed_extents == 0) {
    1685          23 :                 btrfs_add_delayed_iput(async_chunk->inode);
    1686          23 :                 async_chunk->inode = NULL;
    1687             :         }
    1688       39963 : }
    1689             : 
    1690             : /*
    1691             :  * Work queue callback that submits previously compressed pages.
                     :  *
                     :  * The number of pages covered by the chunk is always subtracted from
                     :  * fs_info->async_delalloc_pages, whether or not anything is submitted.
                     :  * If the counter then reads below 5 * SZ_1M, cond_wake_up_nomb() is
                     :  * called on fs_info->async_submit_wait.
    1692             :  */
    1693       39981 : static noinline void async_cow_submit(struct btrfs_work *work)
    1694             : {
    1695       39981 :         struct async_chunk *async_chunk = container_of(work, struct async_chunk,
    1696             :                                                      work);
    1697       39981 :         struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
    1698       39981 :         unsigned long nr_pages;
    1699             : 
    1700       39981 :         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
    1701             :                 PAGE_SHIFT;
    1702             : 
    1703             :         /*
    1704             :          * ->inode could be NULL if async_cow_start() got no compressed
    1705             :          * extents, in which case we don't have anything to submit, yet we
    1706             :          * need to always adjust ->async_delalloc_pages as it is paired with
    1707             :          * the atomic_add() happening in run_delalloc_compressed()
    1708             :          */
    1709       39981 :         if (async_chunk->inode)
    1710       39958 :                 submit_compressed_extents(async_chunk);
    1711             : 
    1712             :         /* atomic_sub_return implies a barrier */
    1713       39981 :         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
    1714             :             5 * SZ_1M)
    1715       39981 :                 cond_wake_up_nomb(&fs_info->async_submit_wait);
    1716       39981 : }
    1717             : 
    /*
     * Work queue callback that releases what a finished async_chunk holds.
     *
     * Drops the inode reference with btrfs_add_delayed_iput() if ->inode was
     * not already cleared by async_cow_start(), puts ->blkcg_css if one was
     * attached, and decrements the num_chunks counter of the owning
     * struct async_cow.  The chunk that brings the counter to zero frees the
     * whole async_cow (which contains every chunk) with kvfree().
     */
    1718       39981 : static noinline void async_cow_free(struct btrfs_work *work)
    1719             : {
    1720       39981 :         struct async_chunk *async_chunk;
    1721       39981 :         struct async_cow *async_cow;
    1722             : 
    1723       39981 :         async_chunk = container_of(work, struct async_chunk, work);
    1724       39981 :         if (async_chunk->inode)
    1725       39958 :                 btrfs_add_delayed_iput(async_chunk->inode);
    1726       39981 :         if (async_chunk->blkcg_css)
    1727       39981 :                 css_put(async_chunk->blkcg_css);
    1728             : 
    1729       39981 :         async_cow = async_chunk->async_cow;
    1730       39981 :         if (atomic_dec_and_test(&async_cow->num_chunks))
    1731         647 :                 kvfree(async_cow);
    1732       39981 : }
    1733             : 
    /*
     * Split the range [@start, @end] into pieces of at most SZ_512K and queue
     * one async_chunk per piece on fs_info->delalloc_workers, using
     * async_cow_start(), async_cow_submit() and async_cow_free() as the work
     * callbacks.
     *
     * All chunks live in one kvmalloc()ed struct async_cow whose num_chunks
     * counter is initialised to the number of chunks.  The allocation is made
     * between memalloc_nofs_save() and memalloc_nofs_restore().
     *
     * Every chunk takes its own ihold() on the inode and, when the blkcg css
     * of @wbc is not blkcg_root_css, a css_get() plus REQ_BTRFS_CGROUP_PUNT
     * in its write flags.  Only the first chunk is handed @locked_page.
     *
     * Returns false, with nothing queued or modified, if the context could
     * not be allocated.  Otherwise the extent range is unlocked,
     * BTRFS_INODE_HAS_ASYNC_EXTENT is set, the page count of every chunk is
     * added to *@nr_written and to fs_info->async_delalloc_pages,
     * *@page_started is set to 1 and true is returned.
     *
     * NOTE(review): cur_end is an inclusive offset, yet num_chunks and
     * nr_pages are derived from (end - start) and (cur_end - start) without
     * the + 1, while async_cow_submit() subtracts a page count rounded up
     * from the full inclusive length.  The two presumably agree because the
     * ranges are page aligned - confirm against the callers.
     */
    1734         647 : static bool run_delalloc_compressed(struct btrfs_inode *inode,
    1735             :                                     struct writeback_control *wbc,
    1736             :                                     struct page *locked_page,
    1737             :                                     u64 start, u64 end, int *page_started,
    1738             :                                     unsigned long *nr_written)
    1739             : {
    1740         647 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1741         647 :         struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
    1742         647 :         struct async_cow *ctx;
    1743         647 :         struct async_chunk *async_chunk;
    1744         647 :         unsigned long nr_pages;
    1745         647 :         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
    1746         647 :         int i;
    1747         647 :         unsigned nofs_flag;
    1748         647 :         const blk_opf_t write_flags = wbc_to_write_flags(wbc);
    1749             : 
    1750         647 :         nofs_flag = memalloc_nofs_save();
    1751         647 :         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
    1752         647 :         memalloc_nofs_restore(nofs_flag);
    1753         647 :         if (!ctx)
    1754             :                 return false;
    1755             : 
    1756         647 :         unlock_extent(&inode->io_tree, start, end, NULL);
    1757         647 :         set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
    1758             : 
    1759         647 :         async_chunk = ctx->chunks;
    1760         647 :         atomic_set(&ctx->num_chunks, num_chunks);
    1761             : 
    1762       40628 :         for (i = 0; i < num_chunks; i++) {
    1763       39981 :                 u64 cur_end = min(end, start + SZ_512K - 1);
    1764             : 
    1765             :                 /*
    1766             :                  * igrab is called higher up in the call chain, take only the
    1767             :                  * lightweight reference for the callback lifetime
    1768             :                  */
    1769       39981 :                 ihold(&inode->vfs_inode);
    1770       39981 :                 async_chunk[i].async_cow = ctx;
    1771       39981 :                 async_chunk[i].inode = inode;
    1772       39981 :                 async_chunk[i].start = start;
    1773       39981 :                 async_chunk[i].end = cur_end;
    1774       39981 :                 async_chunk[i].write_flags = write_flags;
    1775       39981 :                 INIT_LIST_HEAD(&async_chunk[i].extents);
    1776             : 
    1777             :                 /*
    1778             :                  * The locked_page comes all the way from writepage and it is
    1779             :                  * the original page we were actually given.  As we spread
    1780             :                  * this large delalloc region across multiple async_chunk
    1781             :                  * structs, only the first struct needs a pointer to locked_page
    1782             :                  *
    1783             :                  * This way we don't need racy decisions about who is supposed
    1784             :                  * to unlock it.
    1785             :                  */
    1786       39981 :                 if (locked_page) {
    1787             :                         /*
    1788             :                          * Depending on the compressibility, the pages might or
    1789             :                          * might not go through async.  We want all of them to
    1790             :                          * be accounted against wbc once.  Let's do it here
    1791             :                          * before the paths diverge.  wbc accounting is used
    1792             :                          * only for foreign writeback detection and doesn't
    1793             :                          * need full accuracy.  Just account the whole thing
    1794             :                          * against the first page.
    1795             :                          */
    1796         647 :                         wbc_account_cgroup_owner(wbc, locked_page,
    1797         647 :                                                  cur_end - start);
    1798         647 :                         async_chunk[i].locked_page = locked_page;
    1799         647 :                         locked_page = NULL;
    1800             :                 } else {
    1801       39334 :                         async_chunk[i].locked_page = NULL;
    1802             :                 }
    1803             : 
    1804       39981 :                 if (blkcg_css != blkcg_root_css) {
    1805       39981 :                         css_get(blkcg_css);
    1806       39981 :                         async_chunk[i].blkcg_css = blkcg_css;
    1807       39981 :                         async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
    1808             :                 } else {
    1809           0 :                         async_chunk[i].blkcg_css = NULL;
    1810             :                 }
    1811             : 
    1812       39981 :                 btrfs_init_work(&async_chunk[i].work, async_cow_start,
    1813             :                                 async_cow_submit, async_cow_free);
    1814             : 
    1815       39981 :                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
    1816       39981 :                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
    1817             : 
    1818       39981 :                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
    1819             : 
    1820       39981 :                 *nr_written += nr_pages;
    1821       39981 :                 start = cur_end + 1;
    1822             :         }
    1823         647 :         *page_started = 1;
    1824         647 :         return true;
    1825             : }
    1826             : 
    /*
     * Run cow_file_range() over [@start, @end] with unlock == 0 and a
     * done_offset pointer, writing out each successfully allocated part with
     * extent_write_locked_range() before retrying the remainder.
     *
     * cow_file_range() hands back done_offset == start together with -EAGAIN
     * when no part of the range was allocated; in that case this function
     * waits on the BTRFS_FS_NEED_ZONE_FINISH bit of fs_info->flags and tries
     * again.  Any error other than -EAGAIN is returned to the caller.
     *
     * @locked_page is redirtied once, before the first write out.
     *
     * Returns 0 once the whole range has been written out (setting
     * *@page_started to 1) or as soon as cow_file_range() reports
     * *@page_started; otherwise the error from cow_file_range().
     */
    1827           0 : static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
    1828             :                                        struct page *locked_page, u64 start,
    1829             :                                        u64 end, int *page_started,
    1830             :                                        unsigned long *nr_written,
    1831             :                                        struct writeback_control *wbc)
    1832             : {
    1833           0 :         u64 done_offset = end;
    1834           0 :         int ret;
    1835           0 :         bool locked_page_done = false;
    1836             : 
    1837           0 :         while (start <= end) {
    1838           0 :                 ret = cow_file_range(inode, locked_page, start, end, page_started,
    1839             :                                      nr_written, 0, &done_offset);
    1840           0 :                 if (ret && ret != -EAGAIN)
    1841           0 :                         return ret;
    1842             : 
    1843           0 :                 if (*page_started) {
    1844             :                         ASSERT(ret == 0);
    1845             :                         return 0;
    1846             :                 }
    1847             : 
    1848           0 :                 if (ret == 0)
    1849           0 :                         done_offset = end;
    1850             : 
    1851           0 :                 if (done_offset == start) {
    1852           0 :                         wait_on_bit_io(&inode->root->fs_info->flags,
    1853             :                                        BTRFS_FS_NEED_ZONE_FINISH,
    1854             :                                        TASK_UNINTERRUPTIBLE);
    1855           0 :                         continue;
    1856             :                 }
    1857             : 
    1858           0 :                 if (!locked_page_done) {
    1859           0 :                         __set_page_dirty_nobuffers(locked_page);
    1860           0 :                         account_page_redirty(locked_page);
    1861             :                 }
    1862           0 :                 locked_page_done = true;
    1863           0 :                 extent_write_locked_range(&inode->vfs_inode, start, done_offset,
    1864             :                                           wbc);
    1865           0 :                 start = done_offset + 1;
    1866             :         }
    1867             : 
    1868           0 :         *page_started = 1;
    1869             : 
    1870           0 :         return 0;
    1871             : }
    1872             : 
    /*
     * Check whether checksums exist for the byte range
     * [@bytenr, @bytenr + @num_bytes - 1].
     *
     * The csum root is obtained with btrfs_csum_root() and queried through
     * btrfs_lookup_csums_list(), passing @nowait through.  Every
     * btrfs_ordered_sum left on the local list is unlinked and freed here;
     * the entries themselves are not used.
     *
     * Returns: < 0 if the lookup failed
     *            0 if the lookup returned 0 and the list is empty
     *            1 otherwise
     */
    1873      297257 : static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
    1874             :                                         u64 bytenr, u64 num_bytes, bool nowait)
    1875             : {
    1876      297257 :         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
    1877      297258 :         struct btrfs_ordered_sum *sums;
    1878      297258 :         int ret;
    1879      297258 :         LIST_HEAD(list);
    1880             : 
    1881      297258 :         ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
    1882             :                                       &list, 0, nowait);
    1883      297258 :         if (ret == 0 && list_empty(&list))
    1884             :                 return 0;
    1885             : 
    1886           0 :         while (!list_empty(&list)) {
    1887           0 :                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
    1888           0 :                 list_del(&sums->list);
    1889           0 :                 kfree(sums);
    1890             :         }
    1891           0 :         if (ret < 0)
    1892           0 :                 return ret;
    1893             :         return 1;
    1894             : }
    1895             : 
    /*
     * Fall back to COW for the range [@start, @end]: adjust the data
     * space_info bytes_may_use counter as explained in the comment below,
     * then call cow_file_range() with unlock == 1 and no done_offset.
     *
     * Returns whatever cow_file_range() returns.
     */
    1896     1085615 : static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
    1897             :                            const u64 start, const u64 end,
    1898             :                            int *page_started, unsigned long *nr_written)
    1899             : {
    1900     1085615 :         const bool is_space_ino = btrfs_is_free_space_inode(inode);
    1901     1085615 :         const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
    1902     1085615 :         const u64 range_bytes = end + 1 - start;
    1903     1085615 :         struct extent_io_tree *io_tree = &inode->io_tree;
    1904     1085615 :         u64 range_start = start;
    1905     1085615 :         u64 count;
    1906             : 
    1907             :         /*
    1908             :          * If EXTENT_NORESERVE is set it means that when the buffered write was
    1909             :          * made we had not enough available data space and therefore we did not
    1910             :          * reserve data space for it, since we thought we could do NOCOW for the
    1911             :          * respective file range (either there is prealloc extent or the inode
    1912             :          * has the NOCOW bit set).
    1913             :          *
    1914             :          * However when we need to fallback to COW mode (because for example the
    1915             :          * block group for the corresponding extent was turned to RO mode by a
    1916             :          * scrub or relocation) we need to do the following:
    1917             :          *
    1918             :          * 1) We increment the bytes_may_use counter of the data space info.
    1919             :          *    If COW succeeds, it allocates a new data extent and after doing
    1920             :          *    that it decrements the space info's bytes_may_use counter and
    1921             :          *    increments its bytes_reserved counter by the same amount (we do
    1922             :          *    this at btrfs_add_reserved_bytes()). So we need to increment the
    1923             :          *    bytes_may_use counter to compensate (when space is reserved at
    1924             :          *    buffered write time, the bytes_may_use counter is incremented);
    1925             :          *
    1926             :          * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
    1927             :          *    that if the COW path fails for any reason, it decrements (through
    1928             :          *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
    1929             :          *    data space info, which we incremented in the step above.
    1930             :          *
    1931             :          * If we need to fallback to cow and the inode corresponds to a free
    1932             :          * space cache inode or an inode of the data relocation tree, we must
    1933             :          * also increment bytes_may_use of the data space_info for the same
    1934             :          * reason. Space caches and relocated data extents always get a prealloc
    1935             :          * extent for them, however scrub or balance may have set the block
    1936             :          * group that contains that extent to RO mode and therefore force COW
    1937             :          * when starting writeback.
    1938             :          */
    1939     1085615 :         count = count_range_bits(io_tree, &range_start, end, range_bytes,
    1940             :                                  EXTENT_NORESERVE, 0, NULL);
    1941     1085584 :         if (count > 0 || is_space_ino || is_reloc_ino) {
    1942           0 :                 u64 bytes = count;
    1943           0 :                 struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1944           0 :                 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
    1945             : 
    1946           0 :                 if (is_space_ino || is_reloc_ino)
    1947           0 :                         bytes = range_bytes;
    1948             : 
    1949           0 :                 spin_lock(&sinfo->lock);
    1950           0 :                 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
    1951           0 :                 spin_unlock(&sinfo->lock);
    1952             : 
    1953           0 :                 if (count > 0)
    1954           0 :                         clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
    1955             :                                          NULL);
    1956             :         }
    1957             : 
    1958     1085584 :         return cow_file_range(inode, locked_page, start, end, page_started,
    1959             :                               nr_written, 1, NULL);
    1960             : }
    1961             : 
    /*
     * Input and output arguments of can_nocow_file_extent().
     *
     * @strict: when false, can_nocow_file_extent() takes the must-COW path
     *          for an extent whose generation is <= the last snapshot
     *          generation of the root.
     * @writeback_path: NOTE(review): the consumer of this flag is not visible
     *          next to this definition - confirm its meaning against
     *          can_nocow_file_extent().
     */
    1962             : struct can_nocow_file_extent_args {
    1963             :         /* Input fields. */
    1964             : 
    1965             :         /* Start file offset of the range we want to NOCOW. */
    1966             :         u64 start;
    1967             :         /* End file offset (inclusive) of the range we want to NOCOW. */
    1968             :         u64 end;
    1969             :         bool writeback_path;
    1970             :         bool strict;
    1971             :         /*
    1972             :          * Free the path passed to can_nocow_file_extent() once it's not needed
    1973             :          * anymore.
    1974             :          */
    1975             :         bool free_path;
    1976             : 
    1977             :         /* Output fields. Only set when can_nocow_file_extent() returns 1. */
    1978             : 
    1979             :         u64 disk_bytenr;
    1980             :         u64 disk_num_bytes;
    1981             :         u64 extent_offset;
    1982             :         /* Number of bytes that can be written to in NOCOW mode. */
    1983             :         u64 num_bytes;
    1984             : };
    1985             : 
    1986             : /*
    1987             :  * Check if we can NOCOW the file extent that the path points to.
    1988             :  * This function may return with the path released, so the caller should check
    1989             :  * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
    1990             :  *
    1991             :  * Returns: < 0 on error
    1992             :  *            0 if we can not NOCOW
    1993             :  *            1 if we can NOCOW
    1994             :  */
    1995     1234882 : static int can_nocow_file_extent(struct btrfs_path *path,
    1996             :                                  struct btrfs_key *key,
    1997             :                                  struct btrfs_inode *inode,
    1998             :                                  struct can_nocow_file_extent_args *args)
    1999             : {
    2000     1234882 :         const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
    2001     1234882 :         struct extent_buffer *leaf = path->nodes[0];
    2002     1234882 :         struct btrfs_root *root = inode->root;
    2003     1234882 :         struct btrfs_file_extent_item *fi;
    2004     1234882 :         u64 extent_end;
    2005     1234882 :         u8 extent_type;
    2006     1234882 :         int can_nocow = 0;
    2007     1234882 :         int ret = 0;
    2008     1234882 :         bool nowait = path->nowait;
    2009             : 
    2010     1234882 :         fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
    2011     1234867 :         extent_type = btrfs_file_extent_type(leaf, fi);
    2012             : 
    2013     1234868 :         if (extent_type == BTRFS_FILE_EXTENT_INLINE)
    2014        2630 :                 goto out;
    2015             : 
    2016             :         /* Can't access these fields unless we know it's not an inline extent. */
    2017     1232238 :         args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
    2018     1232246 :         args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
    2019     1232252 :         args->extent_offset = btrfs_file_extent_offset(leaf, fi);
    2020             : 
    2021     1232247 :         if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
    2022             :             extent_type == BTRFS_FILE_EXTENT_REG)
    2023      864266 :                 goto out;
    2024             : 
    2025             :         /*
    2026             :          * If the extent was created at or before the generation where the last
    2027             :          * snapshot for its subvolume was created, then this implies the extent is
    2028             :          * shared, hence we must COW. Skipped when args->strict is set.
    2029             :          */
    2030      735404 :         if (!args->strict &&
    2031             :             btrfs_file_extent_generation(leaf, fi) <=
    2032             :             btrfs_root_last_snapshot(&root->root_item))
    2033         972 :                 goto out;
    2034             : 
    2035             :         /* An explicit hole, must COW. */
    2036      367007 :         if (args->disk_bytenr == 0)
    2037           1 :                 goto out;
    2038             : 
    2039             :         /* Compressed/encrypted/encoded extents must be COWed. */
    2040      734012 :         if (btrfs_file_extent_compression(leaf, fi) ||
    2041      367008 :             btrfs_file_extent_encryption(leaf, fi) ||
    2042             :             btrfs_file_extent_other_encoding(leaf, fi))
    2043           0 :                 goto out;
    2044             : 
    2045      367008 :         extent_end = btrfs_file_extent_end(path);
    2046             : 
    2047             :         /*
    2048             :          * The following checks can be expensive, as they need to take other
    2049             :          * locks and do btree or rbtree searches, so release the path to avoid
    2050             :          * blocking other tasks for too long.
    2051             :          */
    2052      367007 :         btrfs_release_path(path);
    2053             : 
    2054      367008 :         ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
    2055      367008 :                                     key->offset - args->extent_offset,
    2056      367008 :                                     args->disk_bytenr, args->strict, path);
    2057      367008 :         WARN_ON_ONCE(ret > 0 && is_freespace_inode);
    2058      367008 :         if (ret != 0)
    2059       69749 :                 goto out;
    2060             : 
    2061      297259 :         if (args->free_path) {
    2062             :                 /*
    2063             :                  * We don't need the path anymore, plus through the
    2064             :                  * csum_exist_in_range() call below we will end up allocating
    2065             :                  * another path. So free the path to avoid unnecessary extra
    2066             :                  * memory usage.
    2067             :                  */
    2068       80805 :                 btrfs_free_path(path);
    2069       80805 :                 path = NULL;
    2070             :         }
    2071             : 
    2072             :         /* If there are pending snapshots for this root, we must COW. */
    2073      297258 :         if (args->writeback_path && !is_freespace_inode &&
    2074             :             atomic_read(&root->snapshot_force_cow))
    2075           1 :                 goto out;
    2076             : 
    2077      297257 :         args->disk_bytenr += args->extent_offset;
    2078      297257 :         args->disk_bytenr += args->start - key->offset;
    2079      297257 :         args->num_bytes = min(args->end + 1, extent_end) - args->start;
    2080             : 
    2081             :         /*
    2082             :          * Force COW if csums exist in the range. This ensures that csums for a
    2083             :          * given extent are either valid or do not exist.
    2084             :          */
    2085      297257 :         ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
    2086             :                                   nowait);
    2087      297258 :         WARN_ON_ONCE(ret > 0 && is_freespace_inode);
    2088      297258 :         if (ret != 0)
    2089           0 :                 goto out;
    2090             : 
    2091             :         can_nocow = 1;
    2092     1234877 :  out:
    2093     1234877 :         if (args->free_path && path)
    2094       19684 :                 btrfs_free_path(path);
    2095             : 
    2096     1234877 :         return ret < 0 ? ret : can_nocow;
    2097             : }
    2098             : 
    2099             : /*
    2100             :  * NOCOW writeback callback.  This checks for snapshots or COW copies
    2101             :  * of the extents that exist in the file, and COWs the file as required.
    2102             :  *
    2103             :  * If no COW copies or snapshots exist, we write directly to the existing
    2104             :  * blocks on disk.
    2105             :  */
    2106     1217627 : static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
    2107             :                                        struct page *locked_page,
    2108             :                                        const u64 start, const u64 end,
    2109             :                                        int *page_started,
    2110             :                                        unsigned long *nr_written)
    2111             : {
    2112     1217627 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2113     1217627 :         struct btrfs_root *root = inode->root;
    2114     1217627 :         struct btrfs_path *path;
    2115     1217627 :         u64 cow_start = (u64)-1;
    2116     1217627 :         u64 cur_offset = start;
    2117     1217627 :         int ret;
    2118     1217627 :         bool check_prev = true;
    2119     1217627 :         u64 ino = btrfs_ino(inode);
    2120     1217627 :         struct btrfs_block_group *bg;
    2121     1217627 :         bool nocow = false;
    2122     1217627 :         struct can_nocow_file_extent_args nocow_args = { 0 };
    2123             : 
    2124     1217627 :         path = btrfs_alloc_path();
    2125     1217658 :         if (!path) {
    2126           0 :                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
    2127             :                                              EXTENT_LOCKED | EXTENT_DELALLOC |
    2128             :                                              EXTENT_DO_ACCOUNTING |
    2129             :                                              EXTENT_DEFRAG, PAGE_UNLOCK |
    2130             :                                              PAGE_START_WRITEBACK |
    2131             :                                              PAGE_END_WRITEBACK);
    2132           0 :                 return -ENOMEM;
    2133             :         }
    2134             : 
    2135     1217658 :         nocow_args.end = end;
    2136     1217658 :         nocow_args.writeback_path = true;
    2137             : 
    2138     1298454 :         while (1) {
    2139     1298454 :                 struct btrfs_ordered_extent *ordered;
    2140     1298454 :                 struct btrfs_key found_key;
    2141     1298454 :                 struct btrfs_file_extent_item *fi;
    2142     1298454 :                 struct extent_buffer *leaf;
    2143     1298454 :                 u64 extent_end;
    2144     1298454 :                 u64 ram_bytes;
    2145     1298454 :                 u64 nocow_end;
    2146     1298454 :                 int extent_type;
    2147     1298454 :                 bool is_prealloc;
    2148             : 
    2149     1298454 :                 nocow = false;
    2150             : 
    2151     1298454 :                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
    2152             :                                                cur_offset, 0);
    2153     1298438 :                 if (ret < 0)
    2154           0 :                         goto error;
    2155             : 
    2156             :                 /*
    2157             :                  * If there is no extent for our range when doing the initial
    2158             :                  * search, then go back to the previous slot as it will be the
    2159             :                  * one containing the search offset
    2160             :                  */
    2161     1298438 :                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
    2162      632416 :                         leaf = path->nodes[0];
    2163      632416 :                         btrfs_item_key_to_cpu(leaf, &found_key,
    2164             :                                               path->slots[0] - 1);
    2165      632417 :                         if (found_key.objectid == ino &&
    2166      632417 :                             found_key.type == BTRFS_EXTENT_DATA_KEY)
    2167      622195 :                                 path->slots[0]--;
    2168             :                 }
    2169             :                 check_prev = false;
    2170             : next_slot:
    2171             :                 /* Go to next leaf if we have exhausted the current one */
    2172     1988957 :                 leaf = path->nodes[0];
    2173     1988957 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    2174       36244 :                         ret = btrfs_next_leaf(root, path);
    2175       36244 :                         if (ret < 0) {
    2176           0 :                                 if (cow_start != (u64)-1)
    2177           0 :                                         cur_offset = cow_start;
    2178           0 :                                 goto error;
    2179             :                         }
    2180       36244 :                         if (ret > 0)
    2181             :                                 break;
    2182         827 :                         leaf = path->nodes[0];
    2183             :                 }
    2184             : 
    2185     1953540 :                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    2186             : 
    2187             :                 /* Didn't find anything for our INO */
    2188     1953535 :                 if (found_key.objectid > ino)
    2189             :                         break;
    2190             :                 /*
    2191             :                  * Keep searching until we find an EXTENT_DATA item or there are
    2192             :                  * no more extents for this inode.
    2193             :                  */
    2194     1830685 :                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
    2195     1830685 :                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
    2196           0 :                         path->slots[0]++;
    2197           0 :                         goto next_slot;
    2198             :                 }
    2199             : 
    2200             :                 /* Found key is not EXTENT_DATA_KEY or starts after req range */
    2201     1830685 :                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
    2202     1830685 :                     found_key.offset > end)
    2203             :                         break;
    2204             : 
    2205             :                 /*
    2206             :                  * If the found extent starts after requested offset, then
    2207             :                  * adjust extent_end to be right before this extent begins
    2208             :                  */
    2209     1613483 :                 if (found_key.offset > cur_offset) {
    2210      200459 :                         extent_end = found_key.offset;
    2211      200459 :                         extent_type = 0;
    2212      200459 :                         goto out_check;
    2213             :                 }
    2214             : 
    2215             :                 /*
    2216             :                  * Found an extent which begins at or before our current offset
    2217             :                  * and potentially intersects our range.
    2218             :                  */
    2219     1413024 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
    2220             :                                     struct btrfs_file_extent_item);
    2221     1412975 :                 extent_type = btrfs_file_extent_type(leaf, fi);
    2222             :                 /* If this is triggered then we have a memory corruption. */
    2223     1413034 :                 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
    2224     1413034 :                 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
    2225           0 :                         ret = -EUCLEAN;
    2226           0 :                         goto error;
    2227             :                 }
    2228     1413034 :                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
    2229     1413031 :                 extent_end = btrfs_file_extent_end(path);
    2230             : 
    2231             :                 /*
    2232             :                  * If the extent we got ends before our current offset, skip to
    2233             :                  * the next extent.
    2234             :                  */
    2235     1412986 :                 if (extent_end <= cur_offset) {
    2236      278600 :                         path->slots[0]++;
    2237      278600 :                         goto next_slot;
    2238             :                 }
    2239             : 
    2240     1134386 :                 nocow_args.start = cur_offset;
    2241     1134386 :                 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
    2242     1134402 :                 if (ret < 0) {
    2243           0 :                         if (cow_start != (u64)-1)
    2244           0 :                                 cur_offset = cow_start;
    2245           0 :                         goto error;
    2246     1134402 :                 } else if (ret == 0) {
    2247      917949 :                         goto out_check;
    2248             :                 }
    2249             : 
    2250      216453 :                 ret = 0;
    2251      216453 :                 bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
    2252      216453 :                 if (bg)
    2253      216453 :                         nocow = true;
    2254           0 : out_check:
    2255             :                 /*
    2256             :                  * If nocow is false then record the beginning of the range
    2257             :                  * that needs to be COWed
    2258             :                  */
    2259     1334861 :                 if (!nocow) {
    2260     1118408 :                         if (cow_start == (u64)-1)
    2261      854421 :                                 cow_start = cur_offset;
    2262     1118408 :                         cur_offset = extent_end;
    2263     1118408 :                         if (cur_offset > end)
    2264             :                                 break;
    2265      436109 :                         if (!path->nodes[0])
    2266       24191 :                                 continue;
    2267      411918 :                         path->slots[0]++;
    2268      411918 :                         goto next_slot;
    2269             :                 }
    2270             : 
    2271             :                 /*
    2272             :                  * COW the range from cow_start to found_key.offset - 1, since
    2273             :                  * the key holds the start of the first extent that can be
    2274             :                  * NOCOW'ed, which follows a range that needs to be COW'ed.
    2275             :                  */
    2276      216453 :                 if (cow_start != (u64)-1) {
    2277       27814 :                         ret = fallback_to_cow(inode, locked_page,
    2278       27814 :                                               cow_start, found_key.offset - 1,
    2279             :                                               page_started, nr_written);
    2280       27814 :                         if (ret)
    2281           0 :                                 goto error;
    2282             :                         cow_start = (u64)-1;
    2283             :                 }
    2284             : 
    2285      216453 :                 nocow_end = cur_offset + nocow_args.num_bytes - 1;
    2286      216453 :                 is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
    2287      216453 :                 if (is_prealloc) {
    2288      216373 :                         u64 orig_start = found_key.offset - nocow_args.extent_offset;
    2289      216373 :                         struct extent_map *em;
    2290             : 
    2291      216373 :                         em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
    2292             :                                           orig_start,
    2293             :                                           nocow_args.disk_bytenr, /* block_start */
    2294             :                                           nocow_args.num_bytes, /* block_len */
    2295             :                                           nocow_args.disk_num_bytes, /* orig_block_len */
    2296             :                                           ram_bytes, BTRFS_COMPRESS_NONE,
    2297             :                                           BTRFS_ORDERED_PREALLOC);
    2298      216373 :                         if (IS_ERR(em)) {
    2299           0 :                                 ret = PTR_ERR(em);
    2300           0 :                                 goto error;
    2301             :                         }
    2302      216373 :                         free_extent_map(em);
    2303             :                 }
    2304             : 
    2305      216533 :                 ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
    2306             :                                 nocow_args.num_bytes, nocow_args.num_bytes,
    2307             :                                 nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
    2308             :                                 is_prealloc
    2309             :                                 ? (1 << BTRFS_ORDERED_PREALLOC)
    2310             :                                 : (1 << BTRFS_ORDERED_NOCOW),
    2311             :                                 BTRFS_COMPRESS_NONE);
    2312      216453 :                 if (IS_ERR(ordered)) {
    2313           0 :                         if (is_prealloc) {
    2314           0 :                                 btrfs_drop_extent_map_range(inode, cur_offset,
    2315             :                                                             nocow_end, false);
    2316             :                         }
    2317           0 :                         ret = PTR_ERR(ordered);
    2318           0 :                         goto error;
    2319             :                 }
    2320             : 
    2321      216453 :                 if (nocow) {
    2322      216453 :                         btrfs_dec_nocow_writers(bg);
    2323      216453 :                         nocow = false;
    2324             :                 }
    2325             : 
    2326      216453 :                 if (btrfs_is_data_reloc_root(root))
    2327             :                         /*
    2328             :                          * Error handled later, as we must prevent
    2329             :                          * extent_clear_unlock_delalloc() in error handler
    2330             :                          * from freeing metadata of created ordered extent.
    2331             :                          */
    2332       23024 :                         ret = btrfs_reloc_clone_csums(ordered);
    2333      216453 :                 btrfs_put_ordered_extent(ordered);
    2334             : 
    2335      216453 :                 extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
    2336             :                                              locked_page, EXTENT_LOCKED |
    2337             :                                              EXTENT_DELALLOC |
    2338             :                                              EXTENT_CLEAR_DATA_RESV,
    2339             :                                              PAGE_UNLOCK | PAGE_SET_ORDERED);
    2340             : 
    2341      216453 :                 cur_offset = extent_end;
    2342             : 
    2343             :                 /*
    2344             :                  * btrfs_reloc_clone_csums() error, now we're OK to call error
    2345             :                  * handler, as metadata for created ordered extent will only
    2346             :                  * be freed by btrfs_finish_ordered_io().
    2347             :                  */
    2348      216453 :                 if (ret)
    2349           0 :                         goto error;
    2350      216453 :                 if (cur_offset > end)
    2351             :                         break;
    2352             :         }
    2353     1217616 :         btrfs_release_path(path);
    2354             : 
    2355     1217666 :         if (cur_offset <= end && cow_start == (u64)-1)
    2356      231166 :                 cow_start = cur_offset;
    2357             : 
    2358     1217666 :         if (cow_start != (u64)-1) {
    2359     1057818 :                 cur_offset = end;
    2360     1057818 :                 ret = fallback_to_cow(inode, locked_page, cow_start, end,
    2361             :                                       page_started, nr_written);
    2362     1057769 :                 if (ret)
    2363           0 :                         goto error;
    2364             :         }
    2365             : 
    2366     1217617 : error:
    2367     1217617 :         if (nocow)
    2368           0 :                 btrfs_dec_nocow_writers(bg);
    2369             : 
    2370     1217617 :         if (ret && cur_offset < end)
    2371           0 :                 extent_clear_unlock_delalloc(inode, cur_offset, end,
    2372             :                                              locked_page, EXTENT_LOCKED |
    2373             :                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
    2374             :                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
    2375             :                                              PAGE_START_WRITEBACK |
    2376             :                                              PAGE_END_WRITEBACK);
    2377     1217617 :         btrfs_free_path(path);
    2378     1217617 :         return ret;
    2379             : }
    2380             : 
    2381     3472383 : static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
    2382             : {
    2383     3472383 :         if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
    2384     1217642 :                 if (inode->defrag_bytes &&
    2385           9 :                     test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
    2386             :                                    0, NULL))
    2387             :                         return false;
    2388     1217624 :                 return true;
    2389             :         }
    2390             :         return false;
    2391             : }
    2392             : 
    2393             : /*
    2394             :  * Function to process delayed allocation (create CoW) for ranges which are
    2395             :  * being touched for the first time.
    2396             :  */
    2397     3472498 : int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
    2398             :                 u64 start, u64 end, int *page_started, unsigned long *nr_written,
    2399             :                 struct writeback_control *wbc)
    2400             : {
    2401     3472498 :         int ret = 0;
    2402     3472498 :         const bool zoned = btrfs_is_zoned(inode->root->fs_info);
    2403             : 
    2404             :         /*
    2405             :          * The range must cover part of the @locked_page, or the returned
    2406             :          * @page_started can confuse the caller.
    2407             :          */
    2408     3472498 :         ASSERT(!(end <= page_offset(locked_page) ||
    2409             :                  start >= page_offset(locked_page) + PAGE_SIZE));
    2410             : 
    2411     3472498 :         if (should_nocow(inode, start, end)) {
    2412             :                 /*
    2413             :                  * Normally on a zoned device we're only doing COW writes, but
    2414             :                  * in case of relocation on a zoned filesystem we have taken
    2415             :                  * precaution, that we're only writing sequentially. It's safe
    2416             :                  * to use run_delalloc_nocow() here, like for regular
    2417             :                  * preallocated inodes.
    2418             :                  */
    2419     1217612 :                 ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
    2420     1217612 :                 ret = run_delalloc_nocow(inode, locked_page, start, end,
    2421             :                                          page_started, nr_written);
    2422     1217635 :                 goto out;
    2423             :         }
    2424             : 
    2425     4509418 :         if (btrfs_inode_can_compress(inode) &&
    2426     2255328 :             inode_need_compress(inode, start, end) &&
    2427         647 :             run_delalloc_compressed(inode, wbc, locked_page, start,
    2428             :                                     end, page_started, nr_written))
    2429         647 :                 goto out;
    2430             : 
    2431     2254090 :         if (zoned)
    2432           0 :                 ret = run_delalloc_zoned(inode, locked_page, start, end,
    2433             :                                          page_started, nr_written, wbc);
    2434             :         else
    2435     2254090 :                 ret = cow_file_range(inode, locked_page, start, end,
    2436             :                                      page_started, nr_written, 1, NULL);
    2437             : 
    2438     3472044 : out:
    2439     3472691 :         ASSERT(ret <= 0);
    2440     3472691 :         if (ret)
    2441           0 :                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
    2442           0 :                                               end - start + 1);
    2443     3472691 :         return ret;
    2444             : }
    2445             : 
    2446    49153370 : void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
    2447             :                                  struct extent_state *orig, u64 split)
    2448             : {
    2449    49153370 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2450    49153370 :         u64 size;
    2451             : 
    2452             :         /* not delalloc, ignore it */
    2453    49153370 :         if (!(orig->state & EXTENT_DELALLOC))
    2454             :                 return;
    2455             : 
    2456    22144370 :         size = orig->end - orig->start + 1;
    2457    22144370 :         if (size > fs_info->max_extent_size) {
    2458     3902172 :                 u32 num_extents;
    2459     3902172 :                 u64 new_size;
    2460             : 
    2461             :                 /*
    2462             :                  * See the explanation in btrfs_merge_delalloc_extent, the same
    2463             :                  * applies here, just in reverse.
    2464             :                  */
    2465     3902172 :                 new_size = orig->end - split + 1;
    2466     3902172 :                 num_extents = count_max_extents(fs_info, new_size);
    2467     3902172 :                 new_size = split - orig->start;
    2468     3902172 :                 num_extents += count_max_extents(fs_info, new_size);
    2469     3902172 :                 if (count_max_extents(fs_info, size) >= num_extents)
    2470             :                         return;
    2471             :         }
    2472             : 
    2473    20993791 :         spin_lock(&inode->lock);
    2474    21040901 :         btrfs_mod_outstanding_extents(inode, 1);
    2475    21003423 :         spin_unlock(&inode->lock);
    2476             : }
    2477             : 
    2478             : /*
    2479             :  * Handle merged delayed allocation extents so we can keep track of new extents
    2480             :  * that are just merged onto old extents, such as when we are doing sequential
    2481             :  * writes, so we can properly account for the metadata space we'll need.
    2482             :  */
    2483    43439035 : void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
    2484             :                                  struct extent_state *other)
    2485             : {
                               /*
                                * Adjust the outstanding-extent count of @inode for two extent
                                * states, @new and @other, that are being combined into one range.
                                * NOTE(review): the caller is outside this listing; the merge
                                * context is taken from the function name and the comment below.
                                * Nothing is done unless @other carries EXTENT_DELALLOC.  Every
                                * path that adjusts the count drops exactly one outstanding
                                * extent while holding inode->lock.
                                */
    2486    43439035 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2487    43439035 :         u64 new_size, old_size;
    2488    43439035 :         u32 num_extents;
    2489             : 
    2490             :         /* not delalloc, ignore it */
    2491    43439035 :         if (!(other->state & EXTENT_DELALLOC))
    2492             :                 return;
    2493             : 
                               /*
                                * Size of the combined range: it runs from the start of whichever
                                * state begins first to the end of the state that begins second.
                                */
    2494    41758632 :         if (new->start > other->start)
    2495    34832414 :                 new_size = new->end - other->start + 1;
    2496             :         else
    2497     6926218 :                 new_size = other->end - new->start + 1;
    2498             : 
    2499             :         /* we're not bigger than the max, unreserve the space and go */
    2500    41758632 :         if (new_size <= fs_info->max_extent_size) {
    2501    33083354 :                 spin_lock(&inode->lock);
    2502    33221223 :                 btrfs_mod_outstanding_extents(inode, -1);
    2503    33165648 :                 spin_unlock(&inode->lock);
    2504    33165648 :                 return;
    2505             :         }
    2506             : 
    2507             :         /*
    2508             :          * We have to add up either side to figure out how many extents were
    2509             :          * accounted for before we merged into one big extent.  If the number of
    2510             :          * extents we accounted for is <= the amount we need for the new range
    2511             :          * then we can return, otherwise drop.  Think of it like this
    2512             :          *
    2513             :          * [ 4k][MAX_SIZE]
    2514             :          *
    2515             :          * So we've grown the extent by a MAX_SIZE extent, this would mean we
    2516             :          * need 2 outstanding extents, on one side we have 1 and the other side
    2517             :          * we have 1 so they are == and we can return.  But in this case
    2518             :          *
    2519             :          * [MAX_SIZE+4k][MAX_SIZE+4k]
    2520             :          *
    2521             :          * Each range on their own accounts for 2 extents, but merged together
    2522             :          * they are only 3 extents worth of accounting, so we need to drop in
    2523             :          * this case.
    2524             :          */
                               /* old_size is reused: first the length of other, then of new. */
    2525     8675278 :         old_size = other->end - other->start + 1;
    2526     8675278 :         num_extents = count_max_extents(fs_info, old_size);
    2527     8675278 :         old_size = new->end - new->start + 1;
    2528     8675278 :         num_extents += count_max_extents(fs_info, old_size);
    2529     8675278 :         if (count_max_extents(fs_info, new_size) >= num_extents)
    2530             :                 return;
    2531             : 
    2532     7517626 :         spin_lock(&inode->lock);
    2533     7517626 :         btrfs_mod_outstanding_extents(inode, -1);
    2534     7517626 :         spin_unlock(&inode->lock);
    2535             : }
    2536             : 
    /*
     * Put @inode on the delalloc inode list of @root.
     *
     * Under root->delalloc_lock: if inode->delalloc_inodes is not linked
     * anywhere yet, it is appended to root->delalloc_inodes, the
     * BTRFS_INODE_IN_DELALLOC_LIST runtime flag is set and
     * root->nr_delalloc_inodes is incremented.  When that makes the count 1,
     * the root itself is appended to fs_info->delalloc_roots under
     * fs_info->delalloc_root_lock; the root must not already be linked there
     * (BUG_ON).
     *
     * fs_info is read through inode->root rather than through @root.
     * NOTE(review): presumably both name the same root; confirm at the caller.
     */
    2537     3461050 : static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
    2538             :                                       struct btrfs_inode *inode)
    2539             : {
    2540     3461050 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2541             : 
    2542     3461050 :         spin_lock(&root->delalloc_lock);
    2543     3464869 :         if (list_empty(&inode->delalloc_inodes)) {
    2544     3464871 :                 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
    2545     3464862 :                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
    2546     3464867 :                 root->nr_delalloc_inodes++;
                                       /* First delalloc inode of this root: publish the root too. */
    2547     3464867 :                 if (root->nr_delalloc_inodes == 1) {
    2548     1587244 :                         spin_lock(&fs_info->delalloc_root_lock);
    2549     1587264 :                         BUG_ON(!list_empty(&root->delalloc_root));
    2550     1587264 :                         list_add_tail(&root->delalloc_root,
    2551             :                                       &fs_info->delalloc_roots);
    2552     1587264 :                         spin_unlock(&fs_info->delalloc_root_lock);
    2553             :                 }
    2554             :         }
    2555     3464884 :         spin_unlock(&root->delalloc_lock);
    2556     3464864 : }
    2557             : 
    /*
     * Take @inode off the delalloc inode list of @root, if it is on one.
     *
     * Unlinks inode->delalloc_inodes, clears BTRFS_INODE_IN_DELALLOC_LIST and
     * decrements root->nr_delalloc_inodes.  When the count reaches zero the
     * root is unlinked from the fs_info list of delalloc roots under
     * fs_info->delalloc_root_lock.
     *
     * This function does not take root->delalloc_lock itself;
     * btrfs_del_delalloc_inode() takes it around the call.
     */
    2558     3464856 : void __btrfs_del_delalloc_inode(struct btrfs_root *root,
    2559             :                                 struct btrfs_inode *inode)
    2560             : {
    2561     3464856 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2562             : 
    2563     3464856 :         if (!list_empty(&inode->delalloc_inodes)) {
    2564     3464856 :                 list_del_init(&inode->delalloc_inodes);
    2565     3464835 :                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
    2566     3464835 :                           &inode->runtime_flags);
    2567     3464862 :                 root->nr_delalloc_inodes--;
                                       /* Last delalloc inode gone: the root leaves the fs-wide list. */
    2568     3464862 :                 if (!root->nr_delalloc_inodes) {
    2569     1587245 :                         ASSERT(list_empty(&root->delalloc_inodes));
    2570     1587245 :                         spin_lock(&fs_info->delalloc_root_lock);
    2571     1587264 :                         BUG_ON(list_empty(&root->delalloc_root));
    2572     1587264 :                         list_del_init(&root->delalloc_root);
    2573     1587264 :                         spin_unlock(&fs_info->delalloc_root_lock);
    2574             :                 }
    2575             :         }
    2576     3464881 : }
    2577             : 
    /*
     * Locked wrapper: runs __btrfs_del_delalloc_inode() for @inode on @root
     * while holding root->delalloc_lock.
     */
    2578     3464787 : static void btrfs_del_delalloc_inode(struct btrfs_root *root,
    2579             :                                      struct btrfs_inode *inode)
    2580             : {
    2581     3464787 :         spin_lock(&root->delalloc_lock);
    2582     3464869 :         __btrfs_del_delalloc_inode(root, inode);
    2583     3464881 :         spin_unlock(&root->delalloc_lock);
    2584     3464880 : }
    2585             : 
    2586             : /*
    2587             :  * Properly track delayed allocation bytes in the inode and maintain the
    2588             :  * list of inodes that have pending delalloc work to be done.
                     :  *
                     :  * @inode: inode whose counters are updated
                     :  * @state: extent state covering [state->start, state->end]; its current
                     :  *         flags are compared against @bits
                     :  * @bits:  flags being added to @state
                     :  *
                     :  * Accounting only happens for a flag that is in @bits and not yet in
                     :  * state->state.  For EXTENT_DELALLOC this covers outstanding extents, the
                     :  * fs-wide and per-inode delalloc byte counters, defrag bytes and the
                     :  * delalloc inode list.  For EXTENT_DELALLOC_NEW it covers
                     :  * inode->new_delalloc_bytes.
    2589             :  */
    2590   154003589 : void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
    2591             :                                u32 bits)
    2592             : {
    2593   154003589 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2594             : 
                               /* EXTENT_DEFRAG without EXTENT_DELALLOC is flagged with a warning. */
    2595   154003589 :         if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
    2596           0 :                 WARN_ON(1);
    2597             :         /*
    2598             :          * set_bit and clear bit hooks normally require _irqsave/restore
    2599             :          * but in this case, we are only testing for the DELALLOC
    2600             :          * bit, which is only set or cleared with irqs on
    2601             :          */
    2602   154003589 :         if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
    2603    39567548 :                 struct btrfs_root *root = inode->root;
    2604    39567548 :                 u64 len = state->end + 1 - state->start;
    2605    39567548 :                 u32 num_extents = count_max_extents(fs_info, len);
                                       /* Free space inodes are never put on the delalloc list. */
    2606    39567548 :                 bool do_list = !btrfs_is_free_space_inode(inode);
    2607             : 
    2608    39567548 :                 spin_lock(&inode->lock);
    2609    39572905 :                 btrfs_mod_outstanding_extents(inode, num_extents);
    2610    39469794 :                 spin_unlock(&inode->lock);
    2611             : 
                                       /*
                                        * This returns from the whole function, so the
                                        * EXTENT_DELALLOC_NEW accounting further down is skipped too.
                                        */
    2612             :                 /* For sanity tests */
    2613    39587701 :                 if (btrfs_is_testing(fs_info))
    2614             :                         return;
    2615             : 
    2616    39587701 :                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
    2617             :                                          fs_info->delalloc_batch);
    2618    39580259 :                 spin_lock(&inode->lock);
    2619    39598061 :                 inode->delalloc_bytes += len;
    2620    39598061 :                 if (bits & EXTENT_DEFRAG)
    2621          79 :                         inode->defrag_bytes += len;
                                       /*
                                        * inode->lock is still held here; btrfs_add_delalloc_inodes()
                                        * takes root->delalloc_lock inside it.
                                        */
    2622    39598061 :                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
    2623             :                                          &inode->runtime_flags))
    2624     3461837 :                         btrfs_add_delalloc_inodes(root, inode);
    2625    39601085 :                 spin_unlock(&inode->lock);
    2626             :         }
    2627             : 
    2628   154030366 :         if (!(state->state & EXTENT_DELALLOC_NEW) &&
    2629   104638082 :             (bits & EXTENT_DELALLOC_NEW)) {
    2630    18707814 :                 spin_lock(&inode->lock);
    2631    18731247 :                 inode->new_delalloc_bytes += state->end + 1 - state->start;
    2632    18731247 :                 spin_unlock(&inode->lock);
    2633             :         }
    2634             : }
    2635             : 
    2636             : /*
    2637             :  * Once a range is no longer delalloc this function ensures that proper
    2638             :  * accounting happens.
                     :  *
                     :  * @inode: inode whose counters are updated
                     :  * @state: extent state covering [state->start, state->end]; its current
                     :  *         flags are compared against @bits
                     :  * @bits:  flags being cleared, plus the control flags tested below
                     :  *         (EXTENT_CLEAR_META_RESV, EXTENT_CLEAR_DATA_RESV,
                     :  *         EXTENT_ADD_INODE_BYTES)
                     :  *
                     :  * Accounting only happens for a flag that is both set in state->state and
                     :  * present in @bits.
    2639             :  */
    2640   136920569 : void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
    2641             :                                  struct extent_state *state, u32 bits)
    2642             : {
    2643   136920569 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2644   136920569 :         u64 len = state->end + 1 - state->start;
    2645   136920569 :         u32 num_extents = count_max_extents(fs_info, len);
    2646             : 
    2647   136920569 :         if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
    2648          84 :                 spin_lock(&inode->lock);
    2649          84 :                 inode->defrag_bytes -= len;
    2650          84 :                 spin_unlock(&inode->lock);
    2651             :         }
    2652             : 
    2653             :         /*
    2654             :          * set_bit and clear bit hooks normally require _irqsave/restore
    2655             :          * but in this case, we are only testing for the DELALLOC
    2656             :          * bit, which is only set or cleared with irqs on
    2657             :          */
    2658   136920569 :         if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
    2659    19830395 :                 struct btrfs_root *root = inode->root;
    2660    19830395 :                 bool do_list = !btrfs_is_free_space_inode(inode);
    2661             : 
                                       /*
                                        * num_extents is a u32 that is negated before being passed.
                                        * NOTE(review): this relies on the callee taking a signed
                                        * count; its prototype is not visible here, confirm.
                                        */
    2662    19830395 :                 spin_lock(&inode->lock);
    2663    19893809 :                 btrfs_mod_outstanding_extents(inode, -num_extents);
    2664    19874588 :                 spin_unlock(&inode->lock);
    2665             : 
    2666             :                 /*
    2667             :                  * We don't reserve metadata space for space cache inodes so we
    2668             :                  * don't need to call delalloc_release_metadata if there is an
    2669             :                  * error.
    2670             :                  */
    2671    19871133 :                 if (bits & EXTENT_CLEAR_META_RESV &&
    2672    16957927 :                     root != fs_info->tree_root)
    2673    16957927 :                         btrfs_delalloc_release_metadata(inode, len, false);
    2674             : 
                                       /*
                                        * As in btrfs_set_delalloc_extent(), this leaves the function,
                                        * so the EXTENT_DELALLOC_NEW block at the end is skipped.
                                        */
    2675             :                 /* For sanity tests. */
    2676    19921358 :                 if (btrfs_is_testing(fs_info))
    2677             :                         return;
    2678             : 
                                       /*
                                        * The data reservation is returned only when the caller asked
                                        * for it (EXTENT_CLEAR_DATA_RESV), the range is not marked
                                        * EXTENT_NORESERVE, and the inode is neither a free space inode
                                        * nor in the data relocation root.
                                        */
    2679    19921358 :                 if (!btrfs_is_data_reloc_root(root) &&
    2680    19879560 :                     do_list && !(state->state & EXTENT_NORESERVE) &&
    2681    19874254 :                     (bits & EXTENT_CLEAR_DATA_RESV))
    2682    17196820 :                         btrfs_free_reserved_data_space_noquota(fs_info, len);
    2683             : 
    2684    19916087 :                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
    2685             :                                          fs_info->delalloc_batch);
    2686    19919222 :                 spin_lock(&inode->lock);
    2687    19921744 :                 inode->delalloc_bytes -= len;
                                       /* No delalloc bytes left: the inode leaves the delalloc list. */
    2688    23386564 :                 if (do_list && inode->delalloc_bytes == 0 &&
    2689           0 :                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
    2690             :                                         &inode->runtime_flags))
    2691     3464802 :                         btrfs_del_delalloc_inode(root, inode);
    2692    19921822 :                 spin_unlock(&inode->lock);
    2693             :         }
    2694             : 
    2695   136990428 :         if ((state->state & EXTENT_DELALLOC_NEW) &&
    2696    37557919 :             (bits & EXTENT_DELALLOC_NEW)) {
    2697     3248442 :                 spin_lock(&inode->lock);
    2698     3248640 :                 ASSERT(inode->new_delalloc_bytes >= len);
    2699     3248640 :                 inode->new_delalloc_bytes -= len;
                                       /* Optionally credit the cleared length to the VFS inode byte count. */
    2700     3248640 :                 if (bits & EXTENT_ADD_INODE_BYTES)
    2701     1634647 :                         inode_add_bytes(&inode->vfs_inode, len);
    2702     3248656 :                 spin_unlock(&inode->lock);
    2703             :         }
    2704             : }
    2705             : 
    /*
     * Attach to @bbio the ordered extent that covers exactly the bio, splitting
     * @ordered when the bio is shorter than it.
     *
     * The bio must begin at ordered->disk_bytenr.  If the bio length equals
     * ordered->disk_num_bytes, @ordered itself is attached with an extra
     * reference.  Otherwise the extent map is split first (skipped for
     * BTRFS_ORDERED_NOCOW) and the result of btrfs_split_ordered_extent() is
     * attached.
     *
     * Returns 0 on success, -EINVAL if the bio does not begin at the start of
     * @ordered, or the error from split_extent_map() or
     * btrfs_split_ordered_extent().
     */
    2706      859481 : static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
    2707             :                                         struct btrfs_ordered_extent *ordered)
    2708             : {
                               /* Byte address and byte length of the bio, from its iterator. */
    2709      859481 :         u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
    2710      859481 :         u64 len = bbio->bio.bi_iter.bi_size;
    2711      859481 :         struct btrfs_ordered_extent *new;
    2712      859481 :         int ret;
    2713             : 
    2714             :         /* Must always be called for the beginning of an ordered extent. */
    2715      859481 :         if (WARN_ON_ONCE(start != ordered->disk_bytenr))
    2716             :                 return -EINVAL;
    2717             : 
    2718             :         /* No need to split if the ordered extent covers the entire bio. */
    2719      859481 :         if (ordered->disk_num_bytes == len) {
    2720      856531 :                 refcount_inc(&ordered->refs);
    2721      856800 :                 bbio->ordered = ordered;
    2722      856800 :                 return 0;
    2723             :         }
    2724             : 
    2725             :         /*
    2726             :          * Don't split the extent_map for NOCOW extents, as we're writing into
    2727             :          * a pre-existing one.
    2728             :          */
    2729        2950 :         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
    2730        2947 :                 ret = split_extent_map(bbio->inode, bbio->file_offset,
    2731             :                                        ordered->num_bytes, len,
    2732             :                                        ordered->disk_bytenr);
    2733        2947 :                 if (ret)
    2734             :                         return ret;
    2735             :         }
    2736             : 
    2737        2950 :         new = btrfs_split_ordered_extent(ordered, len);
    2738        2950 :         if (IS_ERR(new))
    2739           0 :                 return PTR_ERR(new);
    2740        2950 :         bbio->ordered = new;
    2741        2950 :         return 0;
    2742             : }
    2743             : 
    2744             : /*
    2745             :  * given a list of ordered sums record them in the inode.  This happens
    2746             :  * at IO completion time based on sums calculated at bio submission time.
                     :  *
                     :  * @trans: transaction handle; trans->adding_csums is set to true around
                     :  *         each btrfs_csum_file_blocks() call and back to false afterwards
                     :  * @list:  list of struct btrfs_ordered_sum, linked through their list member
                     :  *
                     :  * Returns 0 on success or the first error from btrfs_csum_file_blocks();
                     :  * entries after a failing one are not processed.
    2747             :  */
    2748     3503754 : static int add_pending_csums(struct btrfs_trans_handle *trans,
    2749             :                              struct list_head *list)
    2750             : {
    2751     3503754 :         struct btrfs_ordered_sum *sum;
    2752     3503754 :         struct btrfs_root *csum_root = NULL;
    2753     3503754 :         int ret;
    2754             : 
    2755     7059874 :         list_for_each_entry(sum, list, list) {
    2756     3555948 :                 trans->adding_csums = true;
                                       /*
                                        * The csum root is looked up once, from the logical address of
                                        * the first entry, and reused for every later entry.
                                        */
    2757     3555948 :                 if (!csum_root)
    2758     3502245 :                         csum_root = btrfs_csum_root(trans->fs_info,
    2759             :                                                     sum->logical);
    2760     3555785 :                 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
    2761     3556120 :                 trans->adding_csums = false;
    2762     3556120 :                 if (ret)
    2763           0 :                         return ret;
    2764             :         }
    2765             :         return 0;
    2766             : }
    2767             : 
    /*
     * Mark the holes inside [start, start + len - 1] as EXTENT_DELALLOC_NEW.
     *
     * Walks the extent maps that cover the range.  For each map whose
     * block_start is EXTENT_MAP_HOLE, the part of it that lies inside the
     * remaining search range gets EXTENT_DELALLOC_NEW set in inode->io_tree.
     * Maps that are not holes are stepped over.
     *
     * Returns 0 on success, the error from btrfs_get_extent(), or the error
     * from set_extent_bit().
     */
    2768    26491180 : static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
    2769             :                                          const u64 start,
    2770             :                                          const u64 len,
    2771             :                                          struct extent_state **cached_state)
    2772             : {
    2773    26491180 :         u64 search_start = start;
    2774    26491180 :         const u64 end = start + len - 1;
    2775             : 
    2776    53461053 :         while (search_start < end) {
    2777    26949498 :                 const u64 search_len = end - search_start + 1;
    2778    26949498 :                 struct extent_map *em;
    2779    26949498 :                 u64 em_len;
    2780    26949498 :                 int ret = 0;
    2781             : 
    2782    26949498 :                 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
    2783    26938669 :                 if (IS_ERR(em))
    2784           0 :                         return PTR_ERR(em);
    2785             : 
    2786    26938669 :                 if (em->block_start != EXTENT_MAP_HOLE)
    2787     8375492 :                         goto next;
    2788             : 
                                       /*
                                        * Clamp the hole to the search window: drop the part before
                                        * search_start and cap the length at search_len.
                                        */
    2789    18563177 :                 em_len = em->len;
    2790    18563177 :                 if (em->start < search_start)
    2791    10625990 :                         em_len -= search_start - em->start;
    2792    18563177 :                 if (em_len > search_len)
    2793             :                         em_len = search_len;
    2794             : 
    2795    18563177 :                 ret = set_extent_bit(&inode->io_tree, search_start,
    2796    18563177 :                                      search_start + em_len - 1,
    2797             :                                      EXTENT_DELALLOC_NEW, cached_state);
                       /*
                        * The map reference is dropped before ret is checked, so the error
                        * return below does not leak it.
                        */
    2798    26969641 : next:
    2799    26969641 :                 search_start = extent_map_end(em);
    2800    26969641 :                 free_extent_map(em);
    2801    26969873 :                 if (ret)
    2802           0 :                         return ret;
    2803             :         }
    2804             :         return 0;
    2805             : }
    2806             : 
    /*
     * Set EXTENT_DELALLOC, plus @extra_bits, on [start, end] in inode->io_tree.
     *
     * EXTENT_DELALLOC_NEW is handled first.  If @start is at or beyond i_size
     * and the inode does not have BTRFS_INODE_PREALLOC, the flag is added to
     * @extra_bits for the whole range.  Otherwise
     * btrfs_find_new_delalloc_bytes() sets it on the holes inside the range.
     *
     * @end is used as the last byte of the range; the length passed on is
     * end + 1 - start.  A page-aligned @end triggers a warning.
     *
     * Returns the result of set_extent_bit(), or the error from
     * btrfs_find_new_delalloc_bytes().
     */
    2807    39275810 : int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
    2808             :                               unsigned int extra_bits,
    2809             :                               struct extent_state **cached_state)
    2810             : {
    2811    39275810 :         WARN_ON(PAGE_ALIGNED(end));
    2812             : 
    2813    39275810 :         if (start >= i_size_read(&inode->vfs_inode) &&
    2814    12865037 :             !(inode->flags & BTRFS_INODE_PREALLOC)) {
    2815             :                 /*
    2816             :                  * There can't be any extents following eof in this case so just
    2817             :                  * set the delalloc new bit for the range directly.
    2818             :                  */
    2819    12762601 :                 extra_bits |= EXTENT_DELALLOC_NEW;
    2820             :         } else {
    2821    26513209 :                 int ret;
    2822             : 
    2823    26513209 :                 ret = btrfs_find_new_delalloc_bytes(inode, start,
    2824    26513209 :                                                     end + 1 - start,
    2825             :                                                     cached_state);
    2826    26510500 :                 if (ret)
    2827             :                         return ret;
    2828             :         }
    2829             : 
    2830    39273101 :         return set_extent_bit(&inode->io_tree, start, end,
    2831             :                               EXTENT_DELALLOC | extra_bits, cached_state);
    2832             : }
    2833             : 
    2834             : /*
                     :  * Per-page context handed to btrfs_writepage_fixup_worker().  See the
                     :  * comment above btrfs_writepage_cow_fixup() for details on why this is
                     :  * required.
                     :  * NOTE(review): this comment used to point at btrfs_writepage_start_hook,
                     :  * a name that does not occur in the visible part of this file; confirm.
                     :  */
    2835             : struct btrfs_writepage_fixup {
                               /* Page to fix up; a reference is taken with get_page() when queued. */
    2836             :         struct page *page;
                               /* Owning inode; a reference is taken with ihold() when queued. */
    2837             :         struct btrfs_inode *inode;
                               /* Work item queued on fs_info->fixup_workers. */
    2838             :         struct btrfs_work work;
    2839             : };
    2840             : 
    /*
     * Work function for the fixups queued by btrfs_writepage_cow_fixup().
     *
     * Reserves delalloc space for one page, locks the page, waits out any
     * ordered extent that overlaps it, and then marks the page range delalloc
     * through btrfs_set_extent_delalloc().  On failure the error is recorded on
     * the mapping and the page is cleaned.
     *
     * On every path the worker clears the page Checked state, unlocks and puts
     * the page, frees the fixup structure and the changeset, and queues a
     * delayed iput for the inode.
     */
    2841           0 : static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
    2842             : {
    2843           0 :         struct btrfs_writepage_fixup *fixup;
    2844           0 :         struct btrfs_ordered_extent *ordered;
    2845           0 :         struct extent_state *cached_state = NULL;
    2846           0 :         struct extent_changeset *data_reserved = NULL;
    2847           0 :         struct page *page;
    2848           0 :         struct btrfs_inode *inode;
    2849           0 :         u64 page_start;
    2850           0 :         u64 page_end;
    2851           0 :         int ret = 0;
    2852           0 :         bool free_delalloc_space = true;
    2853             : 
    2854           0 :         fixup = container_of(work, struct btrfs_writepage_fixup, work);
    2855           0 :         page = fixup->page;
    2856           0 :         inode = fixup->inode;
    2857           0 :         page_start = page_offset(page);
    2858           0 :         page_end = page_offset(page) + PAGE_SIZE - 1;
    2859             : 
    2860             :         /*
    2861             :          * This is similar to page_mkwrite, we need to reserve the space before
    2862             :          * we take the page lock.
    2863             :          */
                               /*
                                * ret is deliberately not checked here; it is examined only after
                                * the page has been locked and its state inspected.
                                */
    2864           0 :         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
    2865             :                                            PAGE_SIZE);
    2866           0 : again:
    2867           0 :         lock_page(page);
    2868             : 
    2869             :         /*
    2870             :          * Before we queued this fixup, we took a reference on the page.
    2871             :          * page->mapping may go NULL, but it shouldn't be moved to a different
    2872             :          * address space.
    2873             :          */
    2874           0 :         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
    2875             :                 /*
    2876             :                  * Unfortunately this is a little tricky, either
    2877             :                  *
    2878             :                  * 1) We got here and our page had already been dealt with and
    2879             :                  *    we reserved our space, thus ret == 0, so we need to just
    2880             :                  *    drop our space reservation and bail.  This can happen the
    2881             :                  *    first time we come into the fixup worker, or could happen
    2882             :                  *    while waiting for the ordered extent.
    2883             :                  * 2) Our page was already dealt with, but we happened to get an
    2884             :                  *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
    2885             :                  *    this case we obviously don't have anything to release, but
    2886             :                  *    because the page was already dealt with we don't want to
    2887             :                  *    mark the page with an error, so make sure we're resetting
    2888             :                  *    ret to 0.  This is why we have this check _before_ the ret
    2889             :                  *    check, because we do not want to have a surprise ENOSPC
    2890             :                  *    when the page was already properly dealt with.
    2891             :                  */
    2892           0 :                 if (!ret) {
    2893           0 :                         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
    2894           0 :                         btrfs_delalloc_release_space(inode, data_reserved,
    2895             :                                                      page_start, PAGE_SIZE,
    2896             :                                                      true);
    2897             :                 }
    2898           0 :                 ret = 0;
    2899           0 :                 goto out_page;
    2900             :         }
    2901             : 
    2902             :         /*
    2903             :          * We can't mess with the page state unless it is locked, so now that
    2904             :          * it is locked bail if we failed to make our space reservation.
    2905             :          */
    2906           0 :         if (ret)
    2907           0 :                 goto out_page;
    2908             : 
    2909           0 :         lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
    2910             : 
    2911             :         /* already ordered? We're done */
    2912           0 :         if (PageOrdered(page))
    2913           0 :                 goto out_reserved;
    2914             : 
    2915           0 :         ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
    2916           0 :         if (ordered) {
                                       /*
                                        * Drop the extent lock and the page lock before starting
                                        * the ordered extent, then retry from the page lock.
                                        */
    2917           0 :                 unlock_extent(&inode->io_tree, page_start, page_end,
    2918             :                               &cached_state);
    2919           0 :                 unlock_page(page);
    2920           0 :                 btrfs_start_ordered_extent(ordered);
    2921           0 :                 btrfs_put_ordered_extent(ordered);
    2922           0 :                 goto again;
    2923             :         }
    2924             : 
    2925           0 :         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
    2926             :                                         &cached_state);
    2927           0 :         if (ret)
    2928           0 :                 goto out_reserved;
    2929             : 
    2930             :         /*
    2931             :          * Everything went as planned, we're now the owner of a dirty page with
    2932             :          * delayed allocation bits set and space reserved for our COW
    2933             :          * destination.
    2934             :          *
    2935             :          * The page was dirty when we started, nothing should have cleaned it.
    2936             :          */
    2937           0 :         BUG_ON(!PageDirty(page));
                               /* Success: the reserved space stays attached to the delalloc range. */
    2938             :         free_delalloc_space = false;
    2939           0 : out_reserved:
    2940           0 :         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
    2941           0 :         if (free_delalloc_space)
    2942           0 :                 btrfs_delalloc_release_space(inode, data_reserved, page_start,
    2943             :                                              PAGE_SIZE, true);
    2944           0 :         unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
    2945           0 : out_page:
    2946           0 :         if (ret) {
    2947             :                 /*
    2948             :                  * We hit ENOSPC or other errors.  Update the mapping and page
    2949             :                  * to reflect the errors and clean the page.
    2950             :                  */
                                       /*
                                        * As far as this function shows, ret is non-zero here only
                                        * on paths where page->mapping was found non-NULL under the
                                        * page lock; the branch that tolerates a NULL mapping resets
                                        * ret to 0 before jumping to out_page.
                                        */
    2951           0 :                 mapping_set_error(page->mapping, ret);
    2952           0 :                 end_extent_writepage(page, ret, page_start, page_end);
    2953           0 :                 clear_page_dirty_for_io(page);
    2954             :         }
    2955           0 :         btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
    2956           0 :         unlock_page(page);
                               /* Pairs with the get_page() done when the fixup was queued. */
    2957           0 :         put_page(page);
    2958           0 :         kfree(fixup);
    2959           0 :         extent_changeset_free(data_reserved);
    2960             :         /*
    2961             :          * As a precaution, do a delayed iput in case it would be the last iput
    2962             :          * that could need flushing space. Recursing back to fixup worker would
    2963             :          * deadlock.
    2964             :          */
    2965           0 :         btrfs_add_delayed_iput(inode);
    2966           0 : }
    2967             : 
    2968             : /*
    2969             :  * There are a few paths in the higher layers of the kernel that directly
    2970             :  * set the page dirty bit without asking the filesystem if it is a
    2971             :  * good idea.  This causes problems because we want to make sure COW
    2972             :  * properly happens and the data=ordered rules are followed.
    2973             :  *
    2974             :  * In our case any range that doesn't have the ORDERED bit set
    2975             :  * hasn't been properly setup for IO.  We kick off an async process
    2976             :  * to fix it up.  The async helper will wait for ordered extents, set
    2977             :  * the delalloc bit and make it safe to write the page.
                     :  *
                     :  * Returns 0 when the page is already PageOrdered and needs no fixup.
                     :  * Returns -EAGAIN in every other case: a fixup is already pending
                     :  * (PageChecked), the fixup allocation failed, or a fixup was just queued on
                     :  * fs_info->fixup_workers.
    2978             :  */
    2979    50231367 : int btrfs_writepage_cow_fixup(struct page *page)
    2980             : {
    2981    50231367 :         struct inode *inode = page->mapping->host;
    2982    50231367 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    2983    50231367 :         struct btrfs_writepage_fixup *fixup;
    2984             : 
    2985             :         /* This page has ordered extent covering it already */
    2986    50231367 :         if (PageOrdered(page))
    2987             :                 return 0;
    2988             : 
    2989             :         /*
    2990             :          * PageChecked is set below when we create a fixup worker for this page,
    2991             :          * don't try to create another one if we're already PageChecked()
    2992             :          *
    2993             :          * The extent_io writepage code will redirty the page if we send back
    2994             :          * EAGAIN.
    2995             :          */
    2996           0 :         if (PageChecked(page))
    2997             :                 return -EAGAIN;
    2998             : 
                               /* Allocation failure is reported as -EAGAIN, not -ENOMEM. */
    2999           0 :         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
    3000           0 :         if (!fixup)
    3001             :                 return -EAGAIN;
    3002             : 
    3003             :         /*
    3004             :          * We are already holding a reference to this inode from
    3005             :          * write_cache_pages.  We need to hold it because the space reservation
    3006             :          * takes place outside of the page lock, and we can't trust
    3007             :          * page->mapping outside of the page lock.
    3008             :          */
                               /*
                                * The inode and page references taken here are released by
                                * btrfs_writepage_fixup_worker(), which also frees the fixup.
                                */
    3009           0 :         ihold(inode);
    3010           0 :         btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
    3011           0 :         get_page(page);
    3012           0 :         btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
    3013           0 :         fixup->page = page;
    3014           0 :         fixup->inode = BTRFS_I(inode);
    3015           0 :         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
    3016             : 
    3017           0 :         return -EAGAIN;
    3018             : }
    3019             : 
    3020     3212167 : static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
    3021             :                                        struct btrfs_inode *inode, u64 file_pos,
    3022             :                                        struct btrfs_file_extent_item *stack_fi,
    3023             :                                        const bool update_inode_bytes,
    3024             :                                        u64 qgroup_reserved)
    3025             : {
    3026     3212167 :         struct btrfs_root *root = inode->root;
    3027     3212167 :         const u64 sectorsize = root->fs_info->sectorsize;
    3028     3212167 :         struct btrfs_path *path;
    3029     3212167 :         struct extent_buffer *leaf;
    3030     3212167 :         struct btrfs_key ins;
    3031     3212167 :         u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
    3032     3212167 :         u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
    3033     3212167 :         u64 offset = btrfs_stack_file_extent_offset(stack_fi);
    3034     3212167 :         u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
    3035     3212167 :         u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
    3036     3212167 :         struct btrfs_drop_extents_args drop_args = { 0 };
    3037     3212167 :         int ret;
    3038             : 
    3039     3212167 :         path = btrfs_alloc_path();
    3040     3212420 :         if (!path)
    3041             :                 return -ENOMEM;
    3042             : 
    3043             :         /*
    3044             :          * We may be replacing one extent in the tree with another.
    3045             :          * The new extent is pinned in the extent map, and we don't want
    3046             :          * to drop it from the cache until it is completely in the btree.
    3047             :          *
    3048             :          * So, tell btrfs_drop_extents to leave this extent in the cache.
    3049             :          * The caller is expected to unpin it and allow it to be merged
    3050             :          * with the others.
    3051             :          */
    3052     3212420 :         drop_args.path = path;
    3053     3212420 :         drop_args.start = file_pos;
    3054     3212420 :         drop_args.end = file_pos + num_bytes;
    3055     3212420 :         drop_args.replace_extent = true;
    3056     3212420 :         drop_args.extent_item_size = sizeof(*stack_fi);
    3057     3212420 :         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
    3058     3212556 :         if (ret)
    3059           0 :                 goto out;
    3060             : 
    3061     3212556 :         if (!drop_args.extent_inserted) {
    3062      474751 :                 ins.objectid = btrfs_ino(inode);
    3063      474751 :                 ins.offset = file_pos;
    3064      474751 :                 ins.type = BTRFS_EXTENT_DATA_KEY;
    3065             : 
    3066      474751 :                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
    3067             :                                               sizeof(*stack_fi));
    3068      474760 :                 if (ret)
    3069           0 :                         goto out;
    3070             :         }
    3071     3212565 :         leaf = path->nodes[0];
    3072     3212565 :         btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
    3073     6425092 :         write_extent_buffer(leaf, stack_fi,
    3074     3212565 :                         btrfs_item_ptr_offset(leaf, path->slots[0]),
    3075             :                         sizeof(struct btrfs_file_extent_item));
    3076             : 
    3077     3212562 :         btrfs_mark_buffer_dirty(leaf);
    3078     3212627 :         btrfs_release_path(path);
    3079             : 
    3080             :         /*
    3081             :          * If we dropped an inline extent here, we know the range where it
    3082             :          * was located was not marked with the EXTENT_DELALLOC_NEW bit, so we
    3083             :          * update the number of bytes only for the range containing the inline
    3084             :          * extent. The remainder of the range will be processed when clearing
    3085             :          * the EXTENT_DELALLOC_NEW bit through the ordered extent completion.
    3086             :          */
    3087     3212656 :         if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
    3088         181 :                 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
    3089             : 
    3090         181 :                 inline_size = drop_args.bytes_found - inline_size;
    3091         181 :                 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
    3092         181 :                 drop_args.bytes_found -= inline_size;
    3093         181 :                 num_bytes -= sectorsize;
    3094             :         }
    3095             : 
    3096     3212656 :         if (update_inode_bytes)
    3097      785048 :                 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
    3098             : 
    3099     3212642 :         ins.objectid = disk_bytenr;
    3100     3212642 :         ins.offset = disk_num_bytes;
    3101     3212642 :         ins.type = BTRFS_EXTENT_ITEM_KEY;
    3102             : 
    3103     3212642 :         ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
    3104     3212568 :         if (ret)
    3105           0 :                 goto out;
    3106             : 
    3107     3212568 :         ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
    3108             :                                                file_pos - offset,
    3109             :                                                qgroup_reserved, &ins);
    3110     3212652 : out:
    3111     3212652 :         btrfs_free_path(path);
    3112             : 
    3113     3212652 :         return ret;
    3114             : }
    3115             : 
    3116     3212633 : static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
    3117             :                                          u64 start, u64 len)
    3118             : {
    3119     3212633 :         struct btrfs_block_group *cache;
    3120             : 
    3121     3212633 :         cache = btrfs_lookup_block_group(fs_info, start);
    3122     3212583 :         ASSERT(cache);
    3123             : 
    3124     3212583 :         spin_lock(&cache->lock);
    3125     3212659 :         cache->delalloc_bytes -= len;
    3126     3212659 :         spin_unlock(&cache->lock);
    3127             : 
    3128     3212652 :         btrfs_put_block_group(cache);
    3129     3212636 : }
    3130             : 
    3131     3212255 : static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
    3132             :                                              struct btrfs_ordered_extent *oe)
    3133             : {
    3134     3212255 :         struct btrfs_file_extent_item stack_fi;
    3135     3212255 :         bool update_inode_bytes;
    3136     3212255 :         u64 num_bytes = oe->num_bytes;
    3137     3212255 :         u64 ram_bytes = oe->ram_bytes;
    3138             : 
    3139     3212255 :         memset(&stack_fi, 0, sizeof(stack_fi));
    3140     3212255 :         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
    3141     3212255 :         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
    3142     3212255 :         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
    3143             :                                                    oe->disk_num_bytes);
    3144     3212255 :         btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
    3145     6424510 :         if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
    3146          34 :                 num_bytes = oe->truncated_len;
    3147          34 :                 ram_bytes = num_bytes;
    3148             :         }
    3149     3212255 :         btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
    3150     3212255 :         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
    3151     3212255 :         btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
    3152             :         /* Encryption and other encoding is reserved and all 0 */
    3153             : 
    3154             :         /*
    3155             :          * For delalloc, when completing an ordered extent we update the inode's
    3156             :          * bytes when clearing the range in the inode's io tree, so pass false
    3157             :          * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
    3158             :          * except if the ordered extent was truncated.
    3159             :          */
    3160     5639441 :         update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
    3161     8066627 :                              test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
    3162           0 :                              test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
    3163             : 
    3164     6424908 :         return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
    3165             :                                            oe->file_offset, &stack_fi,
    3166     3212255 :                                            update_inode_bytes, oe->qgroup_rsv);
    3167             : }
    3168             : 
    3169             : /*
    3170             :  * As ordered data IO finishes, this gets called so we can finish
    3171             :  * an ordered extent if the range of bytes in the file it covers is
    3172             :  * fully written.
    3173             :  */
    3174     3504138 : int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
    3175             : {
    3176     3504138 :         struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
    3177     3504138 :         struct btrfs_root *root = inode->root;
    3178     3504138 :         struct btrfs_fs_info *fs_info = root->fs_info;
    3179     3504138 :         struct btrfs_trans_handle *trans = NULL;
    3180     3504138 :         struct extent_io_tree *io_tree = &inode->io_tree;
    3181     3504138 :         struct extent_state *cached_state = NULL;
    3182     3504138 :         u64 start, end;
    3183     3504138 :         int compress_type = 0;
    3184     3504138 :         int ret = 0;
    3185     3504138 :         u64 logical_len = ordered_extent->num_bytes;
    3186     3504138 :         bool freespace_inode;
    3187     3504138 :         bool truncated = false;
    3188     3504138 :         bool clear_reserved_extent = true;
    3189     3504138 :         unsigned int clear_bits = EXTENT_DEFRAG;
    3190             : 
    3191     3504138 :         start = ordered_extent->file_offset;
    3192     3504138 :         end = start + ordered_extent->num_bytes - 1;
    3193             : 
    3194     3504138 :         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
    3195     3212699 :             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
    3196     2427711 :             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
    3197           0 :             !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
    3198     2427707 :                 clear_bits |= EXTENT_DELALLOC_NEW;
    3199             : 
    3200     3504138 :         freespace_inode = btrfs_is_free_space_inode(inode);
    3201     3504138 :         if (!freespace_inode)
    3202     3504138 :                 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
    3203             : 
    3204     7008276 :         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
    3205          44 :                 ret = -EIO;
    3206          44 :                 goto out;
    3207             :         }
    3208             : 
    3209     3504094 :         if (btrfs_is_zoned(fs_info))
    3210           0 :                 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
    3211             :                                         ordered_extent->disk_num_bytes);
    3212             : 
    3213     7008188 :         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
    3214         259 :                 truncated = true;
    3215         259 :                 logical_len = ordered_extent->truncated_len;
    3216             :                 /* Truncated the entire extent, don't bother adding */
    3217         259 :                 if (!logical_len)
    3218         223 :                         goto out;
    3219             :         }
    3220             : 
    3221     7007742 :         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
    3222          84 :                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
    3223             : 
    3224          84 :                 btrfs_inode_safe_disk_i_size_write(inode, 0);
    3225          84 :                 if (freespace_inode)
    3226          24 :                         trans = btrfs_join_transaction_spacecache(root);
    3227             :                 else
    3228          60 :                         trans = btrfs_join_transaction(root);
    3229          84 :                 if (IS_ERR(trans)) {
    3230           0 :                         ret = PTR_ERR(trans);
    3231           0 :                         trans = NULL;
    3232           0 :                         goto out;
    3233             :                 }
    3234          84 :                 trans->block_rsv = &inode->block_rsv;
    3235          84 :                 ret = btrfs_update_inode_fallback(trans, root, inode);
    3236          84 :                 if (ret) /* -ENOMEM or corruption */
    3237           0 :                         btrfs_abort_transaction(trans, ret);
    3238          84 :                 goto out;
    3239             :         }
    3240             : 
    3241     3503787 :         clear_bits |= EXTENT_LOCKED;
    3242     3503787 :         lock_extent(io_tree, start, end, &cached_state);
    3243             : 
    3244     3503627 :         if (freespace_inode)
    3245          16 :                 trans = btrfs_join_transaction_spacecache(root);
    3246             :         else
    3247     3503611 :                 trans = btrfs_join_transaction(root);
    3248     3503700 :         if (IS_ERR(trans)) {
    3249           0 :                 ret = PTR_ERR(trans);
    3250           0 :                 trans = NULL;
    3251           0 :                 goto out;
    3252             :         }
    3253             : 
    3254     3503700 :         trans->block_rsv = &inode->block_rsv;
    3255             : 
    3256     7007400 :         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
    3257      158201 :                 compress_type = ordered_extent->compress_type;
    3258     7007400 :         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
    3259      291278 :                 BUG_ON(compress_type);
    3260      291278 :                 ret = btrfs_mark_extent_written(trans, inode,
    3261             :                                                 ordered_extent->file_offset,
    3262      291278 :                                                 ordered_extent->file_offset +
    3263             :                                                 logical_len);
    3264      291284 :                 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
    3265             :                                                   ordered_extent->disk_num_bytes);
    3266             :         } else {
    3267     3212422 :                 BUG_ON(root == fs_info->tree_root);
    3268     3212422 :                 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
    3269     3212641 :                 if (!ret) {
    3270     3212643 :                         clear_reserved_extent = false;
    3271     3212643 :                         btrfs_release_delalloc_bytes(fs_info,
    3272             :                                                 ordered_extent->disk_bytenr,
    3273             :                                                 ordered_extent->disk_num_bytes);
    3274             :                 }
    3275             :         }
    3276     3503900 :         unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
    3277             :                            ordered_extent->num_bytes, trans->transid);
    3278     3503792 :         if (ret < 0) {
    3279           0 :                 btrfs_abort_transaction(trans, ret);
    3280           0 :                 goto out;
    3281             :         }
    3282             : 
    3283     3503792 :         ret = add_pending_csums(trans, &ordered_extent->list);
    3284     3503918 :         if (ret) {
    3285           0 :                 btrfs_abort_transaction(trans, ret);
    3286           0 :                 goto out;
    3287             :         }
    3288             : 
    3289             :         /*
    3290             :          * If this is a new delalloc range, clear its new delalloc flag to
    3291             :          * update the inode's number of bytes. This needs to be done first
    3292             :          * before updating the inode item.
    3293             :          */
    3294     3503918 :         if ((clear_bits & EXTENT_DELALLOC_NEW) &&
    3295           0 :             !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
    3296     2427602 :                 clear_extent_bit(&inode->io_tree, start, end,
    3297             :                                  EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
    3298             :                                  &cached_state);
    3299             : 
    3300     3503891 :         btrfs_inode_safe_disk_i_size_write(inode, 0);
    3301     3503882 :         ret = btrfs_update_inode_fallback(trans, root, inode);
    3302     3503916 :         if (ret) { /* -ENOMEM or corruption */
    3303           0 :                 btrfs_abort_transaction(trans, ret);
    3304           0 :                 goto out;
    3305             :         }
    3306             :         ret = 0;
    3307     3504267 : out:
    3308     3504267 :         clear_extent_bit(&inode->io_tree, start, end, clear_bits,
    3309             :                          &cached_state);
    3310             : 
    3311     3504171 :         if (trans)
    3312     3503904 :                 btrfs_end_transaction(trans);
    3313             : 
    3314     3503907 :         if (ret || truncated) {
    3315         303 :                 u64 unwritten_start = start;
    3316             : 
    3317             :                 /*
    3318             :                  * If we failed to finish this ordered extent for any reason we
    3319             :                  * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
    3320             :                  * extent, and mark the inode with the error if it wasn't
    3321             :                  * already set.  Any error during writeback would have already
    3322             :                  * set the mapping error, so we need to set it if we're the ones
    3323             :                  * marking this ordered extent as failed.
    3324             :                  */
    3325         347 :                 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
    3326             :                                              &ordered_extent->flags))
    3327           0 :                         mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
    3328             : 
    3329         303 :                 if (truncated)
    3330         259 :                         unwritten_start += logical_len;
    3331         303 :                 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
    3332             : 
    3333             :                 /* Drop extent maps for the part of the extent we didn't write. */
    3334         303 :                 btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
    3335             : 
    3336             :                 /*
    3337             :                  * If the ordered extent had an IOERR or something else went
    3338             :                  * wrong we need to return the space for this ordered extent
    3339             :                  * back to the allocator.  We only free the extent in the
    3340             :                  * truncated case if we didn't write out the extent at all.
    3341             :                  *
    3342             :                  * If we made it past insert_reserved_file_extent before we
    3343             :                  * errored out then we don't need to do this as the accounting
    3344             :                  * has already been done.
    3345             :                  */
    3346         303 :                 if ((ret || !logical_len) &&
    3347         267 :                     clear_reserved_extent &&
    3348         267 :                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
    3349           0 :                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
    3350             :                         /*
    3351             :                          * Discard the range before returning it back to the
    3352             :                          * free space pool
    3353             :                          */
    3354         183 :                         if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
    3355           2 :                                 btrfs_discard_extent(fs_info,
    3356             :                                                 ordered_extent->disk_bytenr,
    3357             :                                                 ordered_extent->disk_num_bytes,
    3358             :                                                 NULL);
    3359         183 :                         btrfs_free_reserved_extent(fs_info,
    3360             :                                         ordered_extent->disk_bytenr,
    3361             :                                         ordered_extent->disk_num_bytes, 1);
    3362             :                 }
    3363             :         }
    3364             : 
    3365             :         /*
    3366             :          * This needs to be done to make sure anybody waiting knows we are done
    3367             :          * updating everything for this ordered extent.
    3368             :          */
    3369     3503907 :         btrfs_remove_ordered_extent(inode, ordered_extent);
    3370             : 
    3371             :         /* once for us */
    3372     3504182 :         btrfs_put_ordered_extent(ordered_extent);
    3373             :         /* once for the tree */
    3374     3504166 :         btrfs_put_ordered_extent(ordered_extent);
    3375             : 
    3376     3504034 :         return ret;
    3377             : }
    3378             : 
    3379     3504269 : int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
    3380             : {
    3381     3504269 :         if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
    3382           0 :             !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
    3383           0 :                 btrfs_finish_ordered_zoned(ordered);
    3384     3504269 :         return btrfs_finish_one_ordered(ordered);
    3385             : }
    3386             : 
    3387           0 : void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
    3388             :                                           struct page *page, u64 start,
    3389             :                                           u64 end, bool uptodate)
    3390             : {
    3391           0 :         trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
    3392             : 
    3393           0 :         btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
    3394           0 : }
    3395             : 
    3396             : /*
    3397             :  * Verify the checksum for a single sector without any extra action that depends
    3398             :  * on the type of I/O.
    3399             :  */
    3400    22639463 : int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
    3401             :                             u32 pgoff, u8 *csum, const u8 * const csum_expected)
    3402             : {
    3403    22639463 :         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    3404    22639463 :         char *kaddr;
    3405             : 
    3406    22639463 :         ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
    3407             : 
    3408    22639463 :         shash->tfm = fs_info->csum_shash;
    3409             : 
    3410    22639463 :         kaddr = kmap_local_page(page) + pgoff;
    3411    22639463 :         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
    3412    22642613 :         kunmap_local(kaddr);
    3413             : 
    3414    45285226 :         if (memcmp(csum, csum_expected, fs_info->csum_size))
    3415          38 :                 return -EIO;
    3416             :         return 0;
    3417             : }
    3418             : 
    3419             : /*
    3420             :  * Verify the checksum of a single data sector.
    3421             :  *
    3422             :  * @bbio:       btrfs_bio which contains the csum
    3423             :  * @dev:        device the sector is on
    3424             :  * @bio_offset: offset from the beginning of the bio (in bytes)
    3425             :  * @bv:         bio_vec to check
    3426             :  *
    3427             :  * Check if the checksum on a data block is valid.  When a checksum mismatch is
    3428             :  * detected, report the error and fill the corrupted range with zero.
    3429             :  *
    3430             :  * Return %true if the sector is ok or had no checksum to start with, else %false.
    3431             :  */
    3432    21942569 : bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
    3433             :                         u32 bio_offset, struct bio_vec *bv)
    3434             : {
    3435    21942569 :         struct btrfs_inode *inode = bbio->inode;
    3436    21942569 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    3437    21942569 :         u64 file_offset = bbio->file_offset + bio_offset;
    3438    21942569 :         u64 end = file_offset + bv->bv_len - 1;
    3439    21942569 :         u8 *csum_expected;
    3440    21942569 :         u8 csum[BTRFS_CSUM_SIZE];
    3441             : 
    3442    21942569 :         ASSERT(bv->bv_len == fs_info->sectorsize);
    3443             : 
    3444    21942569 :         if (!bbio->csum)
    3445             :                 return true;
    3446             : 
    3447    22992731 :         if (btrfs_is_data_reloc_root(inode->root) &&
    3448     1282498 :             test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
    3449             :                            1, NULL)) {
    3450             :                 /* Skip the range without csum for data reloc inode */
    3451      121428 :                 clear_extent_bits(&inode->io_tree, file_offset, end,
    3452             :                                   EXTENT_NODATASUM);
    3453      121428 :                 return true;
    3454             :         }
    3455             : 
    3456    21588805 :         csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
    3457    21588805 :                                 fs_info->csum_size;
    3458    21588805 :         if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
    3459             :                                     csum_expected))
    3460           6 :                 goto zeroit;
    3461             :         return true;
    3462             : 
    3463             : zeroit:
    3464           6 :         btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
    3465           6 :                                     bbio->mirror_num);
    3466           6 :         if (dev)
    3467           6 :                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
    3468           6 :         memzero_bvec(bv);
    3469           6 :         return false;
    3470             : }
    3471             : 
    3472             : /*
    3473             :  * btrfs_add_delayed_iput - perform a delayed iput on @inode
    3474             :  *
    3475             :  * @inode: The inode we want to perform iput on
    3476             :  *
    3477             :  * This function uses the generic vfs_inode::i_count to track whether we should
    3478             :  * just decrement it (in case it's > 1) or if this is the last iput then link
    3479             :  * the inode to the delayed iput machinery. Delayed iputs are processed at
    3480             :  * transaction commit time/superblock commit/cleaner kthread.
    3481             :  */
    3482    10683650 : void btrfs_add_delayed_iput(struct btrfs_inode *inode)
    3483             : {
    3484    10683650 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    3485    10683650 :         unsigned long flags;
    3486             : 
    3487    21367731 :         if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
    3488             :                 return;
    3489             : 
    3490       95012 :         atomic_inc(&fs_info->nr_delayed_iputs);
    3491             :         /*
    3492             :          * Need to be irq safe here because we can be called from either an irq
    3493             :          * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
    3494             :          * context.
    3495             :          */
    3496       95011 :         spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
    3497       95017 :         ASSERT(list_empty(&inode->delayed_iput));
    3498       95017 :         list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
    3499       95017 :         spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
    3500       95016 :         if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
    3501       80828 :                 wake_up_process(fs_info->cleaner_kthread);
    3502             : }
    3503             : 
    3504       95017 : static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
    3505             :                                     struct btrfs_inode *inode)
    3506             : {
    3507       95017 :         list_del_init(&inode->delayed_iput);
    3508       95017 :         spin_unlock_irq(&fs_info->delayed_iput_lock);
    3509       95017 :         iput(&inode->vfs_inode);
    3510       95017 :         if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
    3511       41445 :                 wake_up(&fs_info->delayed_iputs_wait);
    3512       95017 :         spin_lock_irq(&fs_info->delayed_iput_lock);
    3513       95017 : }
    3514             : /* Run the delayed iput of @inode right away if it currently has one queued. */
    3515     1885386 : static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
    3516             :                                    struct btrfs_inode *inode)
    3517             : {
    3518     1885386 :         if (!list_empty(&inode->delayed_iput)) { /* lockless peek, re-checked under the lock */
    3519           0 :                 spin_lock_irq(&fs_info->delayed_iput_lock);
    3520           0 :                 if (!list_empty(&inode->delayed_iput))
    3521           0 :                         run_delayed_iput_locked(fs_info, inode);
    3522           0 :                 spin_unlock_irq(&fs_info->delayed_iput_lock);
    3523             :         }
    3524     1885386 : }
    3525             : /* Run delayed iputs until the fs_info->delayed_iputs list is found empty. */
    3526     4172871 : void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
    3527             : {
    3528             :         /*
    3529             :          * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
    3530             :          * calls btrfs_add_delayed_iput() and that needs to lock
    3531             :          * fs_info->delayed_iput_lock. So we need to disable irqs here to
    3532             :          * prevent a deadlock.
    3533             :          */
    3534     4172871 :         spin_lock_irq(&fs_info->delayed_iput_lock);
    3535     4267888 :         while (!list_empty(&fs_info->delayed_iputs)) {
    3536       95017 :                 struct btrfs_inode *inode;
    3537             : 
    3538       95017 :                 inode = list_first_entry(&fs_info->delayed_iputs,
    3539             :                                 struct btrfs_inode, delayed_iput);
    3540       95017 :                 run_delayed_iput_locked(fs_info, inode); /* drops and retakes the lock internally */
    3541       95017 :                 if (need_resched()) {
    3542          13 :                         spin_unlock_irq(&fs_info->delayed_iput_lock); /* release the lock before cond_resched() */
    3543          13 :                         cond_resched();
    3544          13 :                         spin_lock_irq(&fs_info->delayed_iput_lock);
    3545             :                 }
    3546             :         }
    3547     4172871 :         spin_unlock_irq(&fs_info->delayed_iput_lock);
    3548     4172871 : }
    3549             : 
    3550             : /*
    3551             :  * Wait for flushing all delayed iputs
    3552             :  *
    3553             :  * @fs_info:  the filesystem
    3554             :  *
    3555             :  * This will wait on any delayed iputs that are currently running with KILLABLE
    3556             :  * set.  Once they are all done running we will return, unless we are killed in
    3557             :  * which case we return -EINTR. This helps in user operations like fallocate etc
    3558             :  * that might get blocked on the iputs.
    3559             :  *
    3560             :  * Return -EINTR if we were killed, 0 once nr_delayed_iputs has reached zero
    3561             :  */
    3562      173000 : int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
    3563             : {
    3564      173001 :         int ret = wait_event_killable(fs_info->delayed_iputs_wait,
    3565             :                         atomic_read(&fs_info->nr_delayed_iputs) == 0);
    3566           1 :         if (ret) /* any non-zero wait result is reported as -EINTR */
    3567           0 :                 return -EINTR;
    3568             :         return 0;
    3569             : }
    3570             : 
    3571             : /*
    3572             :  * This creates an orphan entry for the given inode in case something goes wrong
    3573             :  * in the middle of an unlink.  Returns 0, also when the entry already exists.
    3574             :  */
    3575     1776728 : int btrfs_orphan_add(struct btrfs_trans_handle *trans,
    3576             :                      struct btrfs_inode *inode)
    3577             : {
    3578     1776728 :         int ret;
    3579             : 
    3580     1776728 :         ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
    3581     1776737 :         if (ret && ret != -EEXIST) { /* -EEXIST is tolerated, anything else is fatal */
    3582           0 :                 btrfs_abort_transaction(trans, ret);
    3583           0 :                 return ret;
    3584             :         }
    3585             : 
    3586             :         return 0;
    3587             : }
    3588             : 
    3589             : /*
    3590             :  * We have done the delete so we can go ahead and remove the orphan item for
    3591             :  * this particular inode.  Returns the result of btrfs_del_orphan_item().
    3592             :  */
    3593             : static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
    3594             :                             struct btrfs_inode *inode)
    3595             : {
    3596     1776740 :         return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
    3597             : }
    3598             : 
    3599             : /*
    3600             :  * this cleans up any orphan items left in this root from its last use.  It
    3601             :  * returns 0 at once if BTRFS_ROOT_ORPHAN_CLEANUP is already set, else 0 or a negative errno.
    3602             :  */
    3603       22228 : int btrfs_orphan_cleanup(struct btrfs_root *root)
    3604             : {
    3605       22228 :         struct btrfs_fs_info *fs_info = root->fs_info;
    3606       22228 :         struct btrfs_path *path;
    3607       22228 :         struct extent_buffer *leaf;
    3608       22228 :         struct btrfs_key key, found_key;
    3609       22228 :         struct btrfs_trans_handle *trans;
    3610       22228 :         struct inode *inode;
    3611       22228 :         u64 last_objectid = 0;
    3612       22228 :         int ret = 0, nr_unlink = 0;
    3613             : 
    3614       22228 :         if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) /* bit already set: nothing to do */
    3615             :                 return 0;
    3616             : 
    3617       14840 :         path = btrfs_alloc_path();
    3618       14840 :         if (!path) {
    3619           0 :                 ret = -ENOMEM;
    3620           0 :                 goto out;
    3621             :         }
    3622       14840 :         path->reada = READA_BACK;
    3623             : 
    3624       14840 :         key.objectid = BTRFS_ORPHAN_OBJECTID;
    3625       14840 :         key.type = BTRFS_ORPHAN_ITEM_KEY;
    3626       14840 :         key.offset = (u64)-1; /* search from the largest possible offset and step back one slot */
    3627             : 
    3628       14978 :         while (1) {
    3629       14978 :                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    3630       14978 :                 if (ret < 0)
    3631           0 :                         goto out;
    3632             : 
    3633             :                 /*
    3634             :                  * if ret == 0 means we found what we were searching for, which
    3635             :                  * is weird, but possible, so only screw with path if we didn't
    3636             :                  * find the key and see if we have stuff that matches
    3637             :                  */
    3638       14978 :                 if (ret > 0) {
    3639       14872 :                         ret = 0;
    3640       14872 :                         if (path->slots[0] == 0) /* no previous slot in this leaf: stop */
    3641             :                                 break;
    3642       14872 :                         path->slots[0]--;
    3643             :                 }
    3644             : 
    3645             :                 /* pull out the item */
    3646       14978 :                 leaf = path->nodes[0];
    3647       14978 :                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    3648             : 
    3649             :                 /* make sure the item matches what we want */
    3650       14978 :                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
    3651             :                         break;
    3652         138 :                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
    3653             :                         break;
    3654             : 
    3655             :                 /* release the path since we're done with it */
    3656         138 :                 btrfs_release_path(path);
    3657             : 
    3658             :                 /*
    3659             :                  * this is where we are basically btrfs_lookup, without the
    3660             :                  * crossing root thing.  we store the inode number in the
    3661             :                  * offset of the orphan item.
    3662             :                  */
    3663             : 
    3664         138 :                 if (found_key.offset == last_objectid) { /* same orphan as the previous pass: it was not removed */
    3665           0 :                         btrfs_err(fs_info,
    3666             :                                   "Error removing orphan entry, stopping orphan cleanup");
    3667           0 :                         ret = -EINVAL;
    3668           0 :                         goto out;
    3669             :                 }
    3670             : 
    3671         138 :                 last_objectid = found_key.offset;
    3672             : 
    3673         138 :                 found_key.objectid = found_key.offset;
    3674         138 :                 found_key.type = BTRFS_INODE_ITEM_KEY;
    3675         138 :                 found_key.offset = 0;
    3676         138 :                 inode = btrfs_iget(fs_info->sb, last_objectid, root);
    3677         138 :                 if (IS_ERR(inode)) {
    3678         132 :                         ret = PTR_ERR(inode);
    3679         132 :                         inode = NULL;
    3680         132 :                         if (ret != -ENOENT) /* -ENOENT is handled below with inode == NULL */
    3681           0 :                                 goto out;
    3682             :                 }
    3683             : 
    3684         138 :                 if (!inode && root == fs_info->tree_root) {
    3685         132 :                         struct btrfs_root *dead_root;
    3686         132 :                         int is_dead_root = 0;
    3687             : 
    3688             :                         /*
    3689             :                          * This is an orphan in the tree root. Currently these
    3690             :                          * could come from 2 sources:
    3691             :                          *  a) a root (snapshot/subvolume) deletion in progress
    3692             :                          *  b) a free space cache inode
    3693             :                          * We need to distinguish those two, as the orphan item
    3694             :                          * for a root must not get deleted before the deletion
    3695             :                          * of the snapshot/subvolume's tree completes.
    3696             :                          *
    3697             :                          * btrfs_find_orphan_roots() ran before us, which has
    3698             :                          * found all deleted roots and loaded them into
    3699             :                          * fs_info->fs_roots_radix. So here we can find if an
    3700             :                          * orphan item corresponds to a deleted root by looking
    3701             :                          * up the root from that radix tree.
    3702             :                          */
    3703             : 
    3704         132 :                         spin_lock(&fs_info->fs_roots_radix_lock);
    3705         132 :                         dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
    3706         132 :                                                          (unsigned long)found_key.objectid);
    3707         132 :                         if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
    3708         132 :                                 is_dead_root = 1;
    3709         132 :                         spin_unlock(&fs_info->fs_roots_radix_lock);
    3710             : 
    3711         132 :                         if (is_dead_root) {
    3712             :                                 /* prevent this orphan from being found again */
    3713         132 :                                 key.offset = found_key.objectid - 1;
    3714         132 :                                 continue;
    3715             :                         }
    3716             : 
    3717             :                 }
    3718             : 
    3719             :                 /*
    3720             :                  * If we have an inode with links, there are a couple of
    3721             :                  * possibilities:
    3722             :                  *
    3723             :                  * 1. We were halfway through creating fsverity metadata for the
    3724             :                  * file. In that case, the orphan item represents incomplete
    3725             :                  * fsverity metadata which must be cleaned up with
    3726             :                  * btrfs_drop_verity_items and deleting the orphan item.
    3727             :                  *
    3728             :                  * 2. Old kernels (before v3.12) used to create an
    3729             :                  * orphan item for truncate indicating that there were possibly
    3730             :                  * extent items past i_size that needed to be deleted. In v3.12,
    3731             :                  * truncate was changed to update i_size in sync with the extent
    3732             :                  * items, but the (useless) orphan item was still created. Since
    3733             :                  * v4.18, we don't create the orphan item for truncate at all.
    3734             :                  *
    3735             :                  * So, this item could mean that we need to do a truncate, but
    3736             :                  * only if this filesystem was last used on a pre-v3.12 kernel
    3737             :                  * and was not cleanly unmounted. The odds of that are quite
    3738             :                  * slim, and it's a pain to do the truncate now, so just delete
    3739             :                  * the orphan item.
    3740             :                  *
    3741             :                  * It's also possible that this orphan item was supposed to be
    3742             :                  * deleted but wasn't. The inode number may have been reused,
    3743             :                  * but either way, we can delete the orphan item.
    3744             :                  */
    3745           6 :                 if (!inode || inode->i_nlink) {
    3746           0 :                         if (inode) {
    3747           0 :                                 ret = btrfs_drop_verity_items(BTRFS_I(inode));
    3748           0 :                                 iput(inode);
    3749           0 :                                 inode = NULL;
    3750           0 :                                 if (ret)
    3751             :                                         goto out;
    3752             :                         }
    3753           0 :                         trans = btrfs_start_transaction(root, 1);
    3754           0 :                         if (IS_ERR(trans)) {
    3755           0 :                                 ret = PTR_ERR(trans);
    3756           0 :                                 goto out;
    3757             :                         }
    3758           0 :                         btrfs_debug(fs_info, "auto deleting %Lu",
    3759             :                                     found_key.objectid);
    3760           0 :                         ret = btrfs_del_orphan_item(trans, root,
    3761             :                                                     found_key.objectid);
    3762           0 :                         btrfs_end_transaction(trans);
    3763           0 :                         if (ret)
    3764           0 :                                 goto out;
    3765           0 :                         continue;
    3766             :                 }
    3767             : 
    3768           6 :                 nr_unlink++;
    3769             : 
    3770             :                 /* this will do delete_inode and everything for us */
    3771           6 :                 iput(inode);
    3772             :         }
    3773             :         /* release the path since we're done with it */
    3774       14840 :         btrfs_release_path(path);
    3775             : 
    3776       29680 :         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
    3777           0 :                 trans = btrfs_join_transaction(root); /* NOTE(review): joined and ended with no work in between - confirm intent */
    3778           0 :                 if (!IS_ERR(trans))
    3779           0 :                         btrfs_end_transaction(trans);
    3780             :         }
    3781             : 
    3782             :         if (nr_unlink)
    3783             :                 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
    3784             : 
    3785       14840 : out:
    3786       14840 :         if (ret)
    3787           0 :                 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
    3788       14840 :         btrfs_free_path(path); /* also reached with path == NULL when the allocation failed */
    3789       14840 :         return ret;
    3790             : }
    3791             : 
    3792             : /*
    3793             :  * very simple check to peek ahead in the leaf looking for xattrs.  Returns 0
    3794             :  * when there can't be any acls and 1 when the inode may have them.
    3795             :  * *first_xattr_slot is the first xattr slot seen, the slot where the scan gave up, or -1.
    3796             :  * slot is the slot the inode is in, objectid is the objectid of the inode
    3797             :  */
    3798      554123 : static noinline int acls_after_inode_item(struct extent_buffer *leaf,
    3799             :                                           int slot, u64 objectid,
    3800             :                                           int *first_xattr_slot)
    3801             : {
    3802      554123 :         u32 nritems = btrfs_header_nritems(leaf);
    3803      554123 :         struct btrfs_key found_key;
    3804      554123 :         static u64 xattr_access = 0;
    3805      554123 :         static u64 xattr_default = 0;
    3806      554123 :         int scanned = 0;
    3807             : 
    3808      554123 :         if (!xattr_access) { /* both name hashes are computed on first use and kept in the statics */
    3809           0 :                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
    3810             :                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
    3811           0 :                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
    3812             :                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
    3813             :         }
    3814             : 
    3815      554123 :         slot++; /* start with the item right after the given slot */
    3816      554123 :         *first_xattr_slot = -1;
    3817      803722 :         while (slot < nritems) {
    3818      710507 :                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
    3819             : 
    3820             :                 /* we found a different objectid, there must not be acls */
    3821      710528 :                 if (found_key.objectid != objectid)
    3822             :                         return 0;
    3823             : 
    3824             :                 /* we found an xattr whose key offset matches an acl name hash, assume we've got an acl */
    3825      666193 :                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
    3826      245662 :                         if (*first_xattr_slot == -1)
    3827      244973 :                                 *first_xattr_slot = slot;
    3828      245662 :                         if (found_key.offset == xattr_access ||
    3829      245659 :                             found_key.offset == xattr_default)
    3830             :                                 return 1;
    3831             :                 }
    3832             : 
    3833             :                 /*
    3834             :                  * we found a key greater than an xattr key, there can't
    3835             :                  * be any acls later on
    3836             :                  */
    3837      666190 :                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
    3838             :                         return 0;
    3839             : 
    3840      249604 :                 slot++;
    3841      249604 :                 scanned++;
    3842             : 
    3843             :                 /*
    3844             :                  * it goes inode, inode backrefs, xattrs, extents,
    3845             :                  * so if there are a ton of hard links to an inode there can
    3846             :                  * be a lot of backrefs.  Don't waste time searching too hard,
    3847             :                  * this is just an optimization
    3848             :                  */
    3849      249604 :                 if (scanned >= 8)
    3850             :                         break;
    3851             :         }
    3852             :         /* we hit the end of the leaf (or the scan limit) before we found an
    3853             :          * acl xattr or something larger than an xattr.  We have to assume
    3854             :          * the inode has acls
    3855             :          */
    3856       93220 :         if (*first_xattr_slot == -1)
    3857       45164 :                 *first_xattr_slot = slot;
    3858             :         return 1;
    3859             : }
    3860             : 
    3861             : /*
    3862             :  * read an inode from the btree into the in-memory inode; in_path may be NULL.  Returns 0 or a negative errno.
    3863             :  */
    3864      559717 : static int btrfs_read_locked_inode(struct inode *inode,
    3865             :                                    struct btrfs_path *in_path)
    3866             : {
    3867      559717 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    3868      559717 :         struct btrfs_path *path = in_path;
    3869      559717 :         struct extent_buffer *leaf;
    3870      559717 :         struct btrfs_inode_item *inode_item;
    3871      559717 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    3872      559717 :         struct btrfs_key location;
    3873      559717 :         unsigned long ptr;
    3874      559717 :         int maybe_acls;
    3875      559717 :         u32 rdev;
    3876      559717 :         int ret;
    3877      559717 :         bool filled = false;
    3878      559717 :         int first_xattr_slot;
    3879             : 
    3880      559717 :         ret = btrfs_fill_inode(inode, &rdev); /* NOTE(review): presumably fills from the delayed inode - confirm */
    3881      559703 :         if (!ret)
    3882          14 :                 filled = true; /* the inode item fields are not parsed from the leaf below */
    3883             : 
    3884      559703 :         if (!path) { /* no caller path: use a private one, freed before returning */
    3885      559687 :                 path = btrfs_alloc_path();
    3886      559688 :                 if (!path)
    3887             :                         return -ENOMEM;
    3888             :         }
    3889             : 
    3890      559704 :         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
    3891             : 
    3892      559704 :         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
    3893      559713 :         if (ret) {
    3894        5571 :                 if (path != in_path) /* only free a path allocated here */
    3895        5571 :                         btrfs_free_path(path);
    3896        5571 :                 return ret;
    3897             :         }
    3898             : 
    3899      554142 :         leaf = path->nodes[0];
    3900             : 
    3901      554142 :         if (filled)
    3902          14 :                 goto cache_index;
    3903             : 
    3904      554128 :         inode_item = btrfs_item_ptr(leaf, path->slots[0],
    3905             :                                     struct btrfs_inode_item);
    3906      554121 :         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
    3907      554123 :         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
    3908      554115 :         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
    3909      554122 :         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
    3910      554119 :         btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
    3911      554117 :         btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
    3912      554117 :                         round_up(i_size_read(inode), fs_info->sectorsize));
    3913             : 
    3914      554107 :         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
    3915      554112 :         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
    3916             : 
    3917      554115 :         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
    3918      554121 :         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
    3919             : 
    3920      554121 :         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
    3921      554116 :         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
    3922             : 
    3923     1108233 :         BTRFS_I(inode)->i_otime.tv_sec =
    3924      554119 :                 btrfs_timespec_sec(leaf, &inode_item->otime);
    3925      554114 :         BTRFS_I(inode)->i_otime.tv_nsec =
    3926      554113 :                 btrfs_timespec_nsec(leaf, &inode_item->otime);
    3927             : 
    3928      554113 :         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
    3929      554100 :         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
    3930      554110 :         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
    3931             : 
    3932      554116 :         inode_set_iversion_queried(inode,
    3933             :                                    btrfs_inode_sequence(leaf, inode_item));
    3934      554112 :         inode->i_generation = BTRFS_I(inode)->generation;
    3935      554112 :         inode->i_rdev = 0;
    3936      554112 :         rdev = btrfs_inode_rdev(leaf, inode_item); /* passed to init_special_inode() for special files below */
    3937             : 
    3938      554111 :         BTRFS_I(inode)->index_cnt = (u64)-1;
    3939      554111 :         btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
    3940             :                                 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
    3941             : 
    3942      554130 : cache_index:
    3943             :         /*
    3944             :          * If we were modified in the current generation and evicted from memory
    3945             :          * and then re-read we need to do a full sync since we don't have any
    3946             :          * idea about which extents were modified before we were evicted from
    3947             :          * cache.
    3948             :          *
    3949             :          * This is required for both inode re-read from disk and delayed inode
    3950             :          * in delayed_nodes_tree.
    3951             :          */
    3952      554130 :         if (BTRFS_I(inode)->last_trans == fs_info->generation)
    3953      122882 :                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    3954      122882 :                         &BTRFS_I(inode)->runtime_flags);
    3955             : 
    3956             :         /*
    3957             :          * We don't persist the id of the transaction where an unlink operation
    3958             :          * against the inode was last made. So here we assume the inode might
    3959             :          * have been evicted, and therefore the exact value of last_unlink_trans
    3960             :          * lost, and set it to last_trans to avoid metadata inconsistencies
    3961             :          * between the inode and its parent if the inode is fsync'ed and the log
    3962             :          * replayed. For example, in the scenario:
    3963             :          *
    3964             :          * touch mydir/foo
    3965             :          * ln mydir/foo mydir/bar
    3966             :          * sync
    3967             :          * unlink mydir/bar
    3968             :          * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
    3969             :          * xfs_io -c fsync mydir/foo
    3970             :          * <power failure>
    3971             :          * mount fs, triggers fsync log replay
    3972             :          *
    3973             :          * We must make sure that when we fsync our inode foo we also log its
    3974             :          * parent inode, otherwise after log replay the parent still has the
    3975             :          * dentry with the "bar" name but our inode foo has a link count of 1
    3976             :          * and doesn't have an inode ref with the name "bar" anymore.
    3977             :          *
    3978             :          * Setting last_unlink_trans to last_trans is a pessimistic approach,
    3979             :          * but it guarantees correctness at the expense of occasional full
    3980             :          * transaction commits on fsync if our inode is a directory, or if our
    3981             :          * inode is not a directory, logging its parent unnecessarily.
    3982             :          */
    3983      554130 :         BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
    3984             : 
    3985             :         /*
    3986             :          * Same logic as for last_unlink_trans. We don't persist the generation
    3987             :          * of the last transaction where this inode was used for a reflink
    3988             :          * operation, so after eviction and reloading the inode we must be
    3989             :          * pessimistic and assume the last transaction that modified the inode.
    3990             :          */
    3991      554130 :         BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
    3992             : 
    3993      554130 :         path->slots[0]++; /* step to the item right after the inode item */
    3994      554130 :         if (inode->i_nlink != 1 || /* dir_index is only cached for single-link inodes */
    3995      439488 :             path->slots[0] >= btrfs_header_nritems(leaf))
    3996      134877 :                 goto cache_acl;
    3997             : 
    3998      419253 :         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
    3999      419257 :         if (location.objectid != btrfs_ino(BTRFS_I(inode)))
    4000         111 :                 goto cache_acl;
    4001             : 
    4002      419146 :         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
    4003      419130 :         if (location.type == BTRFS_INODE_REF_KEY) {
    4004      419120 :                 struct btrfs_inode_ref *ref;
    4005             : 
    4006      419120 :                 ref = (struct btrfs_inode_ref *)ptr;
    4007      419120 :                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
    4008          10 :         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
    4009           1 :                 struct btrfs_inode_extref *extref;
    4010             : 
    4011           1 :                 extref = (struct btrfs_inode_extref *)ptr;
    4012           1 :                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
    4013             :                                                                      extref);
    4014             :         }
    4015           9 : cache_acl:
    4016             :         /*
    4017             :          * try to precache a NULL acl entry for files that don't have
    4018             :          * any xattrs or acls
    4019             :          */
    4020      554121 :         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
    4021             :                         btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
    4022      554142 :         if (first_xattr_slot != -1) {
    4023      290137 :                 path->slots[0] = first_xattr_slot;
    4024      290137 :                 ret = btrfs_load_inode_props(inode, path);
    4025      290137 :                 if (ret) /* only logged: the function still returns 0 below */
    4026           0 :                         btrfs_err(fs_info,
    4027             :                                   "error loading props for ino %llu (root %llu): %d",
    4028             :                                   btrfs_ino(BTRFS_I(inode)),
    4029             :                                   root->root_key.objectid, ret);
    4030             :         }
    4031      554142 :         if (path != in_path) /* a caller-supplied path is left for the caller to free */
    4032      554135 :                 btrfs_free_path(path);
    4033             : 
    4034      554142 :         if (!maybe_acls)
    4035      460922 :                 cache_no_acl(inode);
    4036             : 
    4037      554142 :         switch (inode->i_mode & S_IFMT) { /* install the operation tables matching the file type */
    4038      498699 :         case S_IFREG:
    4039      498699 :                 inode->i_mapping->a_ops = &btrfs_aops;
    4040      498699 :                 inode->i_fop = &btrfs_file_operations;
    4041      498699 :                 inode->i_op = &btrfs_file_inode_operations;
    4042      498699 :                 break;
    4043       29312 :         case S_IFDIR:
    4044       29312 :                 inode->i_fop = &btrfs_dir_file_operations;
    4045       29312 :                 inode->i_op = &btrfs_dir_inode_operations;
    4046       29312 :                 break;
    4047        7435 :         case S_IFLNK:
    4048        7435 :                 inode->i_op = &btrfs_symlink_inode_operations;
    4049        7435 :                 inode_nohighmem(inode);
    4050        7435 :                 inode->i_mapping->a_ops = &btrfs_aops;
    4051        7435 :                 break;
    4052       18696 :         default:
    4053       18696 :                 inode->i_op = &btrfs_special_inode_operations;
    4054       18696 :                 init_special_inode(inode, inode->i_mode, rdev);
    4055       18696 :                 break;
    4056             :         }
    4057             : 
    4058      554142 :         btrfs_sync_inode_flags_to_i_flags(inode);
    4059      554142 :         return 0;
    4060             : }
    4061             : 
    4062             : /*
    4063             :  * Given a leaf and an inode, copy the inode fields into the inode item
                     :  * that the caller located in that leaf.
                     :  *
                     :  * @trans: transaction handle; trans->transid is stored as the item's transid
                     :  * @leaf:  extent buffer used to initialize the map token
                     :  * @item:  inode item that receives the values
                     :  * @inode: VFS inode the values are read from
                     :  *
                     :  * The size stored is BTRFS_I(inode)->disk_i_size and the flags stored are
                     :  * the combination of BTRFS_I(inode)->flags and ->ro_flags.  The block
                     :  * group field is always written as 0.
    4064             :  */
    4065     3416018 : static void fill_inode_item(struct btrfs_trans_handle *trans,
    4066             :                             struct extent_buffer *leaf,
    4067             :                             struct btrfs_inode_item *item,
    4068             :                             struct inode *inode)
    4069             : {
    4070     3416018 :         struct btrfs_map_token token;
    4071     3416018 :         u64 flags;
    4072             : 
    4073     3416018 :         btrfs_init_map_token(&token, leaf);
    4074             : 
    4075     3416011 :         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
    4076     3416012 :         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
    4077     3416015 :         btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
    4078     3416014 :         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
    4079     3416014 :         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
    4080             :         /* atime, mtime and ctime come from the VFS inode, otime from the btrfs inode */
    4081     3416016 :         btrfs_set_token_timespec_sec(&token, &item->atime,
    4082     3416016 :                                      inode->i_atime.tv_sec);
    4083     3416018 :         btrfs_set_token_timespec_nsec(&token, &item->atime,
    4084     3416018 :                                       inode->i_atime.tv_nsec);
    4085             : 
    4086     3416018 :         btrfs_set_token_timespec_sec(&token, &item->mtime,
    4087     3416018 :                                      inode->i_mtime.tv_sec);
    4088     3416017 :         btrfs_set_token_timespec_nsec(&token, &item->mtime,
    4089     3416017 :                                       inode->i_mtime.tv_nsec);
    4090             : 
    4091     3416016 :         btrfs_set_token_timespec_sec(&token, &item->ctime,
    4092     3416016 :                                      inode->i_ctime.tv_sec);
    4093     3416013 :         btrfs_set_token_timespec_nsec(&token, &item->ctime,
    4094     3416013 :                                       inode->i_ctime.tv_nsec);
    4095             : 
    4096     3416015 :         btrfs_set_token_timespec_sec(&token, &item->otime,
    4097     3416015 :                                      BTRFS_I(inode)->i_otime.tv_sec);
    4098     3416014 :         btrfs_set_token_timespec_nsec(&token, &item->otime,
    4099     3416014 :                                       BTRFS_I(inode)->i_otime.tv_nsec);
    4100             : 
    4101     3416017 :         btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
    4102     3416017 :         btrfs_set_token_inode_generation(&token, item,
    4103             :                                          BTRFS_I(inode)->generation);
    4104     3416018 :         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
    4105     3416018 :         btrfs_set_token_inode_transid(&token, item, trans->transid);
    4106     3416019 :         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
    4107     3416013 :         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
    4108             :                                           BTRFS_I(inode)->ro_flags);
    4109     3416013 :         btrfs_set_token_inode_flags(&token, item, flags);
    4110     3416014 :         btrfs_set_token_inode_block_group(&token, item, 0);
    4111     3416015 : }
    4112             : 
    4113             : /*
    4114             :  * Copy everything in the in-memory inode into the btree by looking up its
                     :  * inode item (keyed by inode->location) in @root and filling it in place.
                     :  *
                     :  * On success the leaf is marked dirty and btrfs_set_inode_last_trans() is
                     :  * called for the inode.
                     :  *
                     :  * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
                     :  * btrfs_lookup_inode() returns a positive value, or the negative value
                     :  * returned by btrfs_lookup_inode().
    4115             :  */
    4116      160837 : static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
    4117             :                                 struct btrfs_root *root,
    4118             :                                 struct btrfs_inode *inode)
    4119             : {
    4120      160837 :         struct btrfs_inode_item *inode_item;
    4121      160837 :         struct btrfs_path *path;
    4122      160837 :         struct extent_buffer *leaf;
    4123      160837 :         int ret;
    4124             : 
    4125      160837 :         path = btrfs_alloc_path();
    4126      160837 :         if (!path)
    4127             :                 return -ENOMEM;
    4128             : 
    4129      160837 :         ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
    4130      160837 :         if (ret) {
    4131           0 :                 if (ret > 0)
    4132           0 :                         ret = -ENOENT;
    4133           0 :                 goto failed;
    4134             :         }
    4135             :         /* The path now refers to the inode item; overwrite it from the in-memory inode. */
    4136      160837 :         leaf = path->nodes[0];
    4137      160837 :         inode_item = btrfs_item_ptr(leaf, path->slots[0],
    4138             :                                     struct btrfs_inode_item);
    4139             : 
    4140      160837 :         fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
    4141      160837 :         btrfs_mark_buffer_dirty(leaf);
    4142      160837 :         btrfs_set_inode_last_trans(trans, inode);
    4143      160837 :         ret = 0;
    4144      160837 : failed:
    4145      160837 :         btrfs_free_path(path);
    4146      160837 :         return ret;
    4147             : }
    4148             : 
    4149             : /*
    4150             :  * Copy everything in the in-memory inode into the btree, going through the
                     :  * delayed inode update when that is allowed.
                     :  *
                     :  * The delayed path is used unless the inode is a free space inode, @root is
                     :  * the data relocation root, or BTRFS_FS_LOG_RECOVERING is set in the fs
                     :  * flags; in those cases the item is written directly with
                     :  * btrfs_update_inode_item().  On the delayed path the root times are
                     :  * updated first and btrfs_set_inode_last_trans() is only called when the
                     :  * delayed update succeeded.
                     :  *
                     :  * Returns the result of whichever update path was taken.
    4151             :  */
    4152    36660898 : noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
    4153             :                                 struct btrfs_root *root,
    4154             :                                 struct btrfs_inode *inode)
    4155             : {
    4156    36660898 :         struct btrfs_fs_info *fs_info = root->fs_info;
    4157    36660898 :         int ret;
    4158             : 
    4159             :         /*
    4160             :          * If the inode is a free space inode, we can deadlock during commit
    4161             :          * if we put it into the delayed code.
    4162             :          *
    4163             :          * The data relocation inode should also be directly updated
    4164             :          * without delay
    4165             :          */
    4166    73321796 :         if (!btrfs_is_free_space_inode(inode)
    4167    36660750 :             && !btrfs_is_data_reloc_root(root)
    4168    36614699 :             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
    4169    36499830 :                 btrfs_update_root_times(trans, root);
    4170             : 
    4171    36501956 :                 ret = btrfs_delayed_update_inode(trans, root, inode);
    4172    36503012 :                 if (!ret)
    4173    36413987 :                         btrfs_set_inode_last_trans(trans, inode);
    4174    36502649 :                 return ret;
    4175             :         }
    4176             : 
    4177      161068 :         return btrfs_update_inode_item(trans, root, inode);
    4178             : }
    4179             : /*
                     :  * Update the inode through btrfs_update_inode() and, if that fails with
                     :  * -ENOSPC, fall back to writing the inode item directly with
                     :  * btrfs_update_inode_item().
                     :  *
                     :  * Returns the result of the fallback when it is taken, otherwise the
                     :  * result of btrfs_update_inode().
                     :  */
    4180     3505070 : int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
    4181             :                                 struct btrfs_root *root, struct btrfs_inode *inode)
    4182             : {
    4183     3505070 :         int ret;
    4184             : 
    4185     3505070 :         ret = btrfs_update_inode(trans, root, inode);
    4186     3505209 :         if (ret == -ENOSPC)
    4187           1 :                 return btrfs_update_inode_item(trans, root, inode);
    4188             :         return ret;
    4189             : }
    4190             : 
    4191             : /*
    4192             :  * Unlink helper that gets used here in inode.c and in the tree logging
    4193             :  * recovery code.  It removes a link in a directory with a given name, and
    4194             :  * also drops the back refs in the inode to the directory.
                     :  *
                     :  * @trans:      transaction handle
                     :  * @dir:        directory the name is removed from; dir->root is the root
                     :  *              used for the tree operations in this function
                     :  * @inode:      inode the name refers to
                     :  * @name:       name being removed
                     :  * @rename_ctx: optional; when non-NULL the dir index of the removed entry
                     :  *              is stored in rename_ctx->index and the log tree updates
                     :  *              are skipped
                     :  *
                     :  * On success the directory's i_size is reduced by name->len * 2, the
                     :  * iversion of both inodes is incremented, the inode's ctime and the
                     :  * directory's mtime/ctime are set to the current time and the directory
                     :  * is written back with btrfs_update_inode().  The link count of @inode is
                     :  * not changed here.
                     :  *
                     :  * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
                     :  * the dir item lookup returns NULL, or the error of the step that failed.
    4195             :  */
    4196     1885311 : static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
    4197             :                                 struct btrfs_inode *dir,
    4198             :                                 struct btrfs_inode *inode,
    4199             :                                 const struct fscrypt_str *name,
    4200             :                                 struct btrfs_rename_ctx *rename_ctx)
    4201             : {
    4202     1885311 :         struct btrfs_root *root = dir->root;
    4203     1885311 :         struct btrfs_fs_info *fs_info = root->fs_info;
    4204     1885311 :         struct btrfs_path *path;
    4205     1885311 :         int ret = 0;
    4206     1885311 :         struct btrfs_dir_item *di;
    4207     1885311 :         u64 index;
    4208     1885311 :         u64 ino = btrfs_ino(inode);
    4209     1885311 :         u64 dir_ino = btrfs_ino(dir);
    4210             : 
    4211     1885311 :         path = btrfs_alloc_path();
    4212     1885374 :         if (!path) {
    4213           0 :                 ret = -ENOMEM;
    4214           0 :                 goto out;
    4215             :         }
    4216             : 
    4217     1885374 :         di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
    4218     3770822 :         if (IS_ERR_OR_NULL(di)) {
    4219           0 :                 ret = di ? PTR_ERR(di) : -ENOENT;
    4220           0 :                 goto err;
    4221             :         }
    4222     1885411 :         ret = btrfs_delete_one_dir_name(trans, root, path, di);
    4223     1885406 :         if (ret)
    4224           0 :                 goto err;
    4225     1885406 :         btrfs_release_path(path);
    4226             : 
    4227             :         /*
    4228             :          * If we don't have dir index, we have to get it by looking up
    4229             :          * the inode ref, since we get the inode ref, remove it directly,
    4230             :          * it is unnecessary to do delayed deletion.
    4231             :          *
    4232             :          * But if we have dir index, needn't search inode ref to get it.
    4233             :          * Since the inode ref is close to the inode item, it is better
    4234             :          * that we delay to delete it, and just do this deletion when
    4235             :          * we update the inode item.
    4236             :          */
    4237     1885410 :         if (inode->dir_index) {
    4238     1337230 :                 ret = btrfs_delayed_delete_inode_ref(inode);
    4239     1337234 :                 if (!ret) {
    4240     1337227 :                         index = inode->dir_index;
    4241     1337227 :                         goto skip_backref;
    4242             :                 }
    4243             :         }
    4244             :         /* No dir index known, or the delayed deletion failed: delete the ref now, which also yields the index. */
    4245      548187 :         ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
    4246      548185 :         if (ret) {
    4247           0 :                 btrfs_info(fs_info,
    4248             :                         "failed to delete reference to %.*s, inode %llu parent %llu",
    4249             :                         name->len, name->name, ino, dir_ino);
    4250           0 :                 btrfs_abort_transaction(trans, ret);
    4251           0 :                 goto err;
    4252             :         }
    4253      548185 : skip_backref:
    4254     1885412 :         if (rename_ctx)
    4255      462853 :                 rename_ctx->index = index;
    4256             : 
    4257     1885412 :         ret = btrfs_delete_delayed_dir_index(trans, dir, index);
    4258     1885411 :         if (ret) {
    4259           0 :                 btrfs_abort_transaction(trans, ret);
    4260           0 :                 goto err;
    4261             :         }
    4262             : 
    4263             :         /*
    4264             :          * If we are in a rename context, we don't need to update anything in the
    4265             :          * log. That will be done later during the rename by btrfs_log_new_name().
    4266             :          * Besides that, doing it here would only cause extra unnecessary btree
    4267             :          * operations on the log tree, increasing latency for applications.
    4268             :          */
    4269     1885411 :         if (!rename_ctx) {
    4270     1422558 :                 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
    4271     1422530 :                 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
    4272             :         }
    4273             : 
    4274             :         /*
    4275             :          * If we have a pending delayed iput we could end up with the final iput
    4276             :          * being run in btrfs-cleaner context.  If we have enough of these built
    4277             :          * up we can end up burning a lot of time in btrfs-cleaner without any
    4278             :          * way to throttle the unlinks.  Since we're currently holding a ref on
    4279             :          * the inode we can run the delayed iput here without any issues as the
    4280             :          * final iput won't be done until after we drop the ref we're currently
    4281             :          * holding.
    4282             :          */
    4283     1885387 :         btrfs_run_delayed_iput(fs_info, inode);
    4284     1885383 : err:
    4285     1885383 :         btrfs_free_path(path);
    4286     1885375 :         if (ret)
    4287           0 :                 goto out;
    4288             :         /* Success only from here on: update the directory's size, version and times, then write it back. */
    4289     1885375 :         btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
    4290     1885375 :         inode_inc_iversion(&inode->vfs_inode);
    4291     1885377 :         inode_inc_iversion(&dir->vfs_inode);
    4292     1885386 :         inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
    4293     1885354 :         dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
    4294     1885354 :         dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
    4295     1885354 :         ret = btrfs_update_inode(trans, root, dir);
    4296     1885395 : out:
    4297     1885395 :         return ret;
    4298             : }
    4299             : /*
                     :  * Remove the link @name to @inode from @dir and drop one link from the
                     :  * inode.
                     :  *
                     :  * Calls __btrfs_unlink_inode() without a rename context; when that
                     :  * succeeds the inode's link count is dropped with drop_nlink() and the
                     :  * inode is updated in inode->root.
                     :  *
                     :  * Returns the error from __btrfs_unlink_inode() if it failed, otherwise
                     :  * the result of btrfs_update_inode().
                     :  */
    4300     1422528 : int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
    4301             :                        struct btrfs_inode *dir, struct btrfs_inode *inode,
    4302             :                        const struct fscrypt_str *name)
    4303             : {
    4304     1422528 :         int ret;
    4305             : 
    4306     1422528 :         ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
    4307     1422545 :         if (!ret) {
    4308     1422543 :                 drop_nlink(&inode->vfs_inode);
    4309     1422544 :                 ret = btrfs_update_inode(trans, inode->root, inode);
    4310             :         }
    4311     1422557 :         return ret;
    4312             : }
    4313             : 
    4314             : /*
    4315             :  * Helper to start a transaction for unlink and rmdir.
    4316             :  *
    4317             :  * Unlink and rmdir are special in btrfs, they do not always free space, so
    4318             :  * if we cannot make our reservations the normal way, try and see if there is
    4319             :  * plenty of slack room in the global reserve to migrate, otherwise we cannot
    4320             :  * allow the unlink to occur.
                     :  *
                     :  * The transaction is started on dir->root with BTRFS_UNLINK_METADATA_UNITS
                     :  * through btrfs_start_transaction_fallback_global_rsv() and its result is
                     :  * returned unchanged; btrfs_unlink() checks that result with IS_ERR().
    4321             :  */
    4322             : static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
    4323             : {
    4324     1383454 :         struct btrfs_root *root = dir->root;
    4325             : 
    4326     1383454 :         return btrfs_start_transaction_fallback_global_rsv(root,
    4327             :                                                    BTRFS_UNLINK_METADATA_UNITS);
    4328             : }
    4329             : /*
                     :  * Unlink the inode behind @dentry from the directory @dir.
                     :  *
                     :  * Sets up the fscrypt filename for the dentry name, starts an unlink
                     :  * transaction, records the unlink with btrfs_record_unlink_dir() and
                     :  * removes the link with btrfs_unlink_inode().  If the inode's link count
                     :  * is 0 afterwards, btrfs_orphan_add() is called for it.  The transaction
                     :  * is ended and btrfs_btree_balance_dirty() is called whether or not the
                     :  * unlink succeeded; the filename is freed on every path that set it up.
                     :  *
                     :  * Returns 0 on success or the error of the step that failed.
                     :  */
    4330     1339419 : static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
    4331             : {
    4332     1339419 :         struct btrfs_trans_handle *trans;
    4333     1339419 :         struct inode *inode = d_inode(dentry);
    4334     1339419 :         int ret;
    4335     1339419 :         struct fscrypt_name fname;
    4336             : 
    4337     1339419 :         ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
    4338     1339238 :         if (ret)
    4339             :                 return ret;
    4340             : 
    4341             :         /* This needs to handle no-key deletions later on */
    4342             : 
    4343     1339282 :         trans = __unlink_start_trans(BTRFS_I(dir));
    4344     1339663 :         if (IS_ERR(trans)) {
    4345           0 :                 ret = PTR_ERR(trans);
    4346           0 :                 goto fscrypt_free;
    4347             :         }
    4348             : 
    4349     1339663 :         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
    4350             :                                 false);
    4351             : 
    4352     1339662 :         ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
    4353             :                                  &fname.disk_name);
    4354     1339679 :         if (ret)
    4355           0 :                 goto end_trans;
    4356             : 
    4357     1339679 :         if (inode->i_nlink == 0) {
    4358     1288920 :                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
    4359     1288925 :                 if (ret)
    4360           0 :                         goto end_trans;
    4361             :         }
    4362             : 
    4363     1339684 : end_trans:
    4364     1339684 :         btrfs_end_transaction(trans);
    4365     1339684 :         btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
    4366             : fscrypt_free:
    4367             :         fscrypt_free_filename(&fname);
    4368             :         return ret;
    4369             : }
    4370             : /*
                     :  * Remove the directory entry @dentry, which refers to a subvolume, from
                     :  * @dir.
                     :  *
                     :  * The inode behind the dentry must have ino BTRFS_FIRST_FREE_OBJECTID (the
                     :  * target objectid is then inode->root's objectid) or ino
                     :  * BTRFS_EMPTY_SUBVOL_DIR_OBJECTID (the target objectid is then
                     :  * inode->location.objectid); anything else triggers a WARN and -EINVAL.
                     :  *
                     :  * Deletes the dir item, determines the dir index (from the dir index item
                     :  * for the placeholder case, from btrfs_del_root_ref() otherwise), deletes
                     :  * the delayed dir index, shrinks the directory's i_size by twice the name
                     :  * length and updates the directory inode.
                     :  *
                     :  * A failed path allocation or dir item lookup returns without aborting the
                     :  * transaction; every later failure aborts it.
                     :  *
                     :  * NOTE(review): on path allocation failure the out label calls
                     :  * btrfs_free_path() with a NULL path - presumably that is tolerated, confirm.
                     :  *
                     :  * Returns 0 on success or a negative errno.
                     :  */
    4371         178 : static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
    4372             :                                struct btrfs_inode *dir, struct dentry *dentry)
    4373             : {
    4374         178 :         struct btrfs_root *root = dir->root;
    4375         178 :         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
    4376         178 :         struct btrfs_path *path;
    4377         178 :         struct extent_buffer *leaf;
    4378         178 :         struct btrfs_dir_item *di;
    4379         178 :         struct btrfs_key key;
    4380         178 :         u64 index;
    4381         178 :         int ret;
    4382         178 :         u64 objectid;
    4383         178 :         u64 dir_ino = btrfs_ino(dir);
    4384         178 :         struct fscrypt_name fname;
    4385             : 
    4386         178 :         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
    4387         178 :         if (ret)
    4388             :                 return ret;
    4389             : 
    4390             :         /* This needs to handle no-key deletions later on */
    4391             : 
    4392         178 :         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
    4393         174 :                 objectid = inode->root->root_key.objectid;
    4394           4 :         } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
    4395           4 :                 objectid = inode->location.objectid;
    4396             :         } else {
    4397           0 :                 WARN_ON(1);
    4398           0 :                 fscrypt_free_filename(&fname);
    4399           0 :                 return -EINVAL;
    4400             :         }
    4401             : 
    4402         178 :         path = btrfs_alloc_path();
    4403         178 :         if (!path) {
    4404           0 :                 ret = -ENOMEM;
    4405           0 :                 goto out;
    4406             :         }
    4407             : 
    4408         178 :         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
    4409             :                                    &fname.disk_name, -1);
    4410         356 :         if (IS_ERR_OR_NULL(di)) {
    4411           0 :                 ret = di ? PTR_ERR(di) : -ENOENT;
    4412           0 :                 goto out;
    4413             :         }
    4414             : 
    4415         178 :         leaf = path->nodes[0];
    4416         178 :         btrfs_dir_item_key_to_cpu(leaf, di, &key);
    4417         356 :         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
    4418         178 :         ret = btrfs_delete_one_dir_name(trans, root, path, di);
    4419         178 :         if (ret) {
    4420           0 :                 btrfs_abort_transaction(trans, ret);
    4421           0 :                 goto out;
    4422             :         }
    4423         178 :         btrfs_release_path(path);
    4424             : 
    4425             :         /*
    4426             :          * This is a placeholder inode for a subvolume we didn't have a
    4427             :          * reference to at the time of the snapshot creation.  In the meantime
    4428             :          * we could have renamed the real subvol link into our snapshot, so
    4429             :          * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
    4430             :          * Instead simply lookup the dir_index_item for this entry so we can
    4431             :          * remove it.  Otherwise we know we have a ref to the root and we can
    4432             :          * call btrfs_del_root_ref, and it _shouldn't_ fail.
    4433             :          */
    4434         178 :         if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
    4435           4 :                 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
    4436           8 :                 if (IS_ERR_OR_NULL(di)) {
    4437           0 :                         if (!di)
    4438             :                                 ret = -ENOENT;
    4439             :                         else
    4440           0 :                                 ret = PTR_ERR(di);
    4441           0 :                         btrfs_abort_transaction(trans, ret);
    4442           0 :                         goto out;
    4443             :                 }
    4444             : 
    4445           4 :                 leaf = path->nodes[0];
    4446           4 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    4447           4 :                 index = key.offset;
    4448           4 :                 btrfs_release_path(path);
    4449             :         } else {
    4450         174 :                 ret = btrfs_del_root_ref(trans, objectid,
    4451             :                                          root->root_key.objectid, dir_ino,
    4452             :                                          &index, &fname.disk_name);
    4453         174 :                 if (ret) {
    4454           0 :                         btrfs_abort_transaction(trans, ret);
    4455           0 :                         goto out;
    4456             :                 }
    4457             :         }
    4458             : 
    4459         178 :         ret = btrfs_delete_delayed_dir_index(trans, dir, index);
    4460         178 :         if (ret) {
    4461           0 :                 btrfs_abort_transaction(trans, ret);
    4462           0 :                 goto out;
    4463             :         }
    4464             :         /* Update the directory's size, version and times, then write it back. */
    4465         178 :         btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
    4466         178 :         inode_inc_iversion(&dir->vfs_inode);
    4467         178 :         dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
    4468         178 :         dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
    4469         178 :         ret = btrfs_update_inode_fallback(trans, root, dir);
    4470         178 :         if (ret)
    4471           0 :                 btrfs_abort_transaction(trans, ret);
    4472         178 : out:
    4473         178 :         btrfs_free_path(path);
    4474         178 :         fscrypt_free_filename(&fname);
    4475         178 :         return ret;
    4476             : }
    4477             : 
    4478             : /*
    4479             :  * Helper to check if the subvolume references other subvolumes or if it's
    4480             :  * the default subvolume.
                     :  *
                     :  * The default check looks up the "default" dir item under the super
                     :  * block's root dir id in the tree root and compares its key objectid with
                     :  * this root's objectid.  The reference check searches the tree root for
                     :  * the key (root objectid, BTRFS_ROOT_REF_KEY, (u64)-1) and inspects the
                     :  * item in the slot just before the position the search returned.
                     :  *
                     :  * Returns 0 if neither condition holds, -EPERM if @root is the default
                     :  * subvolume, -ENOTEMPTY if a BTRFS_ROOT_REF_KEY item with this root's
                     :  * objectid is found, -ENOMEM if no path could be allocated, or the
                     :  * negative value returned by btrfs_search_slot().
                     :  *
                     :  * NOTE(review): an exact match of the search key hits BUG_ON(ret == 0).
                     :  * If such a key can exist on a corrupted filesystem, returning an error
                     :  * would be preferable to crashing - confirm.
    4481             :  */
    4482         169 : static noinline int may_destroy_subvol(struct btrfs_root *root)
    4483             : {
    4484         169 :         struct btrfs_fs_info *fs_info = root->fs_info;
    4485         169 :         struct btrfs_path *path;
    4486         169 :         struct btrfs_dir_item *di;
    4487         169 :         struct btrfs_key key;
    4488         169 :         struct fscrypt_str name = FSTR_INIT("default", 7);
    4489         169 :         u64 dir_id;
    4490         169 :         int ret;
    4491             : 
    4492         169 :         path = btrfs_alloc_path();
    4493         169 :         if (!path)
    4494             :                 return -ENOMEM;
    4495             : 
    4496             :         /* Make sure this root isn't set as the default subvol */
    4497         169 :         dir_id = btrfs_super_root_dir(fs_info->super_copy);
    4498         169 :         di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
    4499             :                                    dir_id, &name, 0);
    4500         169 :         if (di && !IS_ERR(di)) {
    4501         169 :                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
    4502         169 :                 if (key.objectid == root->root_key.objectid) {
    4503           1 :                         ret = -EPERM;
    4504           1 :                         btrfs_err(fs_info,
    4505             :                                   "deleting default subvolume %llu is not allowed",
    4506             :                                   key.objectid);
    4507           1 :                         goto out;
    4508             :                 }
    4509         168 :                 btrfs_release_path(path);
    4510             :         }
    4511             :         /* Search past the last possible ROOT_REF key of this root, then step back one slot. */
    4512         168 :         key.objectid = root->root_key.objectid;
    4513         168 :         key.type = BTRFS_ROOT_REF_KEY;
    4514         168 :         key.offset = (u64)-1;
    4515             : 
    4516         168 :         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
    4517         168 :         if (ret < 0)
    4518           0 :                 goto out;
    4519         168 :         BUG_ON(ret == 0);
    4520             : 
    4521         168 :         ret = 0;
    4522         168 :         if (path->slots[0] > 0) {
    4523         168 :                 path->slots[0]--;
    4524         168 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
    4525         168 :                 if (key.objectid == root->root_key.objectid &&
    4526         168 :                     key.type == BTRFS_ROOT_REF_KEY)
    4527           0 :                         ret = -ENOTEMPTY;
    4528             :         }
    4529         168 : out:
    4530         169 :         btrfs_free_path(path);
    4531         169 :         return ret;
    4532             : }
    4533             : 
    4534             : /*
                     :  * Delete all dentries for inodes belonging to the root.
                     :  *
                     :  * Warns if the root item still has a non-zero reference count, unless the
                     :  * filesystem is in an error state.
                     :  *
                     :  * The walk over root->inode_tree happens under root->inode_lock and visits
                     :  * inodes in increasing inode number order: each round looks for the first
                     :  * inode whose number is >= objectid.  For every inode that igrab() can
                     :  * pin, objectid is advanced past it, the lock is dropped,
                     :  * d_prune_aliases() is called if i_count is above 1, the reference is put
                     :  * and the search is restarted from the top after retaking the lock
                     :  * (presumably because the tree may have changed while unlocked).  Inodes
                     :  * that cannot be pinned are skipped with rb_next(), restarting only if
                     :  * cond_resched_lock() reports that it rescheduled.
                     :  */
    4535         168 : static void btrfs_prune_dentries(struct btrfs_root *root)
    4536             : {
    4537         168 :         struct btrfs_fs_info *fs_info = root->fs_info;
    4538         168 :         struct rb_node *node;
    4539         168 :         struct rb_node *prev;
    4540         168 :         struct btrfs_inode *entry;
    4541         168 :         struct inode *inode;
    4542         168 :         u64 objectid = 0;
    4543             : 
    4544         168 :         if (!BTRFS_FS_ERROR(fs_info))
    4545         168 :                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
    4546             : 
    4547         168 :         spin_lock(&root->inode_lock);
    4548             : again:
    4549         340 :         node = root->inode_tree.rb_node;
    4550         340 :         prev = NULL;
    4551         683 :         while (node) {
    4552         346 :                 prev = node;
    4553         346 :                 entry = rb_entry(node, struct btrfs_inode, rb_node);
    4554             : 
    4555         346 :                 if (objectid < btrfs_ino(entry))
    4556         170 :                         node = node->rb_left;
    4557         176 :                 else if (objectid > btrfs_ino(entry))
    4558         173 :                         node = node->rb_right;
    4559             :                 else
    4560             :                         break;
    4561             :         }
    4562         340 :         if (!node) {
    4563         506 :                 while (prev) {
    4564         338 :                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
    4565         338 :                         if (objectid <= btrfs_ino(entry)) {
    4566             :                                 node = prev;
    4567             :                                 break;
    4568             :                         }
    4569         169 :                         prev = rb_next(prev);
    4570             :                 }
    4571             :         }
    4572         340 :         while (node) {
    4573         172 :                 entry = rb_entry(node, struct btrfs_inode, rb_node);
    4574         172 :                 objectid = btrfs_ino(entry) + 1;
    4575         172 :                 inode = igrab(&entry->vfs_inode);
    4576         172 :                 if (inode) {
    4577         172 :                         spin_unlock(&root->inode_lock);
    4578         172 :                         if (atomic_read(&inode->i_count) > 1)
    4579         170 :                                 d_prune_aliases(inode);
    4580             :                         /*
    4581             :                          * btrfs_drop_inode will have it removed from the inode
    4582             :                          * cache when its usage count hits zero.
    4583             :                          */
    4584         172 :                         iput(inode);
    4585         172 :                         cond_resched();
    4586         172 :                         spin_lock(&root->inode_lock);
    4587         172 :                         goto again;
    4588             :                 }
    4589             : 
    4590           0 :                 if (cond_resched_lock(&root->inode_lock))
    4591           0 :                         goto again;
    4592             : 
    4593           0 :                 node = rb_next(node);
    4594             :         }
    4595         168 :         spin_unlock(&root->inode_lock);
    4596         168 : }
    4597             : /*
                     :  * Delete the subvolume that @dentry refers to.
                     :  *
                     :  * @dir:    parent directory; its root is used for the metadata reservation
                     :  *          and for the transaction.
                     :  * @dentry: dentry of the subvolume; the root being deleted ("dest") is the
                     :  *          root of the inode behind it.
                     :  *
                     :  * The destination root is first flagged BTRFS_ROOT_SUBVOL_DEAD under its
                     :  * root_item_lock.  Then, holding fs_info->subvol_sem for writing and inside
                     :  * a transaction, the subvolume is unlinked from @dir, the root item's drop
                     :  * progress, drop level and reference count are zeroed, an orphan item is
                     :  * inserted into the tree root (unless already flagged as inserted), the
                     :  * UUID tree entries are removed and the anonymous device is freed.
                     :  *
                     :  * On failure the SUBVOL_DEAD flag is cleared again; on success the dentry
                     :  * is invalidated and the dentries of the deleted root are pruned.
                     :  *
                     :  * Return: 0 on success, -EPERM if a send is in progress on the subvolume or
                     :  * it has active swapfiles, otherwise a negative errno.  Once the transaction
                     :  * has been started the value returned is that of btrfs_end_transaction().
                     :  */
    4598         170 : int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
    4599             : {
    4600         170 :         struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
    4601         170 :         struct btrfs_root *root = dir->root;
    4602         170 :         struct inode *inode = d_inode(dentry);
    4603         170 :         struct btrfs_root *dest = BTRFS_I(inode)->root;
    4604         170 :         struct btrfs_trans_handle *trans;
    4605         170 :         struct btrfs_block_rsv block_rsv;
    4606         170 :         u64 root_flags;
    4607         170 :         int ret;
    4608             : 
    4609             :         /*
    4610             :          * Don't allow to delete a subvolume with send in progress. This is
    4611             :          * inside the inode lock so the error handling that has to drop the bit
    4612             :          * again is not run concurrently.
    4613             :          */
    4614         170 :         spin_lock(&dest->root_item_lock);
    4615         170 :         if (dest->send_in_progress) {
    4616           0 :                 spin_unlock(&dest->root_item_lock);
    4617           0 :                 btrfs_warn(fs_info,
    4618             :                            "attempt to delete subvolume %llu during send",
    4619             :                            dest->root_key.objectid);
    4620           0 :                 return -EPERM;
    4621             :         }
    4622         170 :         if (atomic_read(&dest->nr_swapfiles)) {
    4623           1 :                 spin_unlock(&dest->root_item_lock);
    4624           1 :                 btrfs_warn(fs_info,
    4625             :                            "attempt to delete subvolume %llu with active swapfile",
    4626             :                            dest->root_key.objectid);
    4627           1 :                 return -EPERM;
    4628             :         }
    4629         169 :         root_flags = btrfs_root_flags(&dest->root_item);
    4630         169 :         btrfs_set_root_flags(&dest->root_item,
    4631             :                              root_flags | BTRFS_ROOT_SUBVOL_DEAD);
    4632         169 :         spin_unlock(&dest->root_item_lock);
    4633             : 
    4634         169 :         down_write(&fs_info->subvol_sem);
    4635             : 
    4636         169 :         ret = may_destroy_subvol(dest);
    4637         169 :         if (ret)
    4638           1 :                 goto out_up_write;
    4639             : 
    4640         168 :         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
    4641             :         /*
    4642             :          * One for dir inode,
    4643             :          * two for dir entries,
    4644             :          * two for root ref/backref.
    4645             :          */
    4646         168 :         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
    4647         168 :         if (ret)
    4648           0 :                 goto out_up_write;
    4649             : 
    4650         168 :         trans = btrfs_start_transaction(root, 0);
    4651         168 :         if (IS_ERR(trans)) {
    4652           0 :                 ret = PTR_ERR(trans);
    4653           0 :                 goto out_release;
    4654             :         }
    4655         168 :         trans->block_rsv = &block_rsv;
    4656         168 :         trans->bytes_reserved = block_rsv.size;
    4657             : 
    4658         168 :         btrfs_record_snapshot_destroy(trans, dir);
    4659             : 
    4660         168 :         ret = btrfs_unlink_subvol(trans, dir, dentry);
    4661         168 :         if (ret) {
    4662           0 :                 btrfs_abort_transaction(trans, ret);
    4663           0 :                 goto out_end_trans;
    4664             :         }
    4665             : 
    4666         168 :         ret = btrfs_record_root_in_trans(trans, dest);
    4667         168 :         if (ret) {
    4668           0 :                 btrfs_abort_transaction(trans, ret);
    4669           0 :                 goto out_end_trans;
    4670             :         }
    4671             :         /* Reset drop progress and level, and set the root item's refs to 0. */
    4672         168 :         memset(&dest->root_item.drop_progress, 0,
    4673             :                 sizeof(dest->root_item.drop_progress));
    4674         168 :         btrfs_set_root_drop_level(&dest->root_item, 0);
    4675         168 :         btrfs_set_root_refs(&dest->root_item, 0);
    4676             :         /* Insert the orphan item only if the state bit was not set before. */
    4677         168 :         if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
    4678         168 :                 ret = btrfs_insert_orphan_item(trans,
    4679             :                                         fs_info->tree_root,
    4680             :                                         dest->root_key.objectid);
    4681         168 :                 if (ret) {
    4682           0 :                         btrfs_abort_transaction(trans, ret);
    4683           0 :                         goto out_end_trans;
    4684             :                 }
    4685             :         }
    4686             :         /* A missing UUID tree entry (-ENOENT) is not treated as a failure. */
    4687         168 :         ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
    4688             :                                   BTRFS_UUID_KEY_SUBVOL,
    4689             :                                   dest->root_key.objectid);
    4690         168 :         if (ret && ret != -ENOENT) {
    4691           0 :                 btrfs_abort_transaction(trans, ret);
    4692           0 :                 goto out_end_trans;
    4693             :         }
    4694         168 :         if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
    4695           0 :                 ret = btrfs_uuid_tree_remove(trans,
    4696             :                                           dest->root_item.received_uuid,
    4697             :                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
    4698             :                                           dest->root_key.objectid);
    4699           0 :                 if (ret && ret != -ENOENT) {
    4700           0 :                         btrfs_abort_transaction(trans, ret);
    4701           0 :                         goto out_end_trans;
    4702             :                 }
    4703             :         }
    4704             : 
    4705         168 :         free_anon_bdev(dest->anon_dev);
    4706         168 :         dest->anon_dev = 0;
    4707         168 : out_end_trans:
    4708         168 :         trans->block_rsv = NULL;
    4709         168 :         trans->bytes_reserved = 0;
    4710         168 :         ret = btrfs_end_transaction(trans); /* replaces any earlier error code */
    4711         168 :         inode->i_flags |= S_DEAD; /* set whatever the value of ret is */
    4712         168 : out_release:
    4713         168 :         btrfs_subvolume_release_metadata(root, &block_rsv);
    4714         169 : out_up_write:
    4715         169 :         up_write(&fs_info->subvol_sem);
    4716         169 :         if (ret) { /* failed: take the SUBVOL_DEAD flag back off the root item */
    4717           1 :                 spin_lock(&dest->root_item_lock);
    4718           1 :                 root_flags = btrfs_root_flags(&dest->root_item);
    4719           1 :                 btrfs_set_root_flags(&dest->root_item,
    4720             :                                 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
    4721           1 :                 spin_unlock(&dest->root_item_lock);
    4722             :         } else {
    4723         168 :                 d_invalidate(dentry);
    4724         168 :                 btrfs_prune_dentries(dest);
    4725         168 :                 ASSERT(dest->send_in_progress == 0);
    4726             :         }
    4727             : 
    4728             :         return ret;
    4729             : }
    4730             : /*
                     :  * Remove the directory behind @dentry from its parent @dir.
                     :  *
                     :  * A directory whose i_size exceeds BTRFS_EMPTY_DIR_SIZE is refused with
                     :  * -ENOTEMPTY.  If its inode number is BTRFS_FIRST_FREE_OBJECTID the request
                     :  * is handed to btrfs_delete_subvolume(), or fails with -EOPNOTSUPP when the
                     :  * EXTENT_TREE_V2 incompat feature is set.  An inode numbered
                     :  * BTRFS_EMPTY_SUBVOL_DIR_OBJECTID is removed through btrfs_unlink_subvol().
                     :  * In every other case an orphan item is added for the directory, it is
                     :  * unlinked from @dir and its i_size is set to 0.
                     :  *
                     :  * Returns 0 on success or a negative errno.
                     :  */
    4731       62377 : static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
    4732             : {
    4733       62377 :         struct inode *inode = d_inode(dentry);
    4734       62377 :         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
    4735       62377 :         int err = 0;
    4736       62377 :         struct btrfs_trans_handle *trans;
    4737       62377 :         u64 last_unlink_trans;
    4738       62377 :         struct fscrypt_name fname;
    4739             : 
    4740       62377 :         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
    4741             :                 return -ENOTEMPTY;
    4742       44187 :         if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
    4743          12 :                 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
    4744           0 :                         btrfs_err(fs_info,
    4745             :                         "extent tree v2 doesn't support snapshot deletion yet");
    4746           0 :                         return -EOPNOTSUPP;
    4747             :                 }
    4748          12 :                 return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
    4749             :         }
    4750             : 
    4751       44175 :         err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
    4752       44172 :         if (err)
    4753             :                 return err;
    4754             : 
    4755             :         /* This needs to handle no-key deletions later on */
    4756             : 
    4757       44172 :         trans = __unlink_start_trans(BTRFS_I(dir));
    4758       44183 :         if (IS_ERR(trans)) {
    4759           0 :                 err = PTR_ERR(trans);
    4760           0 :                 goto out_notrans;
    4761             :         }
    4762             : 
    4763       44183 :         if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
    4764           4 :                 err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
    4765           4 :                 goto out;
    4766             :         }
    4767             : 
    4768       44179 :         err = btrfs_orphan_add(trans, BTRFS_I(inode));
    4769       44179 :         if (err)
    4770           0 :                 goto out;
    4771             :         /* Read before the unlink below; compared against trans->transid on success. */
    4772       44179 :         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
    4773             : 
    4774             :         /* now the directory is empty */
    4775       44179 :         err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
    4776             :                                  &fname.disk_name);
    4777       44179 :         if (!err) {
    4778       44179 :                 btrfs_i_size_write(BTRFS_I(inode), 0);
    4779             :                 /*
    4780             :                  * Propagate the last_unlink_trans value of the deleted dir to
    4781             :                  * its parent directory. This is to prevent an unrecoverable
    4782             :                  * log tree in the case we do something like this:
    4783             :                  * 1) create dir foo
    4784             :                  * 2) create snapshot under dir foo
    4785             :                  * 3) delete the snapshot
    4786             :                  * 4) rmdir foo
    4787             :                  * 5) mkdir foo
    4788             :                  * 6) fsync foo or some file inside foo
    4789             :                  */
    4790       44179 :                 if (last_unlink_trans >= trans->transid)
    4791        1478 :                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
    4792             :         }
    4793       42701 : out:
    4794       44183 :         btrfs_end_transaction(trans); /* result not folded into err */
    4795       44183 : out_notrans:
    4796       44183 :         btrfs_btree_balance_dirty(fs_info);
    4797       44183 :         fscrypt_free_filename(&fname);
    4798             : 
    4799       44183 :         return err;
    4800             : }
    4801             : 
    4802             : /*
    4803             :  * btrfs_truncate_block - read, zero a chunk and write a block
    4804             :  * @inode - inode that we're zeroing
    4805             :  * @from - the offset to start zeroing
    4806             :  * @len - the length to zero, 0 to zero the entire range respective to the
    4807             :  *      offset
    4808             :  * @front - zero up to the offset instead of from the offset on
    4809             :  *
    4810             :  * This will find the block for the "from" offset and cow the block and zero the
    4811             :  * part we want to zero.  This is used with truncate and hole punching.
                     :  *
                     :  * Nothing is done when @from is block aligned and @len is 0 or block
                     :  * aligned.  Data space is reserved for the block first; if that fails but
                     :  * btrfs_check_nocow_lock() returns a positive value, only metadata is
                     :  * reserved (only_release_metadata) and the block range is flagged
                     :  * EXTENT_NORESERVE once it has been dirtied.  Every error path gives back
                     :  * just what was reserved for the mode in use.
                     :  *
                     :  * Returns 0 on success or a negative errno.
    4812             :  */
    4813     4509845 : int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
    4814             :                          int front)
    4815             : {
    4816     4509845 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    4817     4509845 :         struct address_space *mapping = inode->vfs_inode.i_mapping;
    4818     4509845 :         struct extent_io_tree *io_tree = &inode->io_tree;
    4819     4509845 :         struct btrfs_ordered_extent *ordered;
    4820     4509845 :         struct extent_state *cached_state = NULL;
    4821     4509845 :         struct extent_changeset *data_reserved = NULL;
    4822     4509845 :         bool only_release_metadata = false;
    4823     4509845 :         u32 blocksize = fs_info->sectorsize;
    4824     4509845 :         pgoff_t index = from >> PAGE_SHIFT;
    4825     4509845 :         unsigned offset = from & (blocksize - 1);
    4826     4509845 :         struct page *page;
    4827     4509845 :         gfp_t mask = btrfs_alloc_write_mask(mapping);
    4828     4509845 :         size_t write_bytes = blocksize;
    4829     4509845 :         int ret = 0;
    4830     4509845 :         u64 block_start;
    4831     4509845 :         u64 block_end;
    4832             :         /* Nothing to zero when the in-block offset is 0 and len is 0 or block aligned. */
    4833     4509845 :         if (IS_ALIGNED(offset, blocksize) &&
    4834        3919 :             (!len || IS_ALIGNED(len, blocksize)))
    4835     2548287 :                 goto out;
    4836             : 
    4837     1961558 :         block_start = round_down(from, blocksize);
    4838     1961558 :         block_end = block_start + blocksize - 1;
    4839             : 
    4840     1961558 :         ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
    4841             :                                           blocksize, false);
    4842     1971713 :         if (ret < 0) {
    4843        1709 :                 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
    4844             :                         /* For nocow case, no need to reserve data space */
    4845             :                         only_release_metadata = true;
    4846             :                 } else {
    4847        1663 :                         goto out;
    4848             :                 }
    4849             :         }
    4850     1970052 :         ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
    4851     1965881 :         if (ret < 0) {
    4852         468 :                 if (!only_release_metadata)
    4853         468 :                         btrfs_free_reserved_data_space(inode, data_reserved,
    4854             :                                                        block_start, blocksize);
    4855         468 :                 goto out;
    4856             :         }
    4857     1965413 : again:
    4858     1965882 :         page = find_or_create_page(mapping, index, mask);
    4859     1968076 :         if (!page) {
                     :                 /*
                     :                  * No data space was reserved in the nocow case, so only the
                     :                  * metadata reservation may be given back there (this mirrors
                     :                  * the handling at out_unlock).
                     :                  */
                     :                 if (only_release_metadata)
                     :                         btrfs_delalloc_release_metadata(inode, blocksize, true);
                     :                 else
    4860           0 :                         btrfs_delalloc_release_space(inode, data_reserved,
    4861             :                                         block_start, blocksize, true);
    4862           0 :                 btrfs_delalloc_release_extents(inode, blocksize);
    4863           0 :                 ret = -ENOMEM;
    4864           0 :                 goto out;
    4865             :         }
    4866             :         /* Read the block in first if the page is not up to date. */
    4867     1968076 :         if (!PageUptodate(page)) {
    4868      113037 :                 ret = btrfs_read_folio(NULL, page_folio(page));
    4869      113035 :                 lock_page(page);
    4870      113036 :                 if (page->mapping != mapping) {
    4871           0 :                         unlock_page(page);
    4872           0 :                         put_page(page);
    4873           0 :                         goto again;
    4874             :                 }
    4875      113036 :                 if (!PageUptodate(page)) {
    4876           0 :                         ret = -EIO;
    4877           0 :                         goto out_unlock;
    4878             :                 }
    4879             :         }
    4880             : 
    4881             :         /*
    4882             :          * We unlock the page after the io is completed and then re-lock it
    4883             :          * above.  release_folio() could have come in between that and cleared
    4884             :          * PagePrivate(), but left the page in the mapping.  Set the page mapped
    4885             :          * here to make sure it's properly set for the subpage stuff.
    4886             :          */
    4887     1950173 :         ret = set_page_extent_mapped(page);
    4888     1947878 :         if (ret < 0)
    4889           0 :                 goto out_unlock;
    4890             : 
    4891     1947878 :         wait_on_page_writeback(page);
    4892             : 
    4893     1955016 :         lock_extent(io_tree, block_start, block_end, &cached_state);
    4894             :         /* An ordered extent at block_start: drop the locks, start it and retry. */
    4895     1945610 :         ordered = btrfs_lookup_ordered_extent(inode, block_start);
    4896     1960471 :         if (ordered) {
    4897         469 :                 unlock_extent(io_tree, block_start, block_end, &cached_state);
    4898         469 :                 unlock_page(page);
    4899         469 :                 put_page(page);
    4900         469 :                 btrfs_start_ordered_extent(ordered);
    4901         469 :                 btrfs_put_ordered_extent(ordered);
    4902         469 :                 goto again;
    4903             :         }
    4904             : 
    4905     1960002 :         clear_extent_bit(&inode->io_tree, block_start, block_end,
    4906             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
    4907             :                          &cached_state);
    4908             : 
    4909     1967759 :         ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
    4910             :                                         &cached_state);
    4911     1966150 :         if (ret) {
    4912           0 :                 unlock_extent(io_tree, block_start, block_end, &cached_state);
    4913           0 :                 goto out_unlock;
    4914             :         }
    4915             :         /* offset was masked with (blocksize - 1), so it is always below blocksize. */
    4916     1966150 :         if (offset != blocksize) {
    4917     1966150 :                 if (!len)
    4918     1957564 :                         len = blocksize - offset;
    4919     1966150 :                 if (front)
    4920      143327 :                         memzero_page(page, (block_start - page_offset(page)),
    4921             :                                      offset);
    4922             :                 else
    4923     1822823 :                         memzero_page(page, (block_start - page_offset(page)) + offset,
    4924             :                                      len);
    4925             :         }
    4926     1966494 :         btrfs_page_clear_checked(fs_info, page, block_start,
    4927     1966494 :                                  block_end + 1 - block_start);
    4928     1965202 :         btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
    4929     1956434 :         unlock_extent(io_tree, block_start, block_end, &cached_state);
    4930             : 
    4931     1966992 :         if (only_release_metadata)
    4932          48 :                 set_extent_bit(&inode->io_tree, block_start, block_end,
    4933             :                                EXTENT_NORESERVE, NULL);
    4934             : 
    4935     1966944 : out_unlock:
    4936     1966992 :         if (ret) {
    4937           0 :                 if (only_release_metadata)
    4938           0 :                         btrfs_delalloc_release_metadata(inode, blocksize, true);
    4939             :                 else
    4940           0 :                         btrfs_delalloc_release_space(inode, data_reserved,
    4941             :                                         block_start, blocksize, true);
    4942             :         }
    4943     1966992 :         btrfs_delalloc_release_extents(inode, blocksize);
    4944     1969858 :         unlock_page(page);
    4945     1969600 :         put_page(page);
    4946     1970037 : out:
    4947     4519987 :         if (only_release_metadata)
    4948          48 :                 btrfs_check_nocow_unlock(inode);
    4949     4519987 :         extent_changeset_free(data_reserved);
    4950     4519845 :         return ret;
    4951             : }
    4952             : /*
                     :  * Insert a hole file extent item for the range of @inode that starts at
                     :  * @offset and is @len bytes long, unless the NO_HOLES incompat feature is
                     :  * enabled, in which case nothing is done.
                     :  *
                     :  * The work runs in its own transaction on @root: existing extents in the
                     :  * range are dropped, the hole extent is inserted, and then the inode's byte
                     :  * accounting and the inode item are updated.  The transaction is aborted
                     :  * if dropping the extents or inserting the hole fails.
                     :  *
                     :  * Returns 0 on success (or when NO_HOLES is enabled) or a negative errno.
                     :  */
    4953     1666692 : static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
    4954             :                              u64 offset, u64 len)
    4955             : {
    4956     1666692 :         struct btrfs_fs_info *fs_info = root->fs_info;
    4957     1666692 :         struct btrfs_trans_handle *trans;
    4958     1666692 :         struct btrfs_drop_extents_args drop_args = { 0 };
    4959     1666692 :         int ret;
    4960             : 
    4961             :         /*
    4962             :          * If NO_HOLES is enabled, we don't need to do anything.
    4963             :          * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
    4964             :          * or btrfs_update_inode() will be called, which guarantee that the next
    4965             :          * fsync will know this inode was changed and needs to be logged.
    4966             :          */
    4967     1666692 :         if (btrfs_fs_incompat(fs_info, NO_HOLES))
    4968             :                 return 0;
    4969             : 
    4970             :         /*
    4971             :          * 1 - for the one we're dropping
    4972             :          * 1 - for the one we're adding
    4973             :          * 1 - for updating the inode.
    4974             :          */
    4975         115 :         trans = btrfs_start_transaction(root, 3);
    4976         115 :         if (IS_ERR(trans))
    4977           0 :                 return PTR_ERR(trans);
    4978             : 
    4979         115 :         drop_args.start = offset;
    4980         115 :         drop_args.end = offset + len;
    4981         115 :         drop_args.drop_cache = true;
    4982             : 
    4983         115 :         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
    4984         115 :         if (ret) {
    4985           0 :                 btrfs_abort_transaction(trans, ret);
    4986           0 :                 btrfs_end_transaction(trans);
    4987           0 :                 return ret;
    4988             :         }
    4989             : 
    4990         115 :         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
    4991         115 :         if (ret) {
    4992           0 :                 btrfs_abort_transaction(trans, ret);
    4993             :         } else {
    4994         115 :                 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
    4995         115 :                 btrfs_update_inode(trans, root, inode); /* NOTE(review): result is not checked */
    4996             :         }
    4997         115 :         btrfs_end_transaction(trans);
    4998         115 :         return ret;
    4999             : }
    5000             : 
    5001             : /*
    5002             :  * This function puts in dummy file extents for the area we're creating a hole
    5003             :  * for.  So if we are truncating this file to a larger size we need to insert
    5004             :  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
    5005             :  * the range between oldsize and size
                     :  *
                     :  * @inode:   inode being expanded
                     :  * @oldsize: size before the expansion; the hole starts at this value rounded
                     :  *           up to the sector size
                     :  * @size:    new size; the processed range ends at this value rounded up to
                     :  *           the sector size
                     :  *
                     :  * The tail of the block containing @oldsize is zeroed first.  Then, with
                     :  * the range locked and ordered extents flushed, each extent map in the
                     :  * range is visited: for anything not flagged PREALLOC a hole extent is
                     :  * inserted (see maybe_insert_hole()) and a hole extent map replaces the
                     :  * cached range; for every piece the file extent range is recorded on the
                     :  * inode.
                     :  *
                     :  * Returns 0 on success or a negative errno.
    5006             :  */
    5007     1644859 : int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
    5008             : {
    5009     1644859 :         struct btrfs_root *root = inode->root;
    5010     1644859 :         struct btrfs_fs_info *fs_info = root->fs_info;
    5011     1644859 :         struct extent_io_tree *io_tree = &inode->io_tree;
    5012     1644859 :         struct extent_map *em = NULL;
    5013     1644859 :         struct extent_state *cached_state = NULL;
    5014     1644859 :         u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
    5015     1644859 :         u64 block_end = ALIGN(size, fs_info->sectorsize);
    5016     1644859 :         u64 last_byte;
    5017     1644859 :         u64 cur_offset;
    5018     1644859 :         u64 hole_size;
    5019     1644859 :         int err = 0;
    5020             : 
    5021             :         /*
    5022             :          * If our size started in the middle of a block we need to zero out the
    5023             :          * rest of the block before we expand the i_size, otherwise we could
    5024             :          * expose stale data.
    5025             :          */
    5026     1644859 :         err = btrfs_truncate_block(inode, oldsize, 0, 0);
    5027     1656484 :         if (err)
    5028             :                 return err;
    5029             :         /* The new size does not reach past the sector-aligned old size: no hole to add. */
    5030     1654486 :         if (size <= hole_start)
    5031             :                 return 0;
    5032             : 
    5033     1651498 :         btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
    5034             :                                            &cached_state);
    5035     1651498 :         cur_offset = hole_start;
    5036     1699966 :         while (1) {
    5037     1699966 :                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
    5038             :                                       block_end - cur_offset);
    5039     1710290 :                 if (IS_ERR(em)) {
    5040           0 :                         err = PTR_ERR(em);
    5041           0 :                         em = NULL;
    5042           0 :                         break;
    5043             :                 }
    5044     1710290 :                 last_byte = min(extent_map_end(em), block_end);
    5045     1710290 :                 last_byte = ALIGN(last_byte, fs_info->sectorsize);
    5046     1710290 :                 hole_size = last_byte - cur_offset;
    5047             : 
    5048     1710290 :                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
    5049     1666942 :                         struct extent_map *hole_em;
    5050             : 
    5051     1666942 :                         err = maybe_insert_hole(root, inode, cur_offset,
    5052             :                                                 hole_size);
    5053     1668653 :                         if (err)
    5054             :                                 break;
    5055             : 
    5056     1668653 :                         err = btrfs_inode_set_file_extent_range(inode,
    5057             :                                                         cur_offset, hole_size);
    5058     1667215 :                         if (err)
    5059             :                                 break;
    5060             :                         /* Without a new map: drop the cached range, force a full sync, go on. */
    5061     1667215 :                         hole_em = alloc_extent_map();
    5062     1669158 :                         if (!hole_em) {
    5063           0 :                                 btrfs_drop_extent_map_range(inode, cur_offset,
    5064             :                                                     cur_offset + hole_size - 1,
    5065             :                                                     false);
    5066           0 :                                 btrfs_set_inode_full_sync(inode);
    5067           0 :                                 goto next;
    5068             :                         }
    5069     1669158 :                         hole_em->start = cur_offset;
    5070     1669158 :                         hole_em->len = hole_size;
    5071     1669158 :                         hole_em->orig_start = cur_offset;
    5072             : 
    5073     1669158 :                         hole_em->block_start = EXTENT_MAP_HOLE;
    5074     1669158 :                         hole_em->block_len = 0;
    5075     1669158 :                         hole_em->orig_block_len = 0;
    5076     1669158 :                         hole_em->ram_bytes = hole_size;
    5077     1669158 :                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
    5078     1669158 :                         hole_em->generation = fs_info->generation;
    5079             :                         /* NOTE(review): this err is overwritten if the loop runs again. */
    5080     1669158 :                         err = btrfs_replace_extent_map_range(inode, hole_em, true);
    5081     1671031 :                         free_extent_map(hole_em);
    5082             :                 } else {
    5083       43348 :                         err = btrfs_inode_set_file_extent_range(inode,
    5084             :                                                         cur_offset, hole_size);
    5085       43348 :                         if (err)
    5086             :                                 break;
    5087             :                 }
    5088       43348 : next:
    5089     1716121 :                 free_extent_map(em);
    5090     1715538 :                 em = NULL; /* so the free_extent_map() after the loop gets NULL */
    5091     1715538 :                 cur_offset = last_byte;
    5092     1715538 :                 if (cur_offset >= block_end)
    5093             :                         break;
    5094             :         }
    5095     1650864 :         free_extent_map(em);
    5096     1650130 :         unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
    5097     1650130 :         return err;
    5098             : }
    5099             : 
    5100      343554 : static int btrfs_setsize(struct inode *inode, struct iattr *attr)
    5101             : {
    5102      343554 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    5103      343554 :         struct btrfs_trans_handle *trans;
    5104      343554 :         loff_t oldsize = i_size_read(inode);
    5105      343554 :         loff_t newsize = attr->ia_size;
    5106      343554 :         int mask = attr->ia_valid;
    5107      343554 :         int ret;
    5108             : 
    5109             :         /*
    5110             :          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
    5111             :          * special case where we need to update the times despite not having
    5112             :          * these flags set.  For all other operations the VFS sets these flags
    5113             :          * explicitly if it wants a timestamp update.
    5114             :          */
    5115      343554 :         if (newsize != oldsize) {
    5116      283523 :                 inode_inc_iversion(inode);
    5117      283525 :                 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
    5118       21935 :                         inode->i_mtime = current_time(inode);
    5119       21935 :                         inode->i_ctime = inode->i_mtime;
    5120             :                 }
    5121             :         }
    5122             : 
    5123      343556 :         if (newsize > oldsize) {
    5124             :                 /*
    5125             :                  * Don't do an expanding truncate while snapshotting is ongoing.
    5126             :                  * This is to ensure the snapshot captures a fully consistent
    5127             :                  * state of this file - if the snapshot captures this expanding
    5128             :                  * truncation, it must capture all writes that happened before
    5129             :                  * this truncation.
    5130             :                  */
    5131       98363 :                 btrfs_drew_write_lock(&root->snapshot_lock);
    5132       98362 :                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
    5133       98363 :                 if (ret) {
    5134         122 :                         btrfs_drew_write_unlock(&root->snapshot_lock);
    5135         122 :                         return ret;
    5136             :                 }
    5137             : 
    5138       98241 :                 trans = btrfs_start_transaction(root, 1);
    5139       98242 :                 if (IS_ERR(trans)) {
    5140          17 :                         btrfs_drew_write_unlock(&root->snapshot_lock);
    5141          17 :                         return PTR_ERR(trans);
    5142             :                 }
    5143             : 
    5144       98225 :                 i_size_write(inode, newsize);
    5145       98225 :                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
    5146       98221 :                 pagecache_isize_extended(inode, oldsize, newsize);
    5147       98219 :                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    5148       98225 :                 btrfs_drew_write_unlock(&root->snapshot_lock);
    5149       98223 :                 btrfs_end_transaction(trans);
    5150             :         } else {
    5151      245193 :                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    5152             : 
    5153      245193 :                 if (btrfs_is_zoned(fs_info)) {
    5154           0 :                         ret = btrfs_wait_ordered_range(inode,
    5155           0 :                                         ALIGN(newsize, fs_info->sectorsize),
    5156             :                                         (u64)-1);
    5157           0 :                         if (ret)
    5158             :                                 return ret;
    5159             :                 }
    5160             : 
    5161             :                 /*
    5162             :                  * We're truncating a file that used to have good data down to
    5163             :                  * zero. Make sure any new writes to the file get on disk
    5164             :                  * on close.
    5165             :                  */
    5166      245193 :                 if (newsize == 0)
    5167       76361 :                         set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
    5168       76361 :                                 &BTRFS_I(inode)->runtime_flags);
    5169             : 
    5170      245203 :                 truncate_setsize(inode, newsize);
    5171             : 
    5172      245195 :                 inode_dio_wait(inode);
    5173             : 
    5174      245189 :                 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
    5175      245208 :                 if (ret && inode->i_nlink) {
    5176         126 :                         int err;
    5177             : 
    5178             :                         /*
    5179             :                          * Truncate failed, so fix up the in-memory size. We
    5180             :                          * adjusted disk_i_size down as we removed extents, so
    5181             :                          * wait for disk_i_size to be stable and then update the
    5182             :                          * in-memory size to match.
    5183             :                          */
    5184         126 :                         err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
    5185         126 :                         if (err)
    5186             :                                 return err;
    5187         126 :                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
    5188             :                 }
    5189             :         }
    5190             : 
    5191             :         return ret;
    5192             : }
    5193             : 
    5194     2447564 : static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
    5195             :                          struct iattr *attr)
    5196             : {
    5197     2447564 :         struct inode *inode = d_inode(dentry);
    5198     2447564 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    5199     2447564 :         int err;
    5200             : 
    5201     2447564 :         if (btrfs_root_readonly(root))
    5202             :                 return -EROFS;
    5203             : 
    5204     2447564 :         err = setattr_prepare(idmap, dentry, attr);
    5205     2442354 :         if (err)
    5206             :                 return err;
    5207             : 
    5208     2442318 :         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
    5209      343553 :                 err = btrfs_setsize(inode, attr);
    5210      343571 :                 if (err)
    5211             :                         return err;
    5212             :         }
    5213             : 
    5214     2442071 :         if (attr->ia_valid) {
    5215     2442341 :                 setattr_copy(idmap, inode, attr);
    5216     2440315 :                 inode_inc_iversion(inode);
    5217     2448157 :                 err = btrfs_dirty_inode(BTRFS_I(inode));
    5218             : 
    5219     2451308 :                 if (!err && attr->ia_valid & ATTR_MODE)
    5220       44079 :                         err = posix_acl_chmod(idmap, dentry, inode->i_mode);
    5221             :         }
    5222             : 
    5223             :         return err;
    5224             : }
    5225             : 
    5226             : /*
    5227             :  * While truncating the inode pages during eviction, we get the VFS
    5228             :  * calling btrfs_invalidate_folio() against each folio of the inode. This
    5229             :  * is slow because the calls to btrfs_invalidate_folio() result in a
    5230             :  * huge amount of calls to lock_extent() and clear_extent_bit(),
    5231             :  * which keep merging and splitting extent_state structures over and over,
    5232             :  * wasting lots of time.
    5233             :  *
    5234             :  * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
    5235             :  * skip all those expensive operations on a per folio basis and do only
    5236             :  * the ordered io finishing, while we release here the extent_map and
    5237             :  * extent_state structures, without the excessive merging and splitting.
    5238             :  */
    5239     3865328 : static void evict_inode_truncate_pages(struct inode *inode)
    5240             : {
    5241     3865328 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
    5242     3865328 :         struct rb_node *node;
    5243             : 
    5244     3865328 :         ASSERT(inode->i_state & I_FREEING);
    5245     3865328 :         truncate_inode_pages_final(&inode->i_data);
    5246             : 
    5247     3864553 :         btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
    5248             : 
    5249             :         /*
    5250             :          * Keep looping until we have no more ranges in the io tree.
    5251             :          * We can have ongoing bios started by readahead that have
    5252             :          * their endio callback (extent_io.c:end_bio_extent_readpage)
    5253             :          * still in progress (unlocked the pages in the bio but has not yet
    5254             :          * unlocked the ranges in the io tree). Therefore this means some
    5255             :          * ranges can still be locked and eviction started because before
    5256             :          * submitting those bios, which are executed by a separate task (work
    5257             :          * queue kthread), inode references (inode->i_count) were not taken
    5258             :          * (which would be dropped in the end io callback of each bio).
    5259             :          * Therefore here we effectively end up waiting for those bios and
    5260             :          * anyone else holding locked ranges without having bumped the inode's
    5261             :          * reference count - if we don't do it, when they access the inode's
    5262             :          * io_tree to unlock a range it may be too late, leading to a
    5263             :          * use-after-free issue.
    5264             :          */
    5265     3864672 :         spin_lock(&io_tree->lock);
    5266     3875583 :         while (!RB_EMPTY_ROOT(&io_tree->state)) {
    5267       10718 :                 struct extent_state *state;
    5268       10718 :                 struct extent_state *cached_state = NULL;
    5269       10718 :                 u64 start;
    5270       10718 :                 u64 end;
    5271       10718 :                 unsigned state_flags;
    5272             : 
    5273       10718 :                 node = rb_first(&io_tree->state);
    5274       10718 :                 state = rb_entry(node, struct extent_state, rb_node);
    5275       10718 :                 start = state->start;
    5276       10718 :                 end = state->end;
    5277       10718 :                 state_flags = state->state;
    5278       10718 :                 spin_unlock(&io_tree->lock);
    5279             : 
    5280       10718 :                 lock_extent(io_tree, start, end, &cached_state);
    5281             : 
    5282             :                 /*
    5283             :                  * If DELALLOC is still set, the extent didn't reach disk,
    5284             :                  * and its reserved space won't be freed by delayed_ref.
    5285             :                  * So we need to free its reserved space here.
    5286             :                  * (Refer to comment in btrfs_invalidate_folio, case 2)
    5287             :                  *
    5288             :                  * Note, end is the bytenr of last byte, so we need + 1 here.
    5289             :                  */
    5290       10718 :                 if (state_flags & EXTENT_DELALLOC)
    5291       10718 :                         btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
    5292       10718 :                                                end - start + 1);
    5293             : 
    5294       10718 :                 clear_extent_bit(io_tree, start, end,
    5295             :                                  EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
    5296             :                                  &cached_state);
    5297             : 
    5298       10718 :                 cond_resched();
    5299       10718 :                 spin_lock(&io_tree->lock);
    5300             :         }
    5301     3864865 :         spin_unlock(&io_tree->lock);
    5302     3865121 : }
    5303             : 
    5304     3551201 : static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
    5305             :                                                         struct btrfs_block_rsv *rsv)
    5306             : {
    5307     3551201 :         struct btrfs_fs_info *fs_info = root->fs_info;
    5308     3551201 :         struct btrfs_trans_handle *trans;
    5309     3551201 :         u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
    5310     3551201 :         int ret;
    5311             : 
    5312             :         /*
    5313             :          * Eviction should be taking place at some place safe because of our
    5314             :          * delayed iputs.  However the normal flushing code will run delayed
    5315             :          * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
    5316             :          *
    5317             :          * We reserve the delayed_refs_extra here again because we can't use
    5318             :          * btrfs_start_transaction(root, 0) for the same deadlocky reason as
    5319             :          * above.  We reserve our extra bit here because we generate a ton of
    5320             :          * delayed refs activity by truncating.
    5321             :          *
    5322             :          * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
    5323             :          * if we fail to make this reservation we can re-try without the
    5324             :          * delayed_refs_extra so we can make some forward progress.
    5325             :          */
    5326     3551201 :         ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
    5327             :                                      BTRFS_RESERVE_FLUSH_EVICT);
    5328     3551458 :         if (ret) {
    5329           0 :                 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
    5330             :                                              BTRFS_RESERVE_FLUSH_EVICT);
    5331           0 :                 if (ret) {
    5332           0 :                         btrfs_warn(fs_info,
    5333             :                                    "could not allocate space for delete; will truncate on mount");
    5334           0 :                         return ERR_PTR(-ENOSPC);
    5335             :                 }
    5336             :                 delayed_refs_extra = 0;
    5337             :         }
    5338             : 
    5339     3551458 :         trans = btrfs_join_transaction(root);
    5340     3551460 :         if (IS_ERR(trans))
    5341             :                 return trans;
    5342             : 
    5343     3551460 :         if (delayed_refs_extra) {
    5344     3551460 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    5345     3551460 :                 trans->bytes_reserved = delayed_refs_extra;
    5346     3551460 :                 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
    5347             :                                         delayed_refs_extra, true);
    5348             :         }
    5349             :         return trans;
    5350             : }
    5351             : 
    5352     3867049 : void btrfs_evict_inode(struct inode *inode)
    5353             : {
    5354     3867049 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    5355     3867049 :         struct btrfs_trans_handle *trans;
    5356     3867049 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    5357     3867049 :         struct btrfs_block_rsv *rsv = NULL;
    5358     3867049 :         int ret;
    5359             : 
    5360     3867049 :         trace_btrfs_inode_evict(inode);
    5361             : 
    5362     3867019 :         if (!root) {
    5363        1687 :                 fsverity_cleanup_inode(inode);
    5364        1687 :                 clear_inode(inode);
    5365        1687 :                 return;
    5366             :         }
    5367             : 
    5368     3865332 :         evict_inode_truncate_pages(inode);
    5369             : 
    5370     3865037 :         if (inode->i_nlink &&
    5371     2081879 :             ((btrfs_root_refs(&root->root_item) != 0 &&
    5372     2094227 :               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
    5373             :              btrfs_is_free_space_inode(BTRFS_I(inode))))
    5374     2078664 :                 goto out;
    5375             : 
    5376     1786373 :         if (is_bad_inode(inode))
    5377           0 :                 goto out;
    5378             : 
    5379     3571566 :         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
    5380           2 :                 goto out;
    5381             : 
    5382     1785781 :         if (inode->i_nlink > 0) {
    5383       12211 :                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
    5384             :                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
    5385       12211 :                 goto out;
    5386             :         }
    5387             : 
    5388             :         /*
    5389             :          * This makes sure the inode item in tree is uptodate and the space for
    5390             :          * the inode update is released.
    5391             :          */
    5392     1773570 :         ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
    5393     1774497 :         if (ret)
    5394           0 :                 goto out;
    5395             : 
    5396             :         /*
    5397             :          * This drops any pending insert or delete operations we have for this
    5398             :          * inode.  We could have a delayed dir index deletion queued up, but
    5399             :          * we're removing the inode completely so that'll be taken care of in
    5400             :          * the truncate.
    5401             :          */
    5402     1774497 :         btrfs_kill_delayed_inode_items(BTRFS_I(inode));
    5403             : 
    5404     1774486 :         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
    5405     1774350 :         if (!rsv)
    5406           0 :                 goto out;
    5407     1774350 :         rsv->size = btrfs_calc_metadata_size(fs_info, 1);
    5408     1774350 :         rsv->failfast = true;
    5409             : 
    5410     1774350 :         btrfs_i_size_write(BTRFS_I(inode), 0);
    5411             : 
    5412        2455 :         while (1) {
    5413     1776805 :                 struct btrfs_truncate_control control = {
    5414             :                         .inode = BTRFS_I(inode),
    5415             :                         .ino = btrfs_ino(BTRFS_I(inode)),
    5416             :                         .new_size = 0,
    5417             :                         .min_type = 0,
    5418             :                 };
    5419             : 
    5420     1776805 :                 trans = evict_refill_and_join(root, rsv);
    5421     1776951 :                 if (IS_ERR(trans))
    5422           0 :                         goto out;
    5423             : 
    5424     1776951 :                 trans->block_rsv = rsv;
    5425             : 
    5426     1776951 :                 ret = btrfs_truncate_inode_items(trans, root, &control);
    5427     1776945 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    5428     1776945 :                 btrfs_end_transaction(trans);
    5429             :                 /*
    5430             :                  * We have not added new delayed items for our inode after we
    5431             :                  * have flushed its delayed items, so no need to throttle on
    5432             :                  * delayed items. However we have modified extent buffers.
    5433             :                  */
    5434     1776951 :                 btrfs_btree_balance_dirty_nodelay(fs_info);
    5435     1776922 :                 if (ret && ret != -ENOSPC && ret != -EAGAIN)
    5436           0 :                         goto out;
    5437     1776922 :                 else if (!ret)
    5438             :                         break;
    5439             :         }
    5440             : 
    5441             :         /*
    5442             :          * Errors here aren't a big deal, it just means we leave orphan items in
    5443             :          * the tree. They will be cleaned up on the next mount. If the inode
    5444             :          * number gets reused, cleanup deletes the orphan item without doing
    5445             :          * anything, and unlink reuses the existing orphan item.
    5446             :          *
    5447             :          * If it turns out that we are dropping too many of these, we might want
    5448             :          * to add a mechanism for retrying these after a commit.
    5449             :          */
    5450     1774467 :         trans = evict_refill_and_join(root, rsv);
    5451     1774500 :         if (!IS_ERR(trans)) {
    5452     1774500 :                 trans->block_rsv = rsv;
    5453     1774500 :                 btrfs_orphan_del(trans, BTRFS_I(inode));
    5454     1774498 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    5455     1774498 :                 btrfs_end_transaction(trans);
    5456             :         }
    5457             : 
    5458           0 : out:
    5459     3865377 :         btrfs_free_block_rsv(fs_info, rsv);
    5460             :         /*
    5461             :          * If we didn't successfully delete, the orphan item will still be in
    5462             :          * the tree and we'll retry on the next mount. Again, we might also want
    5463             :          * to retry these periodically in the future.
    5464             :          */
    5465     3865378 :         btrfs_remove_delayed_node(BTRFS_I(inode));
    5466     3865376 :         fsverity_cleanup_inode(inode);
    5467     3865376 :         clear_inode(inode);
    5468             : }
    5469             : 
    5470             : /*
    5471             :  * Return the key found in the dir entry in the location pointer, fill @type
    5472             :  * with BTRFS_FT_*, and return 0.
    5473             :  *
    5474             :  * If no dir entries were found, returns -ENOENT.
    5475             :  * If a corrupted location is found in the dir entry, returns -EUCLEAN.
    5476             :  */
    5477     3423558 : static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
    5478             :                                struct btrfs_key *location, u8 *type)
    5479             : {
    5480     3423558 :         struct btrfs_dir_item *di;
    5481     3423558 :         struct btrfs_path *path;
    5482     3423558 :         struct btrfs_root *root = dir->root;
    5483     3423558 :         int ret = 0;
    5484     3423558 :         struct fscrypt_name fname;
    5485             : 
    5486     3423558 :         path = btrfs_alloc_path();
    5487     3426643 :         if (!path)
    5488             :                 return -ENOMEM;
    5489             : 
    5490     3426643 :         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
    5491     3423151 :         if (ret < 0)
    5492           0 :                 goto out;
    5493             :         /*
    5494             :          * fscrypt_setup_filename() should never return a positive value, but
    5495             :          * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
    5496             :          */
    5497     3423151 :         ASSERT(ret == 0);
    5498             : 
    5499             :         /* This needs to handle no-key deletions later on */
    5500             : 
    5501     3423151 :         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
    5502             :                                    &fname.disk_name, 0);
    5503     3680169 :         if (IS_ERR_OR_NULL(di)) {
    5504     3176193 :                 ret = di ? PTR_ERR(di) : -ENOENT;
    5505     3176193 :                 goto out;
    5506             :         }
    5507             : 
    5508      252377 :         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
    5509      252385 :         if (location->type != BTRFS_INODE_ITEM_KEY &&
    5510             :             location->type != BTRFS_ROOT_ITEM_KEY) {
    5511           0 :                 ret = -EUCLEAN;
    5512           0 :                 btrfs_warn(root->fs_info,
    5513             : "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
    5514             :                            __func__, fname.disk_name.name, btrfs_ino(dir),
    5515             :                            location->objectid, location->type, location->offset);
    5516             :         }
    5517      252385 :         if (!ret)
    5518      252385 :                 *type = btrfs_dir_ftype(path->nodes[0], di);
    5519           0 : out:
    5520     3428564 :         fscrypt_free_filename(&fname);
    5521     3428564 :         btrfs_free_path(path);
    5522     3428564 :         return ret;
    5523             : }
    5524             : 
    5525             : /*
    5526             :  * when we hit a tree root in a directory, the btrfs part of the inode
    5527             :  * needs to be changed to reflect the root directory of the tree root.  This
    5528             :  * is kind of like crossing a mount point.
    5529             :  */
    5530       48291 : static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
    5531             :                                     struct btrfs_inode *dir,
    5532             :                                     struct dentry *dentry,
    5533             :                                     struct btrfs_key *location,
    5534             :                                     struct btrfs_root **sub_root)
    5535             : {
    5536       48291 :         struct btrfs_path *path;
    5537       48291 :         struct btrfs_root *new_root;
    5538       48291 :         struct btrfs_root_ref *ref;
    5539       48291 :         struct extent_buffer *leaf;
    5540       48291 :         struct btrfs_key key;
    5541       48291 :         int ret;
    5542       48291 :         int err = 0;
    5543       48291 :         struct fscrypt_name fname;
    5544             : 
    5545       48291 :         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
    5546       48291 :         if (ret)
    5547             :                 return ret;
    5548             : 
    5549       48291 :         path = btrfs_alloc_path();
    5550       48291 :         if (!path) {
    5551           0 :                 err = -ENOMEM;
    5552           0 :                 goto out;
    5553             :         }
    5554             : 
    5555       48291 :         err = -ENOENT;
    5556       48291 :         key.objectid = dir->root->root_key.objectid;
    5557       48291 :         key.type = BTRFS_ROOT_REF_KEY;
    5558       48291 :         key.offset = location->objectid;
    5559             : 
    5560       48291 :         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
    5561       48291 :         if (ret) {
    5562       47235 :                 if (ret < 0)
    5563           0 :                         err = ret;
    5564       47235 :                 goto out;
    5565             :         }
    5566             : 
    5567        1056 :         leaf = path->nodes[0];
    5568        1056 :         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
    5569        1056 :         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
    5570        1053 :             btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
    5571           3 :                 goto out;
    5572             : 
    5573        1053 :         ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
    5574        1053 :                                    (unsigned long)(ref + 1), fname.disk_name.len);
    5575        1053 :         if (ret)
    5576           0 :                 goto out;
    5577             : 
    5578        1053 :         btrfs_release_path(path);
    5579             : 
    5580        1053 :         new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
    5581        1053 :         if (IS_ERR(new_root)) {
    5582           0 :                 err = PTR_ERR(new_root);
    5583           0 :                 goto out;
    5584             :         }
    5585             : 
    5586        1053 :         *sub_root = new_root;
    5587        1053 :         location->objectid = btrfs_root_dirid(&new_root->root_item);
    5588        1053 :         location->type = BTRFS_INODE_ITEM_KEY;
    5589        1053 :         location->offset = 0;
    5590        1053 :         err = 0;
    5591       48291 : out:
    5592       48291 :         btrfs_free_path(path);
    5593       48291 :         fscrypt_free_filename(&fname);
    5594       48291 :         return err;
    5595             : }
    5596             : 
    /*
     * Link @inode into the inode_tree of its root.
     *
     * The tree is an rb-tree ordered by btrfs_ino() and is walked and modified
     * with root->inode_lock held.  Inodes that are unhashed are left out.
     */
    5597     3809311 : static void inode_tree_add(struct btrfs_inode *inode)
    5598             : {
    5599     3809311 :         struct btrfs_root *root = inode->root;
    5600     3809311 :         struct btrfs_inode *entry;
    5601     3809311 :         struct rb_node **p;
    5602     3809311 :         struct rb_node *parent;
    5603     3809311 :         struct rb_node *new = &inode->rb_node;
    5604     3809311 :         u64 ino = btrfs_ino(inode);
    5605             : 
    5606     3809311 :         if (inode_unhashed(&inode->vfs_inode))
    5607             :                 return;
    5608     3809311 :         parent = NULL;
    5609     3809311 :         spin_lock(&root->inode_lock);
    5610     3809327 :         p = &root->inode_tree.rb_node;
                               /* Descend to the leaf slot where @ino belongs. */
    5611    98364813 :         while (*p) {
    5612    94555486 :                 parent = *p;
    5613    94555486 :                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
    5614             : 
    5615    94555486 :                 if (ino < btrfs_ino(entry))
    5616     2337978 :                         p = &parent->rb_left;
    5617    92217508 :                 else if (ino > btrfs_ino(entry))
    5618    92217508 :                         p = &parent->rb_right;
    5619             :                 else {
                                               /*
                                                * An entry with the same inode number is
                                                * already linked.  Warn unless that entry
                                                * has I_WILL_FREE or I_FREEING set, then put
                                                * the new node in its place and mark the old
                                                * node as no longer linked.
                                                */
    5620           0 :                         WARN_ON(!(entry->vfs_inode.i_state &
    5621             :                                   (I_WILL_FREE | I_FREEING)));
    5622           0 :                         rb_replace_node(parent, new, &root->inode_tree);
    5623           0 :                         RB_CLEAR_NODE(parent);
    5624           0 :                         spin_unlock(&root->inode_lock);
    5625           0 :                         return;
    5626             :                 }
    5627             :         }
    5628     3809327 :         rb_link_node(new, parent, p);
    5629     3809327 :         rb_insert_color(new, &root->inode_tree);
    5630     3809325 :         spin_unlock(&root->inode_lock);
    5631             : }
    5632             : 
    /*
     * Unlink @inode from the inode_tree of its root, if it is linked.
     *
     * When the removal leaves the tree empty and the root item has a reference
     * count of zero, the root is handed to btrfs_add_dead_root().
     */
    5633     3865228 : static void inode_tree_del(struct btrfs_inode *inode)
    5634             : {
    5635     3865228 :         struct btrfs_root *root = inode->root;
    5636     3865228 :         int empty = 0;
    5637             : 
    5638     3865228 :         spin_lock(&root->inode_lock);
    5639     3865380 :         if (!RB_EMPTY_NODE(&inode->rb_node)) {
    5640     3809329 :                 rb_erase(&inode->rb_node, &root->inode_tree);
    5641     3809329 :                 RB_CLEAR_NODE(&inode->rb_node);
    5642     3809329 :                 empty = RB_EMPTY_ROOT(&root->inode_tree);
    5643             :         }
    5644     3865380 :         spin_unlock(&root->inode_lock);
    5645             : 
    5646     3865379 :         if (empty && btrfs_root_refs(&root->root_item) == 0) {
                                       /*
                                        * The lock was dropped above, so sample the tree
                                        * again under the lock before acting on "empty".
                                        */
    5647         168 :                 spin_lock(&root->inode_lock);
    5648         168 :                 empty = RB_EMPTY_ROOT(&root->inode_tree);
    5649         168 :                 spin_unlock(&root->inode_lock);
    5650         168 :                 if (empty)
    5651         168 :                         btrfs_add_dead_root(root);
    5652             :         }
    5653     3865379 : }
    5654             : 
    5655             : 
    /*
     * Initialisation callback handed to iget5_locked() by btrfs_iget_locked().
     *
     * @inode: inode being set up
     * @p:     struct btrfs_iget_args carrying the inode number and root
     *
     * Records the inode number, builds the inode item key in ->location and
     * takes a root reference through btrfs_grab_root().  Always returns 0.
     */
    5656      559717 : static int btrfs_init_locked_inode(struct inode *inode, void *p)
    5657             : {
    5658      559717 :         struct btrfs_iget_args *args = p;
    5659             : 
    5660      559717 :         inode->i_ino = args->ino;
    5661      559717 :         BTRFS_I(inode)->location.objectid = args->ino;
    5662      559717 :         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
    5663      559717 :         BTRFS_I(inode)->location.offset = 0;
    5664      559717 :         BTRFS_I(inode)->root = btrfs_grab_root(args->root);
                               /* A non-NULL root must yield a non-NULL grabbed root. */
    5665      559717 :         BUG_ON(args->root && !BTRFS_I(inode)->root);
    5666             : 
                               /*
                                * Inodes that live in the tree root, other than the btree
                                * inode itself, are flagged as free space inodes.
                                */
    5667      559717 :         if (args->root && args->root == args->root->fs_info->tree_root &&
    5668         139 :             args->ino != BTRFS_BTREE_INODE_OBJECTID)
    5669         139 :                 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
    5670             :                         &BTRFS_I(inode)->runtime_flags);
    5671      559717 :         return 0;
    5672             : }
    5673             : 
    /*
     * Match callback used with iget5_locked() and insert_inode_locked4().
     *
     * @opaque is a struct btrfs_iget_args.  Returns non-zero only when both the
     * inode number and the root of @inode equal the ones in @opaque.
     */
    5674      825446 : static int btrfs_find_actor(struct inode *inode, void *opaque)
    5675             : {
    5676      825446 :         struct btrfs_iget_args *args = opaque;
    5677             : 
    5678      825446 :         return args->ino == BTRFS_I(inode)->location.objectid &&
    5679      392613 :                 args->root == BTRFS_I(inode)->root;
    5680             : }
    5681             : 
    /*
     * Look up or create the VFS inode for (@ino, @root) on super block @s.
     *
     * The hash comes from btrfs_inode_hash(); btrfs_find_actor() decides whether
     * a candidate matches and btrfs_init_locked_inode() sets up a new inode.
     * Returns whatever iget5_locked() returns.
     */
    5682      952259 : static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
    5683             :                                        struct btrfs_root *root)
    5684             : {
    5685      952259 :         struct inode *inode;
    5686      952259 :         struct btrfs_iget_args args;
    5687      952259 :         unsigned long hashval = btrfs_inode_hash(ino, root);
    5688             : 
    5689      952259 :         args.ino = ino;
    5690      952259 :         args.root = root;
    5691             : 
    5692      952259 :         inode = iget5_locked(s, hashval, btrfs_find_actor,
    5693             :                              btrfs_init_locked_inode,
    5694             :                              (void *)&args);
    5695      952325 :         return inode;
    5696             : }
    5697             : 
    5698             : /*
    5699             :  * Get an inode object given its inode number and corresponding root.
    5700             :  * Path can be preallocated to prevent recursing back to iget through
    5701             :  * allocator. NULL is also valid but may require an additional allocation
    5702             :  * later.
                     :  *
                     :  * An inode that comes back with I_NEW set is filled in by
                     :  * btrfs_read_locked_inode(), added to the root's inode tree and unlocked.
                     :  * An inode without I_NEW is returned as is.
                     :  *
                     :  * Returns the inode, ERR_PTR(-ENOMEM) when no inode could be obtained, or
                     :  * an ERR_PTR() carrying the read failure (a positive result from the read
                     :  * is reported as -ENOENT).
    5703             :  */
    5704      952268 : struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
    5705             :                               struct btrfs_root *root, struct btrfs_path *path)
    5706             : {
    5707      952268 :         struct inode *inode;
    5708             : 
    5709      952268 :         inode = btrfs_iget_locked(s, ino, root);
    5710      952325 :         if (!inode)
    5711             :                 return ERR_PTR(-ENOMEM);
    5712             : 
    5713      952325 :         if (inode->i_state & I_NEW) {
    5714      559717 :                 int ret;
    5715             : 
    5716      559717 :                 ret = btrfs_read_locked_inode(inode, path);
    5717      559708 :                 if (!ret) {
    5718      554137 :                         inode_tree_add(BTRFS_I(inode));
    5719      554140 :                         unlock_new_inode(inode);
    5720             :                 } else {
    5721        5571 :                         iget_failed(inode);
    5722             :                         /*
    5723             :                          * ret > 0 can come from btrfs_search_slot called by
    5724             :                          * btrfs_read_locked_inode, this means the inode item
    5725             :                          * was not found.
    5726             :                          */
    5727        5571 :                         if (ret > 0)
    5728        5571 :                                 ret = -ENOENT;
    5729        5571 :                         inode = ERR_PTR(ret);
    5730             :                 }
    5731             :         }
    5732             : 
    5733             :         return inode;
    5734             : }
    5735             : 
    /*
     * Convenience form of btrfs_iget_path() for callers that have no
     * preallocated path; it passes NULL for the path.
     */
    5736      746737 : struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
    5737             : {
    5738      746875 :         return btrfs_iget_path(s, ino, root, NULL);
    5739             : }
    5740             : 
    /*
     * Build an in-memory placeholder directory inode.
     *
     * @s:    super block to allocate the inode from
     * @key:  key copied verbatim into the inode's ->location
     * @root: root the inode is attached to (a reference is grabbed)
     *
     * The inode is flagged BTRFS_INODE_DUMMY, gets the inode number
     * BTRFS_EMPTY_SUBVOL_DIR_OBJECTID, uses the simple_dir operations and has
     * all of its timestamps set to the current time.
     *
     * Returns the new inode or ERR_PTR(-ENOMEM) if new_inode() fails.
     */
    5741       47238 : static struct inode *new_simple_dir(struct super_block *s,
    5742             :                                     struct btrfs_key *key,
    5743             :                                     struct btrfs_root *root)
    5744             : {
    5745       47238 :         struct inode *inode = new_inode(s);
    5746             : 
    5747       47238 :         if (!inode)
    5748             :                 return ERR_PTR(-ENOMEM);
    5749             : 
    5750       47238 :         BTRFS_I(inode)->root = btrfs_grab_root(root);
    5751       47238 :         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
    5752       47238 :         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
    5753             : 
    5754       47238 :         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
    5755             :         /*
    5756             :          * We only need lookup, the rest is read-only and there's no inode
    5757             :          * associated with the dentry
    5758             :          */
    5759       47238 :         inode->i_op = &simple_dir_inode_operations;
    5760       47238 :         inode->i_opflags &= ~IOP_XATTR;
    5761       47238 :         inode->i_fop = &simple_dir_operations;
    5762       47238 :         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
                               /* mtime is sampled once and copied to atime, ctime and otime. */
    5763       47238 :         inode->i_mtime = current_time(inode);
    5764       47238 :         inode->i_atime = inode->i_mtime;
    5765       47238 :         inode->i_ctime = inode->i_mtime;
    5766       47238 :         BTRFS_I(inode)->i_otime = inode->i_mtime;
    5767             : 
    5768       47238 :         return inode;
    5769             : }
    5770             : 
    /*
     * Compile-time checks that every BTRFS_FT_* value equals the FT_* value of
     * the same name.  Presumably this is what allows the generic FT_* helpers to
     * be used directly on btrfs directory item types - confirm against callers.
     */
    5771             : static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
    5772             : static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
    5773             : static_assert(BTRFS_FT_DIR == FT_DIR);
    5774             : static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
    5775             : static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
    5776             : static_assert(BTRFS_FT_FIFO == FT_FIFO);
    5777             : static_assert(BTRFS_FT_SOCK == FT_SOCK);
    5778             : static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
    5779             : 
    /*
     * Return the file type of @inode, derived from its i_mode by
     * fs_umode_to_ftype().
     */
    5780             : static inline u8 btrfs_inode_type(struct inode *inode)
    5781             : {
    5782     3581366 :         return fs_umode_to_ftype(inode->i_mode);
    5783             : }
    5784             : 
    /*
     * Resolve the name in @dentry inside directory @dir to an inode.
     *
     * Names longer than BTRFS_NAME_LEN yield ERR_PTR(-ENAMETOOLONG) and a
     * negative result from btrfs_inode_by_name() is returned as an ERR_PTR().
     *
     * If the resulting key is an inode item key, the inode is read from the
     * directory's own root and its type is checked against the type stored in
     * the directory entry; a mismatch yields ERR_PTR(-EUCLEAN).
     *
     * Any other key type goes through fixup_tree_root_location().  -ENOENT from
     * that call produces a placeholder inode from new_simple_dir(), any other
     * failure is returned as an ERR_PTR(), and on success the inode is read
     * from the root reported in sub_root, followed by btrfs_orphan_cleanup() on
     * that root unless the super block is read-only.
     */
    5785     3425615 : struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
    5786             : {
    5787     3425615 :         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
    5788     3425615 :         struct inode *inode;
    5789     3425615 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    5790     3425615 :         struct btrfs_root *sub_root = root;
    5791     3425615 :         struct btrfs_key location;
    5792     3425615 :         u8 di_type = 0;
    5793     3425615 :         int ret = 0;
    5794             : 
    5795     3425615 :         if (dentry->d_name.len > BTRFS_NAME_LEN)
    5796             :                 return ERR_PTR(-ENAMETOOLONG);
    5797             : 
    5798     3424900 :         ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
    5799     3430903 :         if (ret < 0)
    5800     3178508 :                 return ERR_PTR(ret);
    5801             : 
    5802      252395 :         if (location.type == BTRFS_INODE_ITEM_KEY) {
    5803      204104 :                 inode = btrfs_iget(dir->i_sb, location.objectid, root);
    5804      204109 :                 if (IS_ERR(inode))
    5805             :                         return inode;
    5806             : 
    5807             :                 /* Do extra check against inode mode with di_type */
    5808      204109 :                 if (btrfs_inode_type(inode) != di_type) {
    5809           0 :                         btrfs_crit(fs_info,
    5810             : "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
    5811             :                                   inode->i_mode, btrfs_inode_type(inode),
    5812             :                                   di_type);
    5813           0 :                         iput(inode);
    5814           0 :                         return ERR_PTR(-EUCLEAN);
    5815             :                 }
    5816             :                 return inode;
    5817             :         }
    5818             : 
    5819       48291 :         ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
    5820             :                                        &location, &sub_root);
    5821       48291 :         if (ret < 0) {
    5822       47238 :                 if (ret != -ENOENT)
    5823           0 :                         inode = ERR_PTR(ret);
    5824             :                 else
    5825       47238 :                         inode = new_simple_dir(dir->i_sb, &location, root);
    5826             :         } else {
    5827        1053 :                 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
                                       /*
                                        * NOTE(review): sub_root is still passed to
                                        * btrfs_orphan_cleanup() below after this put.
                                        * Presumably the inode obtained above keeps the root
                                        * alive through its own root reference - confirm.
                                        */
    5828        1053 :                 btrfs_put_root(sub_root);
    5829             : 
    5830        1053 :                 if (IS_ERR(inode))
    5831             :                         return inode;
    5832             : 
                                       /* ret is 0 here, so it stays 0 on a read-only super block. */
    5833        1053 :                 down_read(&fs_info->cleanup_work_sem);
    5834        1053 :                 if (!sb_rdonly(inode->i_sb))
    5835        1052 :                         ret = btrfs_orphan_cleanup(sub_root);
    5836        1053 :                 up_read(&fs_info->cleanup_work_sem);
    5837        1053 :                 if (ret) {
    5838           0 :                         iput(inode);
    5839           0 :                         inode = ERR_PTR(ret);
    5840             :                 }
    5841             :         }
    5842             : 
    5843             :         return inode;
    5844             : }
    5845             : 
    /*
     * Report whether @dentry should be dropped instead of kept around.
     *
     * The decision is based on the dentry's inode or, for a negative dentry that
     * is not a root dentry, on the inode of its parent.  Returns 1 when that
     * inode's root item has a reference count of zero or when the inode number
     * is BTRFS_EMPTY_SUBVOL_DIR_OBJECTID, otherwise 0.
     */
    5846    11360156 : static int btrfs_dentry_delete(const struct dentry *dentry)
    5847             : {
    5848    11360156 :         struct btrfs_root *root;
    5849    11360156 :         struct inode *inode = d_inode(dentry);
    5850             : 
    5851    11360156 :         if (!inode && !IS_ROOT(dentry))
    5852     1799064 :                 inode = d_inode(dentry->d_parent);
    5853             : 
    5854    11360156 :         if (inode) {
    5855    11360156 :                 root = BTRFS_I(inode)->root;
    5856    11360156 :                 if (btrfs_root_refs(&root->root_item) == 0)
    5857             :                         return 1;
    5858             : 
    5859    11360154 :                 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
    5860       47234 :                         return 1;
    5861             :         }
    5862             :         return 0;
    5863             : }
    5864             : 
    /*
     * Lookup entry point built on btrfs_lookup_dentry().
     *
     * ERR_PTR(-ENOENT) is turned into a NULL inode before the result is handed
     * to d_splice_alias(); every other result is passed through unchanged.
     * @flags is not used.
     */
    5865     3425668 : static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
    5866             :                                    unsigned int flags)
    5867             : {
    5868     3425668 :         struct inode *inode = btrfs_lookup_dentry(dir, dentry);
    5869             : 
    5870     3429614 :         if (inode == ERR_PTR(-ENOENT))
    5871     3177754 :                 inode = NULL;
    5872     3429614 :         return d_splice_alias(inode, dentry);
    5873             : }
    5874             : 
    5875             : /*
    5876             :  * All this infrastructure exists because dir_emit can fault, and we are holding
    5877             :  * the tree lock when doing readdir.  For now just allocate a buffer and copy
    5878             :  * our information into that, and then dir_emit from the buffer.  This is
    5879             :  * similar to what NFS does, only we don't keep the buffer around in pagecache
    5880             :  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
    5881             :  * copy_to_user_inatomic so we don't have to worry about page faulting under the
    5882             :  * tree lock.
                     :  *
                     :  * btrfs_opendir() allocates the per-open struct btrfs_file_private together
                     :  * with a PAGE_SIZE filldir_buf and stores it in file->private_data.
                     :  * Returns 0 on success or -ENOMEM if either allocation fails.
    5883             :  */
    5884      148891 : static int btrfs_opendir(struct inode *inode, struct file *file)
    5885             : {
    5886      148891 :         struct btrfs_file_private *private;
    5887             : 
    5888      148891 :         private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
    5889      148842 :         if (!private)
    5890             :                 return -ENOMEM;
    5891      148842 :         private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
    5892      148914 :         if (!private->filldir_buf) {
    5893           0 :                 kfree(private);
    5894           0 :                 return -ENOMEM;
    5895             :         }
    5896      148914 :         file->private_data = private;
    5897      148914 :         return 0;
    5898             : }
    5899             : 
    /*
     * Header of one record in the readdir staging buffer (filldir_buf).
     *
     * The name bytes follow the header directly and the next header starts right
     * after the name, so headers are read and written with get_unaligned() and
     * put_unaligned().
     */
    5900             : struct dir_entry {
    5901             :         u64 ino;        /* objectid of the key stored in the dir item */
    5902             :         u64 offset;     /* offset of the dir index key; becomes ctx->pos */
    5903             :         unsigned type;  /* result of fs_ftype_to_dtype() for the entry */
    5904             :         int name_len;   /* number of name bytes following the header */
    5905             : };
    5906             : 
    /*
     * Emit @entries staged records, starting at @addr, through dir_emit().
     *
     * Before each emit ctx->pos is set to the record's offset; after a
     * successful emit it is advanced by one.
     *
     * Returns 1 as soon as dir_emit() reports failure, 0 once all records have
     * been emitted.
     */
    5907    22303904 : static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
    5908             : {
    5909  2551777309 :         while (entries--) {
    5910  2531866470 :                 struct dir_entry *entry = addr;
                                       /* The name is stored right behind the header. */
    5911  2531866470 :                 char *name = (char *)(entry + 1);
    5912             : 
    5913  2531866470 :                 ctx->pos = get_unaligned(&entry->offset);
    5914  2531869431 :                 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
    5915  2531866470 :                                          get_unaligned(&entry->ino),
    5916  2531866470 :                                          get_unaligned(&entry->type)))
    5917             :                         return 1;
                                       /* Step over this header and its name to the next record. */
    5918  2529473405 :                 addr += sizeof(struct dir_entry) +
    5919  2529473405 :                         get_unaligned(&entry->name_len);
    5920  2529473405 :                 ctx->pos++;
    5921             :         }
    5922             :         return 0;
    5923             : }
    5924             : 
    /*
     * Readdir implementation: walk the BTRFS_DIR_INDEX_KEY items of the
     * directory starting at ctx->pos, stage them in the per-open filldir_buf
     * and emit them with btrfs_filldir().
     *
     * Index offsets matched by btrfs_should_delete_dir_index() against the
     * delayed deletion list are skipped, and the delayed insertion list is
     * emitted through btrfs_readdir_delayed_dir_index() after the tree items.
     *
     * Returns 0 when iteration stops normally, including when an emit helper
     * asks to stop, -ENOMEM if no path can be allocated, or the negative value
     * left in ret by the tree iteration.
     */
    5925     2773639 : static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
    5926             : {
    5927     2773639 :         struct inode *inode = file_inode(file);
    5928     2773639 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    5929     2773639 :         struct btrfs_file_private *private = file->private_data;
    5930     2773639 :         struct btrfs_dir_item *di;
    5931     2773639 :         struct btrfs_key key;
    5932     2773639 :         struct btrfs_key found_key;
    5933     2773639 :         struct btrfs_path *path;
    5934     2773639 :         void *addr;
    5935     2773639 :         struct list_head ins_list;
    5936     2773639 :         struct list_head del_list;
    5937     2773639 :         int ret;
    5938     2773639 :         char *name_ptr;
    5939     2773639 :         int name_len;
    5940     2773639 :         int entries = 0;
    5941     2773639 :         int total_len = 0;
    5942     2773639 :         bool put = false;
    5943     2773639 :         struct btrfs_key location;
    5944             : 
    5945     2773639 :         if (!dir_emit_dots(file, ctx))
    5946             :                 return 0;
    5947             : 
    5948     2773394 :         path = btrfs_alloc_path();
    5949     2773649 :         if (!path)
    5950             :                 return -ENOMEM;
    5951             : 
    5952     2773649 :         addr = private->filldir_buf;
    5953     2773649 :         path->reada = READA_FORWARD;
    5954             : 
    5955     2773649 :         INIT_LIST_HEAD(&ins_list);
    5956     2773649 :         INIT_LIST_HEAD(&del_list);
                               /* "put" records whether the lists must be released at exit. */
    5957     2773649 :         put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
    5958             : 
                       /* (Re)start the tree walk at the current ctx->pos. */
    5959    22306084 : again:
    5960    22306084 :         key.type = BTRFS_DIR_INDEX_KEY;
    5961    22306084 :         key.offset = ctx->pos;
    5962    22306084 :         key.objectid = btrfs_ino(BTRFS_I(inode));
    5963             : 
    5964  2671693152 :         btrfs_for_each_slot(root, &key, &found_key, path, ret) {
    5965  2664125214 :                 struct dir_entry *entry;
    5966  2664125214 :                 struct extent_buffer *leaf = path->nodes[0];
    5967  2664125214 :                 u8 ftype;
    5968             : 
                                       /* Stop at the first key that is not a dir index of this inode. */
    5969  2664125214 :                 if (found_key.objectid != key.objectid)
    5970             :                         break;
    5971  2663743696 :                 if (found_key.type != BTRFS_DIR_INDEX_KEY)
    5972             :                         break;
    5973  2663743696 :                 if (found_key.offset < ctx->pos)
    5974           0 :                         continue;
    5975  2663743696 :                 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
    5976        9319 :                         continue;
    5977  2657509103 :                 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
    5978  2655030288 :                 name_len = btrfs_dir_name_len(leaf, di);
                                       /*
                                        * The staging buffer cannot take this record: release
                                        * the path, emit what is staged, then reset the buffer
                                        * and restart the search from the advanced ctx->pos.
                                        */
    5979  2658427244 :                 if ((total_len + sizeof(struct dir_entry) + name_len) >=
    5980             :                     PAGE_SIZE) {
    5981    21919240 :                         btrfs_release_path(path);
    5982    21923047 :                         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
    5983    21923336 :                         if (ret)
    5984     2391247 :                                 goto nopos;
    5985    19532089 :                         addr = private->filldir_buf;
    5986    19532089 :                         entries = 0;
    5987    19532089 :                         total_len = 0;
    5988    19532089 :                         goto again;
    5989             :                 }
    5990             : 
                                       /* Stage the record: header first, name right behind it. */
    5991  2636508004 :                 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
    5992  2632548741 :                 entry = addr;
    5993  2632548741 :                 name_ptr = (char *)(entry + 1);
    5994  2632548741 :                 read_extent_buffer(leaf, name_ptr,
    5995  2632548741 :                                    (unsigned long)(di + 1), name_len);
    5996  2625464988 :                 put_unaligned(name_len, &entry->name_len);
    5997  2625464988 :                 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
    5998  2629826952 :                 btrfs_dir_item_key_to_cpu(leaf, di, &location);
    5999  2649377749 :                 put_unaligned(location.objectid, &entry->ino);
    6000  2649377749 :                 put_unaligned(found_key.offset, &entry->offset);
    6001  2649377749 :                 entries++;
    6002  2649377749 :                 addr += sizeof(struct dir_entry) + name_len;
    6003  2649377749 :                 total_len += sizeof(struct dir_entry) + name_len;
    6004             :         }
    6005             :         /* Catch error encountered during iteration */
    6006      382532 :         if (ret < 0)
    6007           0 :                 goto err;
    6008             : 
    6009      382532 :         btrfs_release_path(path);
    6010             : 
                               /* Emit whatever is still staged after the walk has finished. */
    6011      382537 :         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
    6012      382529 :         if (ret)
    6013        4560 :                 goto nopos;
    6014             : 
    6015      377969 :         ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
    6016      377965 :         if (ret)
    6017       32063 :                 goto nopos;
    6018             : 
    6019             :         /*
    6020             :          * Stop new entries from being returned after we return the last
    6021             :          * entry.
    6022             :          *
    6023             :          * New directory entries are assigned a strictly increasing
    6024             :          * offset.  This means that new entries created during readdir
    6025             :          * are *guaranteed* to be seen in the future by that readdir.
    6026             :          * This has broken buggy programs which operate on names as
    6027             :          * they're returned by readdir.  Until we re-use freed offsets
    6028             :          * we have this hack to stop new entries from being returned
    6029             :          * under the assumption that they'll never reach this huge
    6030             :          * offset.
    6031             :          *
    6032             :          * This is being careful not to overflow 32bit loff_t unless the
    6033             :          * last entry requires it because doing so has broken 32bit apps
    6034             :          * in the past.
    6035             :          */
    6036      345902 :         if (ctx->pos >= INT_MAX)
    6037      183840 :                 ctx->pos = LLONG_MAX;
    6038             :         else
    6039      162062 :                 ctx->pos = INT_MAX;
                       /* Reached with ctx->pos left as is; a stop request is not an error. */
    6040             : nopos:
    6041             :         ret = 0;
                       /* Common exit: ret is returned unchanged from here on. */
    6042     2773772 : err:
    6043     2773772 :         if (put)
    6044      487865 :                 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
    6045     2773768 :         btrfs_free_path(path);
    6046     2773768 :         return ret;
    6047             : }
    6048             : 
    6049             : /*
    6050             :  * This is somewhat expensive, updating the tree every time the
    6051             :  * inode changes.  But, it is most likely to find the inode in cache.
    6052             :  * FIXME, needs more benchmarking...there are no reasons other than performance
    6053             :  * to keep or drop this code.
                     :  *
                     :  * Inodes flagged BTRFS_INODE_DUMMY are skipped and 0 is returned for them.
                     :  * Otherwise the inode is updated inside a joined transaction; if that
                     :  * update fails with -ENOSPC or -EDQUOT it is retried once in a transaction
                     :  * started with btrfs_start_transaction(root, 1).
                     :  *
                     :  * Returns the result of the last btrfs_update_inode() call, or the error
                     :  * from joining or starting the transaction.
    6054             :  */
    6055     4441656 : static int btrfs_dirty_inode(struct btrfs_inode *inode)
    6056             : {
    6057     4441656 :         struct btrfs_root *root = inode->root;
    6058     4441656 :         struct btrfs_fs_info *fs_info = root->fs_info;
    6059     4441656 :         struct btrfs_trans_handle *trans;
    6060     4441656 :         int ret;
    6061             : 
    6062     8883312 :         if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
    6063             :                 return 0;
    6064             : 
    6065     4442049 :         trans = btrfs_join_transaction(root);
    6066     4450732 :         if (IS_ERR(trans))
    6067           0 :                 return PTR_ERR(trans);
    6068             : 
    6069     4450732 :         ret = btrfs_update_inode(trans, root, inode);
    6070     4450623 :         if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
    6071             :                 /* whoops, let's try again with the full transaction */
    6072       89131 :                 btrfs_end_transaction(trans);
    6073       89054 :                 trans = btrfs_start_transaction(root, 1);
    6074       89157 :                 if (IS_ERR(trans))
    6075        1106 :                         return PTR_ERR(trans);
    6076             : 
    6077       88051 :                 ret = btrfs_update_inode(trans, root, inode);
    6078             :         }
    6079     4449544 :         btrfs_end_transaction(trans);
                               /* Only inodes that have a delayed node trigger the balance call. */
    6080     4447725 :         if (inode->delayed_node)
    6081     4447725 :                 btrfs_balance_delayed_items(fs_info);
    6082             : 
    6083             :         return ret;
    6084             : }
    6085             : 
    6086             : /*
    6087             :  * This is a copy of file_update_time.  We need this so we can return error on
    6088             :  * ENOSPC for updating the inode in the case of file write and mmap writes.
                     :  *
                     :  * @inode: inode whose timestamps are updated
                     :  * @now:   time value copied into the selected timestamps
                     :  * @flags: combination of S_VERSION, S_CTIME, S_MTIME and S_ATIME
                     :  *
                     :  * Returns -EROFS when the inode's root is read-only, otherwise the result
                     :  * of btrfs_dirty_inode() if the inode has to be written back, else 0.
    6089             :  */
    6090     4699421 : static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
    6091             :                              int flags)
    6092             : {
    6093     4699421 :         struct btrfs_root *root = BTRFS_I(inode)->root;
                               /* Any flag other than S_VERSION forces the inode to be dirtied. */
    6094     4699421 :         bool dirty = flags & ~S_VERSION;
    6095             : 
    6096     4699421 :         if (btrfs_root_readonly(root))
    6097             :                 return -EROFS;
    6098             : 
    6099     1995090 :         if (flags & S_VERSION)
    6100        1608 :                 dirty |= inode_maybe_inc_iversion(inode, dirty);
    6101     1995090 :         if (flags & S_CTIME)
    6102      597158 :                 inode->i_ctime = *now;
    6103     1995090 :         if (flags & S_MTIME)
    6104      664837 :                 inode->i_mtime = *now;
    6105     1995090 :         if (flags & S_ATIME)
    6106     1328639 :                 inode->i_atime = *now;
    6107     1995090 :         return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
    6108             : }
    6109             : 
    6110             : /*
    6111             :  * find the highest existing sequence number in a directory
    6112             :  * and then set the in-memory index_cnt variable to reflect
    6113             :  * free sequence numbers
                     :  *
                     :  * The search looks for the key (ino, BTRFS_DIR_INDEX_KEY, (u64)-1) and then
                     :  * inspects the item in the slot before the returned position.
                     :  *
                     :  * Returns 0 on success, -ENOMEM if no path can be allocated, or the
                     :  * negative error from btrfs_search_slot().  Note that an exact match of
                     :  * the search key also returns 0 but leaves index_cnt untouched.
    6114             :  */
    6115        6213 : static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
    6116             : {
    6117        6213 :         struct btrfs_root *root = inode->root;
    6118        6213 :         struct btrfs_key key, found_key;
    6119        6213 :         struct btrfs_path *path;
    6120        6213 :         struct extent_buffer *leaf;
    6121        6213 :         int ret;
    6122             : 
    6123        6213 :         key.objectid = btrfs_ino(inode);
    6124        6213 :         key.type = BTRFS_DIR_INDEX_KEY;
    6125        6213 :         key.offset = (u64)-1;
    6126             : 
    6127        6213 :         path = btrfs_alloc_path();
    6128        6213 :         if (!path)
    6129             :                 return -ENOMEM;
    6130             : 
    6131        6213 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    6132        6213 :         if (ret < 0)
    6133           0 :                 goto out;
    6134             :         /* FIXME: we should be able to handle this */
    6135        6213 :         if (ret == 0)
    6136           0 :                 goto out;
    6137        6213 :         ret = 0;
    6138             : 
                               /* Nothing sits before slot 0 in this leaf: start from scratch. */
    6139        6213 :         if (path->slots[0] == 0) {
    6140           0 :                 inode->index_cnt = BTRFS_DIR_START_INDEX;
    6141           0 :                 goto out;
    6142             :         }
    6143             : 
    6144        6213 :         path->slots[0]--;
    6145             : 
    6146        6213 :         leaf = path->nodes[0];
    6147        6213 :         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    6148             : 
                               /* The previous item is not a dir index of this inode. */
    6149        6213 :         if (found_key.objectid != btrfs_ino(inode) ||
    6150        6213 :             found_key.type != BTRFS_DIR_INDEX_KEY) {
    6151        2619 :                 inode->index_cnt = BTRFS_DIR_START_INDEX;
    6152        2619 :                 goto out;
    6153             :         }
    6154             : 
    6155        3594 :         inode->index_cnt = found_key.offset + 1;
    6156        6213 : out:
    6157        6213 :         btrfs_free_path(path);
    6158        6213 :         return ret;
    6159             : }
    6160             : 
    6161             : /*
    6162             :  * helper to find a free sequence number in a given directory.  This current
    6163             :  * code is very simple, later versions will do smarter things in the btree
                     :  *
                     :  * @dir:   directory the new entry goes into
                     :  * @index: receives the sequence number to use
                     :  *
                     :  * While dir->index_cnt still holds (u64)-1, the counter is first set up via
                     :  * btrfs_inode_delayed_dir_index_count() and, if that returns non-zero, via
                     :  * btrfs_set_inode_index_count().  The counter is then handed out through
                     :  * @index and incremented.
                     :  *
                     :  * Returns 0 on success or the error from btrfs_set_inode_index_count().
    6164             :  */
    6165     3377621 : int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
    6166             : {
    6167     3377621 :         int ret = 0;
    6168             : 
    6169     3377621 :         if (dir->index_cnt == (u64)-1) {
    6170        6213 :                 ret = btrfs_inode_delayed_dir_index_count(dir);
    6171        6213 :                 if (ret) {
    6172        6213 :                         ret = btrfs_set_inode_index_count(dir);
    6173        6213 :                         if (ret)
    6174             :                                 return ret;
    6175             :                 }
    6176             :         }
    6177             : 
    6178     3377621 :         *index = dir->index_cnt;
    6179     3377621 :         dir->index_cnt++;
    6180             : 
    6181     3377621 :         return ret;
    6182             : }
    6183             : 
                       /*
                        * Thin wrapper around insert_inode_locked4() for a new btrfs inode.
                        *
                        * The lookup arguments (objectid and root) are taken from the btrfs
                        * inode, the hash value is computed by btrfs_inode_hash() from i_ino and
                        * the root, and btrfs_find_actor is passed as the match callback.
                        *
                        * Returns whatever insert_inode_locked4() returns; the caller in this
                        * file treats a negative value as failure.
                        */
    6184     3249886 : static int btrfs_insert_inode_locked(struct inode *inode)
    6185             : {
    6186     3249886 :         struct btrfs_iget_args args;
    6187             : 
    6188     3249886 :         args.ino = BTRFS_I(inode)->location.objectid;
    6189     3249886 :         args.root = BTRFS_I(inode)->root;
    6190             : 
    6191     6505069 :         return insert_inode_locked4(inode,
    6192     3249886 :                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
    6193             :                    btrfs_find_actor, &args);
    6194             : }
    6195             : 
                       /*
                        * Prepare @args for creating a new inode and compute how many items the
                        * creation will touch.
                        *
                        * @args:            new inode arguments; dir, inode, orphan and (unless
                        *                   orphan) dentry are read, fname, default_acl and acl
                        *                   are filled in
                        * @trans_num_items: receives the number of items to reserve; the caller
                        *                   btrfs_create_common() passes it to
                        *                   btrfs_start_transaction()
                        *
                        * Unless args->orphan is set, args->fname is set up from the dentry name
                        * with fscrypt_setup_filename(). The ACLs are obtained from
                        * posix_acl_create(), which is also handed a pointer to inode->i_mode.
                        *
                        * Returns 0 on success or the error from fscrypt_setup_filename() /
                        * posix_acl_create(). If posix_acl_create() fails, the filename is freed
                        * again here. After a successful return, btrfs_new_inode_args_destroy()
                        * releases fname and both ACLs.
                        */
    6196     3252162 : int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
    6197             :                             unsigned int *trans_num_items)
    6198             : {
    6199     3252162 :         struct inode *dir = args->dir;
    6200     3252162 :         struct inode *inode = args->inode;
    6201     3252162 :         int ret;
    6202             : 
    6203     3252162 :         if (!args->orphan) {
    6204     2846886 :                 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
    6205             :                                              &args->fname);
    6206     2846379 :                 if (ret)
    6207             :                         return ret;
    6208             :         }
    6209             : 
    6210     3251655 :         ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
    6211     3235795 :         if (ret) {
    6212             :                 fscrypt_free_filename(&args->fname);
    6213             :                 return ret;
    6214             :         }
    6215             : 
    6216             :         /* 1 to add inode item */
    6217     3235795 :         *trans_num_items = 1;
    6218             :         /* 1 to add compression property */
    6219     3235795 :         if (BTRFS_I(dir)->prop_compress)
    6220          12 :                 (*trans_num_items)++;
    6221             :         /* 1 to add default ACL xattr */
    6222     3235795 :         if (args->default_acl)
    6223        2771 :                 (*trans_num_items)++;
    6224             :         /* 1 to add access ACL xattr */
    6225     3235795 :         if (args->acl)
    6226       31056 :                 (*trans_num_items)++;
    6227             : #ifdef CONFIG_SECURITY
    6228             :         /* 1 to add LSM xattr */
    6229             :         if (dir->i_security)
    6230             :                 (*trans_num_items)++;
    6231             : #endif
    6232     3235795 :         if (args->orphan) {
    6233             :                 /* 1 to add orphan item */
    6234      395019 :                 (*trans_num_items)++;
    6235             :         } else {
    6236             :                 /*
    6237             :                  * 1 to add dir item
    6238             :                  * 1 to add dir index
    6239             :                  * 1 to update parent inode item
    6240             :                  *
    6241             :                  * No need for 1 unit for the inode ref item because it is
    6242             :                  * inserted in a batch together with the inode item at
    6243             :                  * btrfs_create_new_inode().
    6244             :                  */
    6245     2840776 :                 *trans_num_items += 3;
    6246             :         }
    6247             :         return 0;
    6248             : }
    6249             : 
                       /*
                        * Release what btrfs_new_inode_prepare() stored in @args: the access
                        * ACL, the default ACL and the fscrypt filename.
                        */
    6250     3255532 : void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
    6251             : {
    6252     3255532 :         posix_acl_release(args->acl);
    6253     3255054 :         posix_acl_release(args->default_acl);
    6254     3255112 :         fscrypt_free_filename(&args->fname);
    6255     3255112 : }
    6256             : 
    6257             : /*
    6258             :  * Inherit flags from the parent inode.
    6259             :  *
    6260             :  * Currently only the compression flags and the cow flags are inherited.
                        *
                        * @inode: the new inode whose flags are updated
                        * @dir:   the parent directory the flags are read from
                        *
                        * BTRFS_INODE_NOCOMPRESS on the parent is checked first and wins over
                        * BTRFS_INODE_COMPRESS; whichever one is inherited clears the opposite
                        * flag on @inode. BTRFS_INODE_NODATACOW is inherited as is, and for
                        * regular files it additionally sets BTRFS_INODE_NODATASUM. Finally the
                        * btrfs flags are pushed to the VFS inode with
                        * btrfs_sync_inode_flags_to_i_flags().
    6261             :  */
    6262     3253212 : static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
    6263             : {
    6264     3253212 :         unsigned int flags;
    6265             : 
    6266     3253212 :         flags = dir->flags;
    6267             : 
    6268     3253212 :         if (flags & BTRFS_INODE_NOCOMPRESS) {
    6269           0 :                 inode->flags &= ~BTRFS_INODE_COMPRESS;
    6270           0 :                 inode->flags |= BTRFS_INODE_NOCOMPRESS;
    6271     3253212 :         } else if (flags & BTRFS_INODE_COMPRESS) {
    6272          11 :                 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
    6273          11 :                 inode->flags |= BTRFS_INODE_COMPRESS;
    6274             :         }
    6275             : 
    6276     3253212 :         if (flags & BTRFS_INODE_NODATACOW) {
    6277           0 :                 inode->flags |= BTRFS_INODE_NODATACOW;
    6278           0 :                 if (S_ISREG(inode->vfs_inode.i_mode))
    6279           0 :                         inode->flags |= BTRFS_INODE_NODATASUM;
    6280             :         }
    6281             : 
    6282     3253212 :         btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
    6283     3251258 : }
    6284             : 
                       /*
                        * Create the items and in-memory state for the new inode described by
                        * @args, inside the transaction @trans.
                        *
                        * Steps, in order:
                        *  - unless creating a subvolume, grab the parent directory's root for
                        *    the new inode; then allocate a free objectid, which becomes i_ino
                        *  - for an orphan (O_TMPFILE) set the link count to 0, otherwise reserve
                        *    a dir index in the parent with btrfs_set_inode_index()
                        *  - inherit flags from the parent (not for subvolumes) and apply the
                        *    NODATASUM / NODATACOW mount options to regular files
                        *  - insert the inode with btrfs_insert_inode_locked()
                        *  - insert the inode item and, unless orphan, the inode ref in a single
                        *    batch, then fill both in
                        *  - inherit properties and, unless creating a subvolume, call
                        *    btrfs_init_inode_security()
                        *  - add the orphan item, or link the inode into the parent directory
                        *    with btrfs_add_link()
                        *
                        * Returns 0 on success or an error. Failures before the item batch is
                        * inserted leave through 'out'; later failures abort the transaction and
                        * leave through 'discard', which calls discard_new_inode().
                        */
    6285     3254685 : int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
    6286             :                            struct btrfs_new_inode_args *args)
    6287             : {
    6288     3254685 :         struct inode *dir = args->dir;
    6289     3254685 :         struct inode *inode = args->inode;
    6290     3254685 :         const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
    6291     3254685 :         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
    6292     3254685 :         struct btrfs_root *root;
    6293     3254685 :         struct btrfs_inode_item *inode_item;
    6294     3254685 :         struct btrfs_key *location;
    6295     3254685 :         struct btrfs_path *path;
    6296     3254685 :         u64 objectid;
    6297     3254685 :         struct btrfs_inode_ref *ref;
    6298     3254685 :         struct btrfs_key key[2];
    6299     3254685 :         u32 sizes[2];
    6300     3254685 :         struct btrfs_item_batch batch;
    6301     3254685 :         unsigned long ptr;
    6302     3254685 :         int ret;
    6303             : 
    6304     3254685 :         path = btrfs_alloc_path();
    6305     3254523 :         if (!path)
    6306             :                 return -ENOMEM;
    6307             : 
    6308     3254523 :         if (!args->subvol)
    6309     3254295 :                 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
    6310     3254945 :         root = BTRFS_I(inode)->root;
    6311             : 
    6312     3254945 :         ret = btrfs_get_free_objectid(root, &objectid);
    6313     3254343 :         if (ret)
    6314           0 :                 goto out;
    6315     3254343 :         inode->i_ino = objectid;
    6316             : 
    6317     3254343 :         if (args->orphan) {
    6318             :                 /*
    6319             :                  * O_TMPFILE, set link count to 0, so that after this point, we
    6320             :                  * fill in an inode item with the correct link count.
    6321             :                  */
    6322      403959 :                 set_nlink(inode, 0);
    6323             :         } else {
    6324     2850384 :                 trace_btrfs_inode_request(dir);
    6325             : 
    6326     2850092 :                 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
    6327     2850097 :                 if (ret)
    6328           0 :                         goto out;
    6329             :         }
    6330             :         /* index_cnt is ignored for everything but a dir. */
    6331     3254274 :         BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
    6332     3254274 :         BTRFS_I(inode)->generation = trans->transid;
    6333     3254274 :         inode->i_generation = BTRFS_I(inode)->generation;
    6334             : 
    6335             :         /*
    6336             :          * Subvolumes don't inherit flags from their parent directory.
    6337             :          * Originally this was probably by accident, but we probably can't
    6338             :          * change it now without compatibility issues.
    6339             :          */
    6340     3254274 :         if (!args->subvol)
    6341     3253929 :                 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
    6342             : 
    6343     3251471 :         if (S_ISREG(inode->i_mode)) {
    6344     3058976 :                 if (btrfs_test_opt(fs_info, NODATASUM))
    6345          28 :                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
    6346     3058976 :                 if (btrfs_test_opt(fs_info, NODATACOW))
    6347          24 :                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
    6348             :                                 BTRFS_INODE_NODATASUM;
    6349             :         }
    6350             : 
    6351     3251471 :         location = &BTRFS_I(inode)->location;
    6352     3251471 :         location->objectid = objectid;
    6353     3251471 :         location->offset = 0;
    6354     3251471 :         location->type = BTRFS_INODE_ITEM_KEY;
    6355             : 
    6356     3251471 :         ret = btrfs_insert_inode_locked(inode);
    6357     3255183 :         if (ret < 0) {
                                       /* Give back the dir index reserved above. */
    6358           0 :                 if (!args->orphan)
    6359           0 :                         BTRFS_I(dir)->index_cnt--;
    6360           0 :                 goto out;
    6361             :         }
    6362             : 
    6363             :         /*
    6364             :          * We could have gotten an inode number from somebody who was fsynced
    6365             :          * and then removed in this same transaction, so let's just set full
    6366             :          * sync since it will be a full sync anyway and this will blow away the
    6367             :          * old info in the log.
    6368             :          */
    6369     3255183 :         btrfs_set_inode_full_sync(BTRFS_I(inode));
    6370             : 
    6371     3255074 :         key[0].objectid = objectid;
    6372     3255074 :         key[0].type = BTRFS_INODE_ITEM_KEY;
    6373     3255074 :         key[0].offset = 0;
    6374             : 
    6375     3255074 :         sizes[0] = sizeof(struct btrfs_inode_item);
    6376             : 
    6377     3255074 :         if (!args->orphan) {
    6378             :                 /*
    6379             :                  * Start new inodes with an inode_ref. This is slightly more
    6380             :                  * efficient for small numbers of hard links since they will
    6381             :                  * be packed into one item. Extended refs will kick in if we
    6382             :                  * add more hard links than can fit in the ref item.
    6383             :                  */
    6384     2850609 :                 key[1].objectid = objectid;
    6385     2850609 :                 key[1].type = BTRFS_INODE_REF_KEY;
    6386     2850609 :                 if (args->subvol) {
                                               /* The ref names ".." (2 bytes) and points at the inode itself. */
    6387         252 :                         key[1].offset = objectid;
    6388         252 :                         sizes[1] = 2 + sizeof(*ref);
    6389             :                 } else {
    6390     2850357 :                         key[1].offset = btrfs_ino(BTRFS_I(dir));
    6391     2850357 :                         sizes[1] = name->len + sizeof(*ref);
    6392             :                 }
    6393             :         }
    6394             : 
                               /* Orphans get only the inode item; key[1]/sizes[1] are unused then. */
    6395     3255074 :         batch.keys = &key[0];
    6396     3255074 :         batch.data_sizes = &sizes[0];
    6397     3255074 :         batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
    6398     3255074 :         batch.nr = args->orphan ? 1 : 2;
    6399     3255074 :         ret = btrfs_insert_empty_items(trans, root, path, &batch);
    6400     3255174 :         if (ret != 0) {
    6401           0 :                 btrfs_abort_transaction(trans, ret);
    6402           0 :                 goto discard;
    6403             :         }
    6404             : 
    6405     3255174 :         inode->i_mtime = current_time(inode);
    6406     3255176 :         inode->i_atime = inode->i_mtime;
    6407     3255176 :         inode->i_ctime = inode->i_mtime;
    6408     3255176 :         BTRFS_I(inode)->i_otime = inode->i_mtime;
    6409             : 
    6410             :         /*
    6411             :          * We're going to fill the inode item now, so at this point the inode
    6412             :          * must be fully initialized.
    6413             :          */
    6414             : 
    6415     3255176 :         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
    6416             :                                   struct btrfs_inode_item);
    6417     3255177 :         memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
    6418             :                              sizeof(*inode_item));
    6419     3255178 :         fill_inode_item(trans, path->nodes[0], inode_item, inode);
    6420             : 
    6421     3255179 :         if (!args->orphan) {
                                       /* The inode ref is the second item of the batch (slot + 1). */
    6422     2850735 :                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
    6423             :                                      struct btrfs_inode_ref);
                                       /* The name bytes are stored right after the ref header. */
    6424     2850736 :                 ptr = (unsigned long)(ref + 1);
    6425     2850736 :                 if (args->subvol) {
    6426         252 :                         btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
    6427         252 :                         btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
    6428         252 :                         write_extent_buffer(path->nodes[0], "..", ptr, 2);
    6429             :                 } else {
    6430     2850484 :                         btrfs_set_inode_ref_name_len(path->nodes[0], ref,
    6431     2850484 :                                                      name->len);
    6432     2850482 :                         btrfs_set_inode_ref_index(path->nodes[0], ref,
    6433             :                                                   BTRFS_I(inode)->dir_index);
    6434     2850482 :                         write_extent_buffer(path->nodes[0], name->name, ptr,
    6435     2850482 :                                             name->len);
    6436             :                 }
    6437             :         }
    6438             : 
    6439     3255179 :         btrfs_mark_buffer_dirty(path->nodes[0]);
    6440             :         /*
    6441             :          * We don't need the path anymore, plus inheriting properties, adding
    6442             :          * ACLs, security xattrs, orphan item or adding the link, will result in
    6443             :          * allocating yet another path. So just free our path.
    6444             :          */
    6445     3255180 :         btrfs_free_path(path);
                               /* Cleared so the btrfs_free_path() at 'out' is not handed the freed path. */
    6446     3255177 :         path = NULL;
    6447             : 
    6448     3255177 :         if (args->subvol) {
    6449         252 :                 struct inode *parent;
    6450             : 
    6451             :                 /*
    6452             :                  * Subvolumes inherit properties from their parent subvolume,
    6453             :                  * not the directory they were created in.
    6454             :                  */
    6455         252 :                 parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
    6456             :                                     BTRFS_I(dir)->root);
    6457         252 :                 if (IS_ERR(parent)) {
    6458           0 :                         ret = PTR_ERR(parent);
    6459             :                 } else {
    6460         252 :                         ret = btrfs_inode_inherit_props(trans, inode, parent);
    6461         252 :                         iput(parent);
    6462             :                 }
    6463             :         } else {
    6464     3254925 :                 ret = btrfs_inode_inherit_props(trans, inode, dir);
    6465             :         }
                               /*
                                * A failure to inherit properties is only logged: ret is reassigned
                                * below before it is checked again.
                                */
    6466     3255178 :         if (ret) {
    6467           0 :                 btrfs_err(fs_info,
    6468             :                           "error inheriting props for ino %llu (root %llu): %d",
    6469             :                           btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
    6470             :                           ret);
    6471             :         }
    6472             : 
    6473             :         /*
    6474             :          * Subvolumes don't inherit ACLs or get passed to the LSM. This is
    6475             :          * probably a bug.
    6476             :          */
    6477     3255178 :         if (!args->subvol) {
    6478     3254922 :                 ret = btrfs_init_inode_security(trans, args);
    6479     3254921 :                 if (ret) {
    6480           0 :                         btrfs_abort_transaction(trans, ret);
    6481           0 :                         goto discard;
    6482             :                 }
    6483             :         }
    6484             : 
    6485     3255177 :         inode_tree_add(BTRFS_I(inode));
    6486             : 
    6487     3255183 :         trace_btrfs_inode_new(inode);
    6488     3255183 :         btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
    6489             : 
    6490     3255182 :         btrfs_update_root_times(trans, root);
    6491             : 
    6492     3255180 :         if (args->orphan) {
    6493      404447 :                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
    6494             :         } else {
                                       /* add_backref is 0: the inode ref was already written in the batch above. */
    6495     2850733 :                 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
    6496             :                                      0, BTRFS_I(inode)->dir_index);
    6497             :         }
    6498     3255067 :         if (ret) {
    6499           0 :                 btrfs_abort_transaction(trans, ret);
    6500           0 :                 goto discard;
    6501             :         }
    6502             : 
    6503             :         return 0;
    6504             : 
    6505           0 : discard:
    6506             :         /*
    6507             :          * discard_new_inode() calls iput(), but the caller owns the reference
    6508             :          * to the inode.
    6509             :          */
    6510           0 :         ihold(inode);
    6511           0 :         discard_new_inode(inode);
    6512           0 : out:
    6513           0 :         btrfs_free_path(path);
    6514           0 :         return ret;
    6515             : }
    6516             : 
    6517             : /*
    6518             :  * utility function to add 'inode' into 'parent_inode' with
    6519             :  * a given name and a given sequence number.
    6520             :  * if 'add_backref' is true, also insert a backref from the
    6521             :  * inode to the parent directory.
                        *
                        * When the inode number is BTRFS_FIRST_FREE_OBJECTID the dir item points
                        * at the root key of the inode's root instead of an inode item key, and a
                        * root ref is added regardless of 'add_backref'.
                        *
                        * On success the parent's i_size grows by twice the name length, its
                        * iversion is bumped, its mtime/ctime are set to the current time (not
                        * during log replay) and the parent inode item is updated.
                        *
                        * Returns 0 on success or an error. If btrfs_insert_dir_item() fails with
                        * -EEXIST or -EOVERFLOW, the root ref / inode ref added here is removed
                        * again and that error is returned without aborting the transaction,
                        * unless the removal itself fails. Other failures after the ref was added
                        * abort the transaction.
    6522             :  */
    6523     3377256 : int btrfs_add_link(struct btrfs_trans_handle *trans,
    6524             :                    struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
    6525             :                    const struct fscrypt_str *name, int add_backref, u64 index)
    6526             : {
    6527     3377256 :         int ret = 0;
    6528     3377256 :         struct btrfs_key key;
    6529     3377256 :         struct btrfs_root *root = parent_inode->root;
    6530     3377256 :         u64 ino = btrfs_ino(inode);
    6531     3377256 :         u64 parent_ino = btrfs_ino(parent_inode);
    6532             : 
                               /* Build the key the new dir item will point at. */
    6533     3377256 :         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
    6534         516 :                 memcpy(&key, &inode->root->root_key, sizeof(key));
    6535             :         } else {
    6536     3376998 :                 key.objectid = ino;
    6537     3376998 :                 key.type = BTRFS_INODE_ITEM_KEY;
    6538     3376998 :                 key.offset = 0;
    6539             :         }
    6540             : 
    6541     3377256 :         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
    6542         258 :                 ret = btrfs_add_root_ref(trans, key.objectid,
    6543             :                                          root->root_key.objectid, parent_ino,
    6544             :                                          index, name);
    6545     3376998 :         } else if (add_backref) {
    6546       63463 :                 ret = btrfs_insert_inode_ref(trans, root, name,
    6547             :                                              ino, parent_ino, index);
    6548             :         }
    6549             : 
    6550             :         /* Nothing to clean up yet */
    6551       63722 :         if (ret)
    6552             :                 return ret;
    6553             : 
    6554     3377257 :         ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
    6555             :                                     btrfs_inode_type(&inode->vfs_inode), index);
    6556     3377114 :         if (ret == -EEXIST || ret == -EOVERFLOW)
    6557           0 :                 goto fail_dir_item;
    6558     3377114 :         else if (ret) {
    6559           0 :                 btrfs_abort_transaction(trans, ret);
    6560           0 :                 return ret;
    6561             :         }
    6562             : 
    6563     3377114 :         btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
    6564     3377114 :                            name->len * 2);
    6565     3377114 :         inode_inc_iversion(&parent_inode->vfs_inode);
    6566             :         /*
    6567             :          * If we are replaying a log tree, we do not want to update the mtime
    6568             :          * and ctime of the parent directory with the current time, since the
    6569             :          * log replay procedure is responsible for setting them to their correct
    6570             :          * values (the ones it had when the fsync was done).
    6571             :          */
    6572     3377118 :         if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
    6573     3376871 :                 struct timespec64 now = current_time(&parent_inode->vfs_inode);
    6574             : 
    6575     3376716 :                 parent_inode->vfs_inode.i_mtime = now;
    6576     3376716 :                 parent_inode->vfs_inode.i_ctime = now;
    6577             :         }
    6578     3376963 :         ret = btrfs_update_inode(trans, root, parent_inode);
    6579     3377152 :         if (ret)
    6580           0 :                 btrfs_abort_transaction(trans, ret);
    6581             :         return ret;
    6582             : 
    6583             : fail_dir_item:
                               /* Undo the ref added above; local_index is an output that is not used here. */
    6584           0 :         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
    6585           0 :                 u64 local_index;
    6586           0 :                 int err;
    6587           0 :                 err = btrfs_del_root_ref(trans, key.objectid,
    6588             :                                          root->root_key.objectid, parent_ino,
    6589             :                                          &local_index, name);
    6590           0 :                 if (err)
    6591           0 :                         btrfs_abort_transaction(trans, err);
    6592           0 :         } else if (add_backref) {
    6593           0 :                 u64 local_index;
    6594           0 :                 int err;
    6595             : 
    6596           0 :                 err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
    6597             :                                           &local_index);
    6598           0 :                 if (err)
    6599           0 :                         btrfs_abort_transaction(trans, err);
    6600             :         }
    6601             : 
    6602             :         /* Return the original error code */
    6603             :         return ret;
    6604             : }
    6605             : 
                       /*
                        * Common tail of btrfs_create() and btrfs_mknod(): create the on-disk
                        * inode for an already allocated VFS @inode and attach it to @dentry in
                        * directory @dir.
                        *
                        * The sequence is btrfs_new_inode_prepare(), btrfs_start_transaction()
                        * with the item count computed by the prepare step,
                        * btrfs_create_new_inode() and, on success, d_instantiate_new().
                        *
                        * Returns 0 on success or an error. On any error the reference to @inode
                        * passed in by the caller is dropped with iput().
                        */
    6606     2779572 : static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
    6607             :                                struct inode *inode)
    6608             : {
    6609     2779572 :         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
    6610     2779572 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    6611     2779572 :         struct btrfs_new_inode_args new_inode_args = {
    6612             :                 .dir = dir,
    6613             :                 .dentry = dentry,
    6614             :                 .inode = inode,
    6615             :         };
    6616     2779572 :         unsigned int trans_num_items;
    6617     2779572 :         struct btrfs_trans_handle *trans;
    6618     2779572 :         int err;
    6619             : 
    6620     2779572 :         err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
    6621     2768949 :         if (err)
    6622           0 :                 goto out_inode;
    6623             : 
    6624     2768949 :         trans = btrfs_start_transaction(root, trans_num_items);
    6625     2781446 :         if (IS_ERR(trans)) {
    6626        1152 :                 err = PTR_ERR(trans);
    6627        1152 :                 goto out_new_inode_args;
    6628             :         }
    6629             : 
    6630     2780294 :         err = btrfs_create_new_inode(trans, &new_inode_args);
    6631     2780268 :         if (!err)
    6632     2780209 :                 d_instantiate_new(dentry, inode);
    6633             : 
                               /* The transaction is ended whether or not the creation succeeded. */
    6634     2780311 :         btrfs_end_transaction(trans);
    6635     2780275 :         btrfs_btree_balance_dirty(fs_info);
    6636     2780522 : out_new_inode_args:
    6637     2780522 :         btrfs_new_inode_args_destroy(&new_inode_args);
    6638     2780105 : out_inode:
    6639     2780105 :         if (err)
    6640        1148 :                 iput(inode);
    6641     2780095 :         return err;
    6642             : }
    6643             : 
                       /*
                        * Create a special inode (as set up by init_special_inode() from @mode
                        * and @rdev) named by @dentry in directory @dir.
                        *
                        * Allocates the VFS inode, initializes its owner and mode from @idmap,
                        * @dir and @mode, installs btrfs_special_inode_operations and leaves the
                        * rest to btrfs_create_common().
                        *
                        * Returns -ENOMEM if the inode cannot be allocated, otherwise the result
                        * of btrfs_create_common().
                        */
    6644       32393 : static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
    6645             :                        struct dentry *dentry, umode_t mode, dev_t rdev)
    6646             : {
    6647       32393 :         struct inode *inode;
    6648             : 
    6649       32393 :         inode = new_inode(dir->i_sb);
    6650       32401 :         if (!inode)
    6651             :                 return -ENOMEM;
    6652       32401 :         inode_init_owner(idmap, inode, dir, mode);
    6653       32401 :         inode->i_op = &btrfs_special_inode_operations;
    6654       32401 :         init_special_inode(inode, inode->i_mode, rdev);
    6655       32399 :         return btrfs_create_common(dir, dentry, inode);
    6656             : }
    6657             : 
                       /*
                        * Create a new file named by @dentry in directory @dir.
                        *
                        * Allocates the VFS inode, initializes its owner and mode from @idmap,
                        * @dir and @mode, installs the btrfs file, inode and address space
                        * operations and leaves the rest to btrfs_create_common(). The @excl
                        * argument is not used in this function.
                        *
                        * Returns -ENOMEM if the inode cannot be allocated, otherwise the result
                        * of btrfs_create_common().
                        */
    6658     2651894 : static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
    6659             :                         struct dentry *dentry, umode_t mode, bool excl)
    6660             : {
    6661     2651894 :         struct inode *inode;
    6662             : 
    6663     2651894 :         inode = new_inode(dir->i_sb);
    6664     2658787 :         if (!inode)
    6665             :                 return -ENOMEM;
    6666     2658787 :         inode_init_owner(idmap, inode, dir, mode);
    6667     2657393 :         inode->i_fop = &btrfs_file_operations;
    6668     2657393 :         inode->i_op = &btrfs_file_inode_operations;
    6669     2657393 :         inode->i_mapping->a_ops = &btrfs_aops;
    6670     2657393 :         return btrfs_create_common(dir, dentry, inode);
    6671             : }
    6672             : 
                       /*
                        * Create a hard link: give the inode behind @old_dentry the additional
                        * name @dentry in directory @dir.
                        *
                        * Fails with -EXDEV when @dir and the inode belong to roots with
                        * different objectids, and with -EMLINK when the inode already has
                        * BTRFS_LINK_MAX links. Otherwise a dir index is reserved in @dir, a
                        * transaction is started, the link count is incremented and the new name
                        * is added with btrfs_add_link() including the backref. If the resulting
                        * link count is 1 (inode was created with O_TMPFILE), its orphan item is
                        * deleted as well.
                        *
                        * Returns 0 on success or an error.
                        */
    6673       63578 : static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
    6674             :                       struct dentry *dentry)
    6675             : {
    6676       63578 :         struct btrfs_trans_handle *trans = NULL;
    6677       63578 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    6678       63578 :         struct inode *inode = d_inode(old_dentry);
    6679       63578 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    6680       63578 :         struct fscrypt_name fname;
    6681       63578 :         u64 index;
    6682       63578 :         int err;
    6683       63578 :         int drop_inode = 0;
    6684             : 
    6685             :         /* do not allow sys_link's with other subvols of the same device */
    6686       63578 :         if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
    6687             :                 return -EXDEV;
    6688             : 
    6689       63578 :         if (inode->i_nlink >= BTRFS_LINK_MAX)
    6690             :                 return -EMLINK;
    6691             : 
    6692       63578 :         err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
    6693       63577 :         if (err)
    6694           0 :                 goto fail;
    6695             : 
                               /*
                                * This consumes an index from the directory's counter; the 'fail' path
                                * below does not give it back.
                                */
    6696       63577 :         err = btrfs_set_inode_index(BTRFS_I(dir), &index);
    6697       63578 :         if (err)
    6698           0 :                 goto fail;
    6699             : 
    6700             :         /*
    6701             :          * 2 items for inode and inode ref
    6702             :          * 2 items for dir items
    6703             :          * 1 item for parent inode
    6704             :          * 1 item for orphan item deletion if O_TMPFILE
    6705             :          */
    6706       65818 :         trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
    6707       63579 :         if (IS_ERR(trans)) {
    6708         115 :                 err = PTR_ERR(trans);
                                       /* Reset so the 'fail' path does not end an error pointer. */
    6709         115 :                 trans = NULL;
    6710         115 :                 goto fail;
    6711             :         }
    6712             : 
    6713             :         /* There are several dir indexes for this inode, clear the cache. */
    6714       63464 :         BTRFS_I(inode)->dir_index = 0ULL;
    6715       63464 :         inc_nlink(inode);
    6716       63464 :         inode_inc_iversion(inode);
    6717       63464 :         inode->i_ctime = current_time(inode);
                               /* Extra reference for the d_instantiate() below; dropped again if drop_inode is set. */
    6718       63464 :         ihold(inode);
    6719       63464 :         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
    6720             : 
    6721       63464 :         err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
    6722             :                              &fname.disk_name, 1, index);
    6723             : 
    6724       63464 :         if (err) {
    6725             :                 drop_inode = 1;
    6726             :         } else {
    6727       63464 :                 struct dentry *parent = dentry->d_parent;
    6728             : 
                                       /*
                                        * NOTE(review): the two 'goto fail' below leave drop_inode at
                                        * 0, so the inc_nlink() and ihold() above are not undone in
                                        * this function on those paths - confirm this is intended.
                                        */
    6729       63464 :                 err = btrfs_update_inode(trans, root, BTRFS_I(inode));
    6730       63464 :                 if (err)
    6731           0 :                         goto fail;
    6732       63464 :                 if (inode->i_nlink == 1) {
    6733             :                         /*
    6734             :                          * If new hard link count is 1, it's a file created
    6735             :                          * with open(2) O_TMPFILE flag.
    6736             :                          */
    6737        2240 :                         err = btrfs_orphan_del(trans, BTRFS_I(inode));
    6738        2240 :                         if (err)
    6739           0 :                                 goto fail;
    6740             :                 }
    6741       63464 :                 d_instantiate(dentry, inode);
    6742       63464 :                 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
    6743             :         }
    6744             : 
                       /* Reached on success as well as on failure. */
    6745       63578 : fail:
    6746       63578 :         fscrypt_free_filename(&fname);
    6747       63578 :         if (trans)
    6748       63463 :                 btrfs_end_transaction(trans);
    6749       63579 :         if (drop_inode) {
    6750           0 :                 inode_dec_link_count(inode);
    6751           0 :                 iput(inode);
    6752             :         }
    6753       63579 :         btrfs_btree_balance_dirty(fs_info);
    6754       63579 :         return err;
    6755             : }
    6756             : 
    6757       90449 : static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
    6758             :                        struct dentry *dentry, umode_t mode)
    6759             : {
                               /*
                                * Directory creation entry point: allocate a new VFS inode on
                                * @dir's superblock, initialise it as a directory, install the
                                * btrfs directory inode/file operation tables and let
                                * btrfs_create_common() do the rest.
                                *
                                * Returns -ENOMEM when new_inode() fails, otherwise the return
                                * value of btrfs_create_common().
                                */
    6760       90449 :         struct inode *inode;
    6761             : 
    6762       90449 :         inode = new_inode(dir->i_sb);
    6763       90478 :         if (!inode)
    6764             :                 return -ENOMEM;
                               /* S_IFDIR is OR-ed into the caller-supplied mode. */
    6765       90478 :         inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
    6766       90475 :         inode->i_op = &btrfs_dir_inode_operations;
    6767       90475 :         inode->i_fop = &btrfs_dir_file_operations;
    6768       90475 :         return btrfs_create_common(dir, dentry, inode);
    6769             : }
    6770             : 
    6771          16 : static noinline int uncompress_inline(struct btrfs_path *path,
    6772             :                                       struct page *page,
    6773             :                                       struct btrfs_file_extent_item *item)
    6774             : {
                               /*
                                * Decompress the inline file extent @item, located in the leaf at
                                * path->nodes[0] / path->slots[0], into @page.
                                *
                                * Returns -ENOMEM if the temporary buffer cannot be allocated,
                                * otherwise the return value of btrfs_decompress().
                                */
    6775          16 :         int ret;
    6776          16 :         struct extent_buffer *leaf = path->nodes[0];
    6777          16 :         char *tmp;
    6778          16 :         size_t max_size;
    6779          16 :         unsigned long inline_size;
    6780          16 :         unsigned long ptr;
    6781          16 :         int compress_type;
    6782             : 
    6783          16 :         compress_type = btrfs_file_extent_compression(leaf, item);
    6784          16 :         max_size = btrfs_file_extent_ram_bytes(leaf, item);
    6785          16 :         inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
    6786          16 :         tmp = kmalloc(inline_size, GFP_NOFS);
    6787          16 :         if (!tmp)
    6788             :                 return -ENOMEM;
    6789          16 :         ptr = btrfs_file_extent_inline_start(item);
    6790             : 
                               /* Stage the inline payload from the leaf in the temporary buffer. */
    6791          16 :         read_extent_buffer(leaf, tmp, ptr, inline_size);
    6792             : 
                               /* Cap the decompressed size at one page. */
    6793          16 :         max_size = min_t(unsigned long, PAGE_SIZE, max_size);
    6794          16 :         ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
    6795             : 
    6796             :         /*
    6797             :          * decompression code contains a memset to fill in any space between the end
    6798             :          * of the uncompressed data and the end of max_size in case the decompressed
    6799             :          * data ends up shorter than ram_bytes.  That doesn't cover the hole between
    6800             :          * the end of an inline extent and the beginning of the next block, so we
    6801             :          * cover that region here.
    6802             :          */
    6803             : 
                               /*
                                * The tail is zeroed whether or not decompression succeeded; ret is
                                * passed through to the caller unchanged.
                                */
    6804          16 :         if (max_size < PAGE_SIZE)
    6805           9 :                 memzero_page(page, max_size, PAGE_SIZE - max_size);
    6806          16 :         kfree(tmp);
    6807          16 :         return ret;
    6808             : }
    6809             : 
    6810      235535 : static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
    6811             :                               struct page *page)
    6812             : {
                               /*
                                * Fill @page with the data of the inline file extent that @path
                                * points at (path->nodes[0] / path->slots[0]).
                                *
                                * Returns 0 without doing anything when @page is NULL or already
                                * uptodate, the result of uncompress_inline() for compressed
                                * extents, and 0 after a plain copy otherwise.
                                *
                                * @inode is not referenced in this function body.
                                */
    6813      235535 :         struct btrfs_file_extent_item *fi;
    6814      235535 :         void *kaddr;
    6815      235535 :         size_t copy_size;
    6816             : 
    6817      235535 :         if (!page || PageUptodate(page))
    6818      214241 :                 return 0;
    6819             : 
    6820       21294 :         ASSERT(page_offset(page) == 0);
    6821             : 
    6822       21294 :         fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
    6823             :                             struct btrfs_file_extent_item);
    6824       21293 :         if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
    6825          16 :                 return uncompress_inline(path, page, fi);
    6826             : 
                               /* Uncompressed: copy at most one page straight out of the leaf. */
    6827       21278 :         copy_size = min_t(u64, PAGE_SIZE,
    6828             :                           btrfs_file_extent_ram_bytes(path->nodes[0], fi));
    6829       21278 :         kaddr = kmap_local_page(page);
    6830       21278 :         read_extent_buffer(path->nodes[0], kaddr,
    6831             :                            btrfs_file_extent_inline_start(fi), copy_size);
    6832       21278 :         kunmap_local(kaddr);
                               /* Zero whatever part of the page the inline data did not cover. */
    6833       21278 :         if (copy_size < PAGE_SIZE)
    6834       21278 :                 memzero_page(page, copy_size, PAGE_SIZE - copy_size);
    6835             :         return 0;
    6836             : }
    6837             : 
    6838             : /*
    6839             :  * Lookup the first extent overlapping a range in a file.
    6840             :  *
    6841             :  * @inode:      file to search in
    6842             :  * @page:       page to read extent data into if the extent is inline
    6843             :  * @pg_offset:  offset into @page to copy to
    6844             :  * @start:      file offset
    6845             :  * @len:        length of range starting at @start
    6846             :  *
    6847             :  * Return the first &struct extent_map which overlaps the given range, reading
    6848             :  * it from the B-tree and caching it if necessary. Note that there may be more
    6849             :  * extents which overlap the given range after the returned extent_map.
    6850             :  *
    6851             :  * If @page is not NULL and the extent is inline, this also reads the extent
    6852             :  * data directly into the page and marks the extent up to date in the io_tree.
                        *
                        * NOTE(review): no io_tree update is visible in this function or in
                        * read_inline_extent(); confirm that the "marks the extent up to date"
                        * part of the sentence above still holds.
    6853             :  *
    6854             :  * Return: ERR_PTR on error, non-NULL extent_map on success.
    6855             :  */
    6856   105179336 : struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
    6857             :                                     struct page *page, size_t pg_offset,
    6858             :                                     u64 start, u64 len)
    6859             : {
    6860   105179336 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    6861   105179336 :         int ret = 0;
    6862   105179336 :         u64 extent_start = 0;
    6863   105179336 :         u64 extent_end = 0;
    6864   105179336 :         u64 objectid = btrfs_ino(inode);
    6865   105179336 :         int extent_type = -1;
    6866   105179336 :         struct btrfs_path *path = NULL;
    6867   105179336 :         struct btrfs_root *root = inode->root;
    6868   105179336 :         struct btrfs_file_extent_item *item;
    6869   105179336 :         struct extent_buffer *leaf;
    6870   105179336 :         struct btrfs_key found_key;
    6871   105179336 :         struct extent_map *em = NULL;
    6872   105179336 :         struct extent_map_tree *em_tree = &inode->extent_tree;
    6873             : 
                               /* First try the cached extent maps, under the tree's read lock. */
    6874   105179336 :         read_lock(&em_tree->lock);
    6875   105216444 :         em = lookup_extent_mapping(em_tree, start, len);
    6876   105211522 :         read_unlock(&em_tree->lock);
    6877             : 
                               /*
                                * A cached map is returned as is only if it contains @start. It is
                                * dropped when it does not, and also when it is inline and the
                                * caller supplied a page, so that the lookup below runs again.
                                */
    6878   105221437 :         if (em) {
    6879    97250042 :                 if (em->start > start || em->start + em->len <= start)
    6880       83190 :                         free_extent_map(em);
    6881    97166852 :                 else if (em->block_start == EXTENT_MAP_INLINE && page)
    6882          12 :                         free_extent_map(em);
    6883             :                 else
    6884    97166840 :                         goto out;
    6885             :         }
    6886     8052770 :         em = alloc_extent_map();
    6887     8047407 :         if (!em) {
    6888           0 :                 ret = -ENOMEM;
    6889           0 :                 goto out;
    6890             :         }
    6891     8047407 :         em->start = EXTENT_MAP_HOLE;
    6892     8047407 :         em->orig_start = EXTENT_MAP_HOLE;
    6893     8047407 :         em->len = (u64)-1;
    6894     8047407 :         em->block_len = (u64)-1;
    6895             : 
    6896     8047407 :         path = btrfs_alloc_path();
    6897     8048322 :         if (!path) {
    6898           0 :                 ret = -ENOMEM;
    6899           0 :                 goto out;
    6900             :         }
    6901             : 
    6902             :         /* Chances are we'll be called again, so go ahead and do readahead */
    6903     8048322 :         path->reada = READA_FORWARD;
    6904             : 
    6905             :         /*
    6906             :          * The same explanation in load_free_space_cache applies here as well,
    6907             :          * we only read when we're loading the free space cache, and at that
    6908             :          * point the commit_root has everything we need.
    6909             :          */
    6910    16096644 :         if (btrfs_is_free_space_inode(inode)) {
    6911           1 :                 path->search_commit_root = 1;
    6912           1 :                 path->skip_locking = 1;
    6913             :         }
    6914             : 
    6915     8048322 :         ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
    6916     8046613 :         if (ret < 0) {
    6917           0 :                 goto out;
    6918     8046613 :         } else if (ret > 0) {
                                       /*
                                        * Positive return (presumably: no exact key match): look at
                                        * the previous slot, or report a hole when there is none.
                                        */
    6919     6921609 :                 if (path->slots[0] == 0)
    6920           0 :                         goto not_found;
    6921     6921609 :                 path->slots[0]--;
    6922     6921609 :                 ret = 0;
    6923             :         }
    6924             : 
    6925     8046613 :         leaf = path->nodes[0];
    6926     8046613 :         item = btrfs_item_ptr(leaf, path->slots[0],
    6927             :                               struct btrfs_file_extent_item);
    6928     8037898 :         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    6929     8043432 :         if (found_key.objectid != objectid ||
    6930     8043395 :             found_key.type != BTRFS_EXTENT_DATA_KEY) {
    6931             :                 /*
    6932             :                  * If we backup past the first extent we want to move forward
    6933             :                  * and see if there is an extent in front of us, otherwise we'll
    6934             :                  * say there is a hole for our whole search range which can
    6935             :                  * cause problems.
    6936             :                  */
    6937     1872031 :                 extent_end = start;
    6938     1872031 :                 goto next;
    6939             :         }
    6940             : 
    6941     6171401 :         extent_type = btrfs_file_extent_type(leaf, item);
    6942     6171420 :         extent_start = found_key.offset;
    6943     6171420 :         extent_end = btrfs_file_extent_end(path);
    6944     6171361 :         if (extent_type == BTRFS_FILE_EXTENT_REG ||
    6945             :             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
    6946             :                 /* Only regular file could have regular/prealloc extent */
    6947     5935707 :                 if (!S_ISREG(inode->vfs_inode.i_mode)) {
    6948           0 :                         ret = -EUCLEAN;
    6949           0 :                         btrfs_crit(fs_info,
    6950             :                 "regular/prealloc extent found for non-regular inode %llu",
    6951             :                                    btrfs_ino(inode));
    6952           0 :                         goto out;
    6953             :                 }
    6954     5935707 :                 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
    6955             :                                                        extent_start);
    6956      235654 :         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    6957      235654 :                 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
    6958             :                                                       path->slots[0],
    6959             :                                                       extent_start);
    6960             :         }
    6961     8043354 : next:
                               /*
                                * The candidate item ends at or before @start: step to the next
                                * item (crossing into the next leaf if needed) and decide whether
                                * the range is a hole, partly a hole, or needs another iteration.
                                */
    6962     8055403 :         if (start >= extent_end) {
    6963     6881220 :                 path->slots[0]++;
    6964     6881220 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    6965      776725 :                         ret = btrfs_next_leaf(root, path);
    6966      776730 :                         if (ret < 0)
    6967           0 :                                 goto out;
    6968      776730 :                         else if (ret > 0)
    6969      707065 :                                 goto not_found;
    6970             : 
    6971       69665 :                         leaf = path->nodes[0];
    6972             :                 }
    6973     6174160 :                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    6974     6180879 :                 if (found_key.objectid != objectid ||
    6975     4482834 :                     found_key.type != BTRFS_EXTENT_DATA_KEY)
    6976     1698045 :                         goto not_found;
    6977     4482834 :                 if (start + len <= found_key.offset)
    6978     4447294 :                         goto not_found;
    6979       35540 :                 if (start > found_key.offset)
    6980       12049 :                         goto next;
    6981             : 
    6982             :                 /* New extent overlaps with existing one */
                                       /* Describe only the gap [start, found_key.offset) as a hole. */
    6983       23491 :                 em->start = start;
    6984       23491 :                 em->orig_start = start;
    6985       23491 :                 em->len = found_key.offset - start;
    6986       23491 :                 em->block_start = EXTENT_MAP_HOLE;
    6987       23491 :                 goto insert;
    6988             :         }
    6989             : 
                               /* The item covers @start: translate it into the extent map. */
    6990     1174183 :         btrfs_extent_item_to_extent_map(inode, path, item, em);
    6991             : 
    6992     1174218 :         if (extent_type == BTRFS_FILE_EXTENT_REG ||
    6993             :             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
    6994      938683 :                 goto insert;
    6995      235535 :         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
    6996             :                 /*
    6997             :                  * Inline extent can only exist at file offset 0. This is
    6998             :                  * ensured by tree-checker and inline extent creation path.
    6999             :                  * Thus all members representing file offsets should be zero.
    7000             :                  */
    7001      235535 :                 ASSERT(pg_offset == 0);
    7002      235535 :                 ASSERT(extent_start == 0);
    7003      235535 :                 ASSERT(em->start == 0);
    7004             : 
    7005             :                 /*
    7006             :                  * btrfs_extent_item_to_extent_map() should have properly
    7007             :                  * initialized em members already.
    7008             :                  *
    7009             :                  * Other members are not utilized for inline extents.
    7010             :                  */
    7011      235535 :                 ASSERT(em->block_start == EXTENT_MAP_INLINE);
    7012      235535 :                 ASSERT(em->len == fs_info->sectorsize);
    7013             : 
    7014      235535 :                 ret = read_inline_extent(inode, path, page);
    7015      235535 :                 if (ret < 0)
    7016           0 :                         goto out;
    7017      235535 :                 goto insert;
    7018             :         }
    7019           0 : not_found:
                               /* Nothing covers the range: describe [start, start + len) as a hole. */
    7020     6852404 :         em->start = start;
    7021     6852404 :         em->orig_start = start;
    7022     6852404 :         em->len = len;
    7023     6852404 :         em->block_start = EXTENT_MAP_HOLE;
    7024     8050113 : insert:
    7025     8050113 :         ret = 0;
    7026     8050113 :         btrfs_release_path(path);
                               /* The map must contain @start, otherwise fail with -EIO. */
    7027    16110420 :         if (em->start > start || extent_map_end(em) <= start) {
    7028          42 :                 btrfs_err(fs_info,
    7029             :                           "bad extent! em: [%llu %llu] passed [%llu %llu]",
    7030             :                           em->start, em->len, start, len);
    7031           0 :                 ret = -EIO;
    7032           0 :                 goto out;
    7033             :         }
    7034             : 
                               /* Cache the map; &em is passed by address, so em may be replaced. */
    7035     8055168 :         write_lock(&em_tree->lock);
    7036     8055035 :         ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
    7037     8053356 :         write_unlock(&em_tree->lock);
    7038   105217947 : out:
    7039   105217947 :         btrfs_free_path(path);
    7040             : 
    7041   105175349 :         trace_btrfs_get_extent(root, inode, em);
    7042             : 
                               /* On any error the map is released and the error is encoded. */
    7043   105156858 :         if (ret) {
    7044           0 :                 free_extent_map(em);
    7045           0 :                 return ERR_PTR(ret);
    7046             :         }
    7047   105156858 :         return em;
    7048             : }
    7049             : 
    7050      856929 : static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
    7051             :                                                   struct btrfs_dio_data *dio_data,
    7052             :                                                   const u64 start,
    7053             :                                                   const u64 len,
    7054             :                                                   const u64 orig_start,
    7055             :                                                   const u64 block_start,
    7056             :                                                   const u64 block_len,
    7057             :                                                   const u64 orig_block_len,
    7058             :                                                   const u64 ram_bytes,
    7059             :                                                   const int type)
    7060             : {
                               /*
                                * Set up the bookkeeping for a direct I/O write: an I/O extent map
                                * (skipped for BTRFS_ORDERED_NOCOW) plus an ordered extent, which
                                * is stored in @dio_data->ordered on success.
                                *
                                * Returns the extent map from create_io_em(), NULL for a
                                * successful NOCOW call, or an ERR_PTR from create_io_em() or
                                * btrfs_alloc_ordered_extent().
                                */
    7061      856929 :         struct extent_map *em = NULL;
    7062      856929 :         struct btrfs_ordered_extent *ordered;
    7063             : 
    7064      856929 :         if (type != BTRFS_ORDERED_NOCOW) {
    7065      856928 :                 em = create_io_em(inode, start, len, orig_start, block_start,
    7066             :                                   block_len, orig_block_len, ram_bytes,
    7067             :                                   BTRFS_COMPRESS_NONE, /* compress_type */
    7068             :                                   type);
    7069      856938 :                 if (IS_ERR(em))
    7070           0 :                         goto out;
    7071             :         }
    7072      856939 :         ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
    7073             :                                              block_start, block_len, 0,
    7074      856939 :                                              (1 << type) |
    7075             :                                              (1 << BTRFS_ORDERED_DIRECT),
    7076             :                                              BTRFS_COMPRESS_NONE);
    7077      857045 :         if (IS_ERR(ordered)) {
                                       /* Undo the extent map created above before reporting the error. */
    7078           0 :                 if (em) {
    7079           0 :                         free_extent_map(em);
    7080           0 :                         btrfs_drop_extent_map_range(inode, start,
    7081           0 :                                                     start + len - 1, false);
    7082             :                 }
    7083             :                 em = ERR_CAST(ordered);
    7084             :         } else {
    7085      857045 :                 ASSERT(!dio_data->ordered);
    7086      857045 :                 dio_data->ordered = ordered;
    7087             :         }
    7088      857045 :  out:
    7089             : 
    7090      857045 :         return em;
    7091             : }
    7092             : 
    7093      782128 : static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
    7094             :                                                   struct btrfs_dio_data *dio_data,
    7095             :                                                   u64 start, u64 len)
    7096             : {
                               /*
                                * Reserve a new extent for a direct I/O write at @start and create
                                * the matching extent map / ordered extent through
                                * btrfs_create_dio_extent() (type BTRFS_ORDERED_REGULAR).
                                *
                                * Returns the result of btrfs_create_dio_extent(), or ERR_PTR(ret)
                                * when btrfs_reserve_extent() fails.
                                */
    7097      782128 :         struct btrfs_root *root = inode->root;
    7098      782128 :         struct btrfs_fs_info *fs_info = root->fs_info;
    7099      782128 :         struct extent_map *em;
    7100      782128 :         struct btrfs_key ins;
    7101      782128 :         u64 alloc_hint;
    7102      782128 :         int ret;
    7103             : 
    7104      782128 :         alloc_hint = get_extent_allocation_hint(inode, start, len);
    7105      782146 :         ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
    7106             :                                    0, alloc_hint, &ins, 1, 1);
    7107      782063 :         if (ret)
    7108           0 :                 return ERR_PTR(ret);
    7109             : 
                               /*
                                * ins.objectid is used as the block start and ins.offset as every
                                * length argument of the new extent.
                                */
    7110      782063 :         em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
    7111             :                                      ins.objectid, ins.offset, ins.offset,
    7112             :                                      ins.offset, BTRFS_ORDERED_REGULAR);
                               /* Called on both the success and the failure path. */
    7113      782169 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
                               /* On failure give the reserved extent back. */
    7114      782146 :         if (IS_ERR(em))
    7115           0 :                 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
    7116             :                                            1);
    7117             : 
    7118             :         return em;
    7119             : }
    7120             : 
    7121       80803 : static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
    7122             : {
                               /*
                                * Report whether @bytenr must be treated as read-only: true when
                                * no block group is found for it or when the block group's ro
                                * field is set, false otherwise.
                                */
    7123       80803 :         struct btrfs_block_group *block_group;
    7124       80803 :         bool readonly = false;
    7125             : 
    7126       80803 :         block_group = btrfs_lookup_block_group(fs_info, bytenr);
    7127       80804 :         if (!block_group || block_group->ro)
    7128           0 :                 readonly = true;
                               /* Release the block group obtained by the lookup, if there was one. */
    7129       80804 :         if (block_group)
    7130       80804 :                 btrfs_put_block_group(block_group);
    7131       80804 :         return readonly;
    7132             : }
    7133             : 
    7134             : /*
    7135             :  * Check if we can do nocow write into the range [@offset, @offset + @len)
    7136             :  *
    7137             :  * @offset:     File offset
    7138             :  * @len:        The length to write, will be updated to the nocow writeable
    7139             :  *              range
    7140             :  * @orig_start: (optional) Return the original file offset of the file extent
    7141             :  * @orig_block_len: (optional) Return the original on-disk length of the file extent
    7142             :  * @ram_bytes:  (optional) Return the ram_bytes of the file extent
                        * @nowait:     copied into the search path's nowait flag before the file
                        *              extent lookup
    7143             :  * @strict:     if true, omit optimizations that might force us into unnecessary
    7144             :  *              cow. e.g., don't trust generation number.
    7145             :  *
    7146             :  * Return:
    7147             :  * >0        and update @len if we can do nocow write
    7148             :  *  0   if we can't do nocow write
    7149             :  * <0        if error happened
                        *      (-EAGAIN when test_range_bit() reports EXTENT_DELALLOC inside a
                        *      prealloc extent of an inode without BTRFS_INODE_NODATACOW)
    7150             :  *
    7151             :  * NOTE: This only checks the file extents, caller is responsible to wait for
    7152             :  *       any ordered extents.
    7153             :  */
    7154      102008 : noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
    7155             :                               u64 *orig_start, u64 *orig_block_len,
    7156             :                               u64 *ram_bytes, bool nowait, bool strict)
    7157             : {
    7158      102008 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    7159      102008 :         struct can_nocow_file_extent_args nocow_args = { 0 };
    7160      102008 :         struct btrfs_path *path;
    7161      102008 :         int ret;
    7162      102008 :         struct extent_buffer *leaf;
    7163      102008 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    7164      102008 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
    7165      102008 :         struct btrfs_file_extent_item *fi;
    7166      102008 :         struct btrfs_key key;
    7167      102008 :         int found_type;
    7168             : 
    7169      102008 :         path = btrfs_alloc_path();
    7170      102010 :         if (!path)
    7171             :                 return -ENOMEM;
    7172      102010 :         path->nowait = nowait;
    7173             : 
    7174      102010 :         ret = btrfs_lookup_file_extent(NULL, root, path,
    7175             :                         btrfs_ino(BTRFS_I(inode)), offset, 0);
    7176      102009 :         if (ret < 0)
    7177           0 :                 goto out;
    7178             : 
    7179      102009 :         if (ret == 1) {
    7180       53771 :                 if (path->slots[0] == 0) {
    7181             :                         /* can't find the item, must cow */
    7182           0 :                         ret = 0;
    7183           0 :                         goto out;
    7184             :                 }
    7185       53771 :                 path->slots[0]--;
    7186             :         }
                               /* From here on every "goto out" before the final checks means "must cow". */
    7187      102009 :         ret = 0;
    7188      102009 :         leaf = path->nodes[0];
    7189      102009 :         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    7190      102008 :         if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
    7191      102010 :             key.type != BTRFS_EXTENT_DATA_KEY) {
    7192             :                 /* not our file or wrong item type, must cow */
    7193         150 :                 goto out;
    7194             :         }
    7195             : 
    7196      101858 :         if (key.offset > offset) {
    7197             :                 /* Wrong offset, must cow */
    7198           0 :                 goto out;
    7199             :         }
    7200             : 
                               /* The extent ends at or before @offset, must cow. */
    7201      101858 :         if (btrfs_file_extent_end(path) <= offset)
    7202        1371 :                 goto out;
    7203             : 
    7204      100488 :         fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
    7205      100488 :         found_type = btrfs_file_extent_type(leaf, fi);
    7206      100488 :         if (ram_bytes)
    7207       94023 :                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
    7208             : 
    7209      100489 :         nocow_args.start = offset;
    7210      100489 :         nocow_args.end = offset + *len - 1;
    7211      100489 :         nocow_args.strict = strict;
    7212      100489 :         nocow_args.free_path = true;
    7213             : 
    7214      100489 :         ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
    7215             :         /* can_nocow_file_extent() has freed the path. */
    7216      100487 :         path = NULL;
    7217             : 
    7218      100487 :         if (ret != 1) {
    7219             :                 /* Treat errors as not being able to NOCOW. */
    7220       19684 :                 ret = 0;
    7221       19684 :                 goto out;
    7222             :         }
    7223             : 
    7224       80803 :         ret = 0;
                               /* A missing or read-only block group also means "must cow". */
    7225       80803 :         if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
    7226           0 :                 goto out;
    7227             : 
    7228       80804 :         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
    7229             :             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
    7230       80239 :                 u64 range_end;
    7231             : 
    7232       80239 :                 range_end = round_up(offset + nocow_args.num_bytes,
    7233             :                                      root->fs_info->sectorsize) - 1;
    7234       80239 :                 ret = test_range_bit(io_tree, offset, range_end,
    7235             :                                      EXTENT_DELALLOC, 0, NULL);
    7236       80239 :                 if (ret) {
    7237           3 :                         ret = -EAGAIN;
    7238           3 :                         goto out;
    7239             :                 }
    7240             :         }
    7241             : 
                               /* NOCOW is possible: fill in the optional outputs and shrink @len. */
    7242       80801 :         if (orig_start)
    7243       74892 :                 *orig_start = key.offset - nocow_args.extent_offset;
    7244       80801 :         if (orig_block_len)
    7245       74892 :                 *orig_block_len = nocow_args.disk_num_bytes;
    7246             : 
    7247       80801 :         *len = nocow_args.num_bytes;
    7248       80801 :         ret = 1;
    7249      102009 : out:
    7250      102009 :         btrfs_free_path(path);
    7251      102009 :         return ret;
    7252             : }
    7253             : /* Lock the range for direct IO once it has no ordered extents (and no cached pages, for writes). Returns 0 with the range locked; on -EAGAIN/-ENOTBLK it is left unlocked. */
    7254     2163194 : static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
    7255             :                               struct extent_state **cached_state,
    7256             :                               unsigned int iomap_flags)
    7257             : {
    7258     2163194 :         const bool writing = (iomap_flags & IOMAP_WRITE);
    7259     2163194 :         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
    7260     2163194 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
    7261     2163194 :         struct btrfs_ordered_extent *ordered;
    7262     2163194 :         int ret = 0;
    7263             : 
    7264        9212 :         while (1) {
    7265     2172334 :                 if (nowait) {
    7266           3 :                         if (!try_lock_extent(io_tree, lockstart, lockend,
    7267             :                                              cached_state))
    7268             :                                 return -EAGAIN;
    7269             :                 } else {
    7270     2172331 :                         lock_extent(io_tree, lockstart, lockend, cached_state);
    7271             :                 }
    7272             :                 /*
    7273             :                  * We're concerned with the entire range that we're going to be
    7274             :                  * doing DIO to, so we need to make sure there are no ordered
    7275             :                  * extents in this range.
    7276             :                  */
    7277     2173831 :                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
    7278     2173831 :                                                      lockend - lockstart + 1);
    7279             : 
    7280             :                 /*
    7281             :                  * We need to make sure there are no buffered pages in this
    7282             :                  * range either, we could have raced between the invalidate in
    7283             :                  * generic_file_direct_write and locking the extent.  The
    7284             :                  * invalidate needs to happen so that reads after a write do not
    7285             :                  * get stale data.
    7286             :                  */
    7287     2173761 :                 if (!ordered &&
    7288      869770 :                     (!writing || !filemap_range_has_page(inode->i_mapping,
    7289             :                                                          lockstart, lockend)))
    7290             :                         break;
    7291             : 
    7292      693440 :                 unlock_extent(io_tree, lockstart, lockend, cached_state);
    7293             : 
    7294      692892 :                 if (ordered) {
    7295      692739 :                         if (nowait) {
    7296           0 :                                 btrfs_put_ordered_extent(ordered);
    7297           0 :                                 ret = -EAGAIN;
    7298           0 :                                 break;
    7299             :                         }
    7300             :                         /*
    7301             :                          * If we are doing a DIO read and the ordered extent we
    7302             :                          * found is for a buffered write, we can not wait for it
    7303             :                          * to complete and retry, because if we do so we can
    7304             :                          * deadlock with concurrent buffered writes on page
    7305             :                          * locks. This happens only if our DIO read covers more
    7306             :                          * than one extent map, if at this point it has already
    7307             :                          * created an ordered extent for a previous extent map
    7308             :                          * and locked its range in the inode's io tree, and a
    7309             :                          * concurrent write against that previous extent map's
    7310             :                          * range and this range started (we unlock the ranges
    7311             :                          * in the io tree only when the bios complete and
    7312             :                          * buffered writes always lock pages before attempting
    7313             :                          * to lock range in the io tree).
    7314             :                          */
    7315     1383926 :                         if (writing ||
    7316      691187 :                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
    7317        9213 :                                 btrfs_start_ordered_extent(ordered);
    7318             :                         else
    7319             :                                 ret = nowait ? -EAGAIN : -ENOTBLK; /* nowait is false here (handled above) */
    7320      692738 :                         btrfs_put_ordered_extent(ordered);
    7321             :                 } else {
    7322             :                         /*
    7323             :                          * We could trigger writeback for this range (and wait
    7324             :                          * for it to complete) and then invalidate the pages for
    7325             :                          * this range (through invalidate_inode_pages2_range()),
    7326             :                          * but that can lead us to a deadlock with a concurrent
    7327             :                          * call to readahead (a buffered read or a defrag call
    7328             :                          * triggered a readahead) on a page lock due to an
    7329             :                          * ordered dio extent we created before but that does not yet
    7330             :                          * have a corresponding bio submitted (hence it can not
    7331             :                          * complete), which makes readahead wait for that
    7332             :                          * ordered extent to complete while holding a lock on
    7333             :                          * that page.
    7334             :                          */
    7335         153 :                         ret = nowait ? -EAGAIN : -ENOTBLK;
    7336             :                 }
    7337             : 
    7338      692890 :                 if (ret)
    7339             :                         break;
    7340             : 
    7341        9211 :                 cond_resched();
    7342             :         }
    7343             : 
    7344             :         return ret;
    7345             : }
    7346             : 
    7347             : /* Allocate a pinned extent map for the given range and hand it to btrfs_replace_extent_map_range(); returns the em or an ERR_PTR(). The callers of this must take lock_extent(). */
    7348     3501028 : static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
    7349             :                                        u64 len, u64 orig_start, u64 block_start,
    7350             :                                        u64 block_len, u64 orig_block_len,
    7351             :                                        u64 ram_bytes, int compress_type,
    7352             :                                        int type)
    7353             : {
    7354     3501028 :         struct extent_map *em;
    7355     3501028 :         int ret;
    7356             : 
    7357     3501028 :         ASSERT(type == BTRFS_ORDERED_PREALLOC ||
    7358             :                type == BTRFS_ORDERED_COMPRESSED ||
    7359             :                type == BTRFS_ORDERED_NOCOW ||
    7360             :                type == BTRFS_ORDERED_REGULAR);
    7361             : 
    7362     3501028 :         em = alloc_extent_map();
    7363     3501059 :         if (!em)
    7364             :                 return ERR_PTR(-ENOMEM);
    7365             : 
    7366     3501059 :         em->start = start;
    7367     3501059 :         em->orig_start = orig_start;
    7368     3501059 :         em->len = len;
    7369     3501059 :         em->block_len = block_len;
    7370     3501059 :         em->block_start = block_start;
    7371     3501059 :         em->orig_block_len = orig_block_len;
    7372     3501059 :         em->ram_bytes = ram_bytes;
    7373     3501059 :         em->generation = -1;
    7374     3501059 :         set_bit(EXTENT_FLAG_PINNED, &em->flags);
    7375     3501264 :         if (type == BTRFS_ORDERED_PREALLOC) {
    7376      291247 :                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
    7377     3210017 :         } else if (type == BTRFS_ORDERED_COMPRESSED) {
    7378      158282 :                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
    7379      158282 :                 em->compress_type = compress_type;
    7380             :         }
    7381             : 
    7382     3501264 :         ret = btrfs_replace_extent_map_range(inode, em, true);
    7383     3501147 :         if (ret) {
    7384           0 :                 free_extent_map(em);
    7385           0 :                 return ERR_PTR(ret);
    7386             :         }
    7387             : 
    7388             :         /* em has 2 refs now, callers need to call free_extent_map() once. */
    7389             :         return em;
    7390             : }
    7391             : 
    7392             : /* Prepare the extent for a direct IO write at start: NOCOW into the existing extent when can_nocow_extent() allows it, otherwise allocate a new extent. Updates *map and *lenp; returns 0 or a negative errno. */
    7393      869736 : static int btrfs_get_blocks_direct_write(struct extent_map **map,
    7394             :                                          struct inode *inode,
    7395             :                                          struct btrfs_dio_data *dio_data,
    7396             :                                          u64 start, u64 *lenp,
    7397             :                                          unsigned int iomap_flags)
    7398             : {
    7399      869736 :         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
    7400      869736 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    7401      869736 :         struct extent_map *em = *map;
    7402      869736 :         int type;
    7403      869736 :         u64 block_start, orig_start, orig_block_len, ram_bytes;
    7404      869736 :         struct btrfs_block_group *bg;
    7405      869736 :         bool can_nocow = false;
    7406      869736 :         bool space_reserved = false;
    7407      869736 :         u64 len = *lenp;
    7408      869736 :         u64 prev_len;
    7409      869736 :         int ret = 0;
    7410             : 
    7411             :         /*
    7412             :          * We don't allocate a new extent in the following cases
    7413             :          *
    7414             :          * 1) The inode is marked as NODATACOW. In this case we'll just use the
    7415             :          * existing extent.
    7416             :          * 2) The extent is marked as PREALLOC. We're good to go here and can
    7417             :          * just use the extent.
    7418             :          *
    7419             :          */
    7420      869736 :         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
    7421      775713 :             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
    7422           4 :              em->block_start != EXTENT_MAP_HOLE)) {
    7423      188048 :                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
    7424             :                         type = BTRFS_ORDERED_PREALLOC;
    7425             :                 else
    7426           1 :                         type = BTRFS_ORDERED_NOCOW;
    7427       94024 :                 len = min(len, em->len - (start - em->start));
    7428       94024 :                 block_start = em->block_start + (start - em->start);
    7429             : 
    7430       94024 :                 if (can_nocow_extent(inode, start, &len, &orig_start,
    7431             :                                      &orig_block_len, &ram_bytes, false, false) == 1) {
    7432       74892 :                         bg = btrfs_inc_nocow_writers(fs_info, block_start);
    7433       74892 :                         if (bg)
    7434       74892 :                                 can_nocow = true;
    7435             :                 }
    7436             :         }
    7437             : 
    7438      869718 :         prev_len = len; /* remember the length we reserve for; len may shrink below */
    7439      869718 :         if (can_nocow) {
    7440       74892 :                 struct extent_map *em2;
    7441             : 
    7442             :                 /* We can NOCOW, so only need to reserve metadata space. */
    7443       74892 :                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
    7444             :                                                       nowait);
    7445       74892 :                 if (ret < 0) {
    7446             :                         /* Our caller expects us to free the input extent map. */
    7447          17 :                         free_extent_map(em);
    7448          17 :                         *map = NULL;
    7449          17 :                         btrfs_dec_nocow_writers(bg);
    7450          17 :                         if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
    7451           0 :                                 ret = -EAGAIN;
    7452          17 :                         goto out;
    7453             :                 }
    7454       74875 :                 space_reserved = true;
    7455             : 
    7456       74875 :                 em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
    7457             :                                               orig_start, block_start,
    7458             :                                               len, orig_block_len,
    7459             :                                               ram_bytes, type);
    7460       74874 :                 btrfs_dec_nocow_writers(bg);
    7461       74875 :                 if (type == BTRFS_ORDERED_PREALLOC) {
    7462       74874 :                         free_extent_map(em);
    7463       74874 :                         *map = em2;
    7464       74874 :                         em = em2;
    7465             :                 }
    7466             : 
    7467       74875 :                 if (IS_ERR(em2)) {
    7468           0 :                         ret = PTR_ERR(em2);
    7469           0 :                         goto out;
    7470             :                 }
    7471             : 
    7472       74875 :                 dio_data->nocow_done = true;
    7473             :         } else {
    7474             :                 /* Our caller expects us to free the input extent map. */
    7475      794826 :                 free_extent_map(em);
    7476      795095 :                 *map = NULL;
    7477             : 
    7478      795095 :                 if (nowait) {
    7479           0 :                         ret = -EAGAIN;
    7480           0 :                         goto out;
    7481             :                 }
    7482             : 
    7483             :                 /*
    7484             :                  * If we could not allocate data space before locking the file
    7485             :                  * range and we can't do a NOCOW write, then we have to fail.
    7486             :                  */
    7487      795095 :                 if (!dio_data->data_space_reserved) {
    7488       12915 :                         ret = -ENOSPC;
    7489       12915 :                         goto out;
    7490             :                 }
    7491             : 
    7492             :                 /*
    7493             :                  * We have to COW and we have already reserved data space before,
    7494             :                  * so now we reserve only metadata.
    7495             :                  */
    7496      782180 :                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
    7497             :                                                       false);
    7498      782235 :                 if (ret < 0)
    7499          78 :                         goto out;
    7500      782157 :                 space_reserved = true;
    7501             : 
    7502      782157 :                 em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
    7503      782146 :                 if (IS_ERR(em)) {
    7504           0 :                         ret = PTR_ERR(em);
    7505           0 :                         goto out;
    7506             :                 }
    7507      782146 :                 *map = em;
    7508      782146 :                 len = min(len, em->len - (start - em->start));
    7509      782146 :                 if (len < prev_len)
    7510         418 :                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
    7511             :                                                         prev_len - len, true);
    7512             :         }
    7513             : 
    7514             :         /*
    7515             :          * We have created our ordered extent, so we can now release our reservation
    7516             :          * for an outstanding extent.
    7517             :          */
    7518      857021 :         btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
    7519             : 
    7520             :         /*
    7521             :          * Need to update the i_size under the extent lock so buffered
    7522             :          * readers will get the updated i_size when we unlock.
    7523             :          */
    7524      856872 :         if (start + len > i_size_read(inode))
    7525      197223 :                 i_size_write(inode, start + len);
    7526      659649 : out:
    7527      869882 :         if (ret && space_reserved) {
    7528           0 :                 btrfs_delalloc_release_extents(BTRFS_I(inode), len);
    7529           0 :                 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
    7530             :         }
    7531      869882 :         *lenp = len;
    7532      869882 :         return ret;
    7533             : }
    7534             : 
    7535     2172660 : static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
    7536             :                 loff_t length, unsigned int flags, struct iomap *iomap,
    7537             :                 struct iomap *srcmap)
    7538             : {
    7539     2172660 :         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
    7540     2172660 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    7541     2172660 :         struct extent_map *em;
    7542     2172660 :         struct extent_state *cached_state = NULL;
    7543     2172660 :         struct btrfs_dio_data *dio_data = iter->private;
    7544     2172660 :         u64 lockstart, lockend;
    7545     2172660 :         const bool write = !!(flags & IOMAP_WRITE);
    7546     2172660 :         int ret = 0;
    7547     2172660 :         u64 len = length;
    7548     2172660 :         const u64 data_alloc_len = length;
    7549     2172660 :         bool unlock_extents = false;
    7550             : 
    7551             :         /*
    7552             :          * We could potentially fault if we have a buffer > PAGE_SIZE, and if
    7553             :          * we're NOWAIT we may submit a bio for a partial range and return
    7554             :          * EIOCBQUEUED, which would result in an errant short read.
    7555             :          *
    7556             :          * The best way to handle this would be to allow for partial completions
    7557             :          * of iocb's, so we could submit the partial bio, return and fault in
    7558             :          * the rest of the pages, and then submit the io for the rest of the
    7559             :          * range.  However we don't have that currently, so simply return
    7560             :          * -EAGAIN at this point so that the normal path is used.
    7561             :          */
    7562     2172660 :         if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
    7563             :                 return -EAGAIN;
    7564             : 
    7565             :         /*
    7566             :          * Cap the size of reads to that usually seen in buffered I/O as we need
    7567             :          * to allocate a contiguous array for the checksums.
    7568             :          */
    7569     2172660 :         if (!write)
    7570     1293990 :                 len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
    7571             : 
    7572     2172660 :         lockstart = start;
    7573     2172660 :         lockend = start + len - 1;
    7574             : 
    7575             :         /*
    7576             :          * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
    7577             :          * enough if we've written compressed pages to this area, so we need to
    7578             :          * flush the dirty pages again to make absolutely sure that any
    7579             :          * outstanding dirty pages are on disk - the first flush only starts
    7580             :          * compression on the data, while keeping the pages locked, so by the
    7581             :          * time the second flush returns we know bios for the compressed pages
    7582             :          * were submitted and finished, and the pages no longer under writeback.
    7583             :          *
    7584             :          * If we have a NOWAIT request and we have any pages in the range that
    7585             :          * are locked, likely due to compression still in progress, we don't want
    7586             :          * to block on page locks. We also don't want to block on pages marked as
    7587             :          * dirty or under writeback (same as for the non-compression case).
    7588             :          * iomap_dio_rw() did the same check, but after that and before we got
    7589             :          * here, mmap'ed writes may have happened or buffered reads started
    7590             :          * (readpage() and readahead(), which lock pages), as we haven't locked
    7591             :          * the file range yet.
    7592             :          */
    7593     2172660 :         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
    7594             :                      &BTRFS_I(inode)->runtime_flags)) {
    7595         275 :                 if (flags & IOMAP_NOWAIT) {
    7596           0 :                         if (filemap_range_needs_writeback(inode->i_mapping,
    7597             :                                                           lockstart, lockend))
    7598             :                                 return -EAGAIN;
    7599             :                 } else {
    7600         275 :                         ret = filemap_fdatawrite_range(inode->i_mapping, start,
    7601         275 :                                                        start + length - 1);
    7602         275 :                         if (ret)
    7603             :                                 return ret;
    7604             :                 }
    7605             :         }
    7606             : 
    7607     2172660 :         memset(dio_data, 0, sizeof(*dio_data));
    7608             : 
    7609             :         /*
    7610             :          * We always try to allocate data space and must do it before locking
    7611             :          * the file range, to avoid deadlocks with concurrent writes to the same
    7612             :          * range if the range has several extents and the writes don't expand the
    7613             :          * current i_size (the inode lock is taken in shared mode). If we fail to
    7614             :          * allocate data space here we continue and later, after locking the
    7615             :          * file range, we fail with ENOSPC only if we figure out we can not do a
    7616             :          * NOCOW write.
    7617             :          */
    7618     2172660 :         if (write && !(flags & IOMAP_NOWAIT)) {
    7619      879223 :                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
    7620             :                                                   &dio_data->data_reserved,
    7621             :                                                   start, data_alloc_len, false);
    7622      879690 :                 if (!ret)
    7623      856151 :                         dio_data->data_space_reserved = true;
    7624       23539 :                 else if (ret && !(BTRFS_I(inode)->flags &
    7625             :                                   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
    7626        9716 :                         goto err;
    7627             :         }
    7628             : 
    7629             :         /*
    7630             :          * If this errors out it's because we couldn't invalidate pagecache for
    7631             :          * this range and we need to fallback to buffered IO, or we are doing a
    7632             :          * NOWAIT read/write and we need to block.
    7633             :          */
    7634     2163411 :         ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
    7635     2164108 :         if (ret < 0)
    7636      683679 :                 goto err;
    7637             : 
    7638     1480429 :         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
    7639     1481256 :         if (IS_ERR(em)) {
    7640           0 :                 ret = PTR_ERR(em);
    7641           0 :                 goto unlock_err;
    7642             :         }
    7643             : 
    7644             :         /*
    7645             :          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
    7646             :          * io.  INLINE is special, and we could probably kludge it in here, but
    7647             :          * it's still buffered so for safety let's just fall back to the generic
    7648             :          * buffered path.
    7649             :          *
    7650             :          * For COMPRESSED we _have_ to read the entire extent in so we can
    7651             :          * decompress it, so there will be buffering required no matter what we
    7652             :          * do, so go ahead and fallback to buffered.
    7653             :          *
    7654             :          * We return -ENOTBLK because that's what makes DIO go ahead and go back
    7655             :          * to buffered IO.  Don't blame me, this is the price we pay for using
    7656             :          * the generic code.
    7657             :          */
    7658     1481256 :         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
    7659     1481116 :             em->block_start == EXTENT_MAP_INLINE) {
    7660         202 :                 free_extent_map(em);
    7661             :                 /*
    7662             :                  * If we are in a NOWAIT context, return -EAGAIN in order to
    7663             :                  * fallback to buffered IO. This is not only because we can
    7664             :                  * block with buffered IO (no support for NOWAIT semantics at
    7665             :                  * the moment) but also to avoid returning short reads to user
    7666             :                  * space - this happens if we were able to read some data from
    7667             :                  * previous non-compressed extents and then when we fallback to
    7668             :                  * buffered IO, at btrfs_file_read_iter() by calling
    7669             :                  * filemap_read(), we fail to fault in pages for the read buffer,
    7670             :                  * in which case filemap_read() returns a short read (the number
    7671             :                  * of bytes previously read is > 0, so it does not return -EFAULT).
    7672             :                  */
    7673         202 :                 ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
    7674         202 :                 goto unlock_err;
    7675             :         }
    7676             : 
    7677     1481054 :         len = min(len, em->len - (start - em->start));
    7678             : 
    7679             :         /*
    7680             :          * If we have a NOWAIT request and the range contains multiple extents
    7681             :          * (or a mix of extents and holes), then we return -EAGAIN to make the
    7682             :          * caller fallback to a context where it can do a blocking (without
    7683             :          * NOWAIT) request. This way we avoid doing partial IO and returning
    7684             :          * success to the caller, which is not optimal for writes and for reads
    7685             :          * it can result in unexpected behaviour for an application.
    7686             :          *
    7687             :          * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
    7688             :          * iomap_dio_rw(), we can end up returning less data than what the caller
    7689             :          * asked for, resulting in an unexpected, and incorrect, short read.
    7690             :          * That is, the caller asked to read N bytes and we return less than that,
    7691             :          * which is wrong unless we are crossing EOF. This happens if we get a
    7692             :          * page fault error when trying to fault in pages for the buffer that is
    7693             :          * associated to the struct iov_iter passed to iomap_dio_rw(), and we
    7694             :          * have previously submitted bios for other extents in the range, in
    7695             :          * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
    7696             :          * those bios have completed by the time we get the page fault error,
    7697             :          * which we return back to our caller - we should only return EIOCBQUEUED
    7698             :          * after we have submitted bios for all the extents in the range.
    7699             :          */
    7700     1481054 :         if ((flags & IOMAP_NOWAIT) && len < length) {
    7701           2 :                 free_extent_map(em);
    7702           2 :                 ret = -EAGAIN;
    7703           2 :                 goto unlock_err;
    7704             :         }
    7705             : 
    7706     1481052 :         if (write) {
    7707      869707 :                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
    7708             :                                                     start, &len, flags);
    7709      869758 :                 if (ret < 0)
    7710       12989 :                         goto unlock_err;
    7711      856769 :                 unlock_extents = true;
    7712             :                 /* Recalc len in case the new em is smaller than requested */
    7713      856769 :                 len = min(len, em->len - (start - em->start));
    7714      856769 :                 if (dio_data->data_space_reserved) {
    7715      855842 :                         u64 release_offset;
    7716      855842 :                         u64 release_len = 0;
    7717             : 
    7718      855842 :                         if (dio_data->nocow_done) {
    7719             :                                 release_offset = start;
    7720             :                                 release_len = data_alloc_len;
    7721      781873 :                         } else if (len < data_alloc_len) {
    7722      151942 :                                 release_offset = start + len;
    7723      151942 :                                 release_len = data_alloc_len - len;
    7724             :                         }
    7725             : 
    7726      225911 :                         if (release_len > 0)
    7727      225902 :                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
    7728             :                                                                dio_data->data_reserved,
    7729             :                                                                release_offset,
    7730             :                                                                release_len);
    7731             :                 }
    7732             :         } else {
    7733             :                 /*
    7734             :                  * We need to unlock only the end area that we aren't using.
    7735             :                  * The rest is going to be unlocked by the endio routine.
    7736             :                  */
    7737      611345 :                 lockstart = start + len;
    7738      611345 :                 if (lockstart < lockend)
    7739             :                         unlock_extents = true;
    7740             :         }
    7741             : 
    7742      225902 :         if (unlock_extents)
    7743     1148853 :                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    7744             :                               &cached_state);
    7745             :         else
    7746      319262 :                 free_extent_state(cached_state);
    7747             : 
    7748             :         /*
    7749             :          * Translate extent map information to iomap.
    7750             :          * We trim the extents (and move the addr) even though iomap code does
    7751             :          * that, since we have locked only the parts we are performing I/O in.
    7752             :          */
    7753     1467797 :         if ((em->block_start == EXTENT_MAP_HOLE) ||
    7754     1320593 :             (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
    7755      197438 :                 iomap->addr = IOMAP_NULL_ADDR;
    7756      197438 :                 iomap->type = IOMAP_HOLE;
    7757             :         } else {
    7758     1270359 :                 iomap->addr = em->block_start + (start - em->start);
    7759     1270359 :                 iomap->type = IOMAP_MAPPED;
    7760             :         }
    7761     1467797 :         iomap->offset = start;
    7762     1467797 :         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
    7763     1467797 :         iomap->length = len;
    7764     1467797 :         free_extent_map(em);
    7765             : 
    7766     1467797 :         return 0;
    7767             : 
    7768       13193 : unlock_err:
    7769       13193 :         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    7770             :                       &cached_state);
    7771      706644 : err:
    7772      706644 :         if (dio_data->data_space_reserved) {
    7773         376 :                 btrfs_free_reserved_data_space(BTRFS_I(inode),
    7774             :                                                dio_data->data_reserved,
    7775             :                                                start, data_alloc_len);
    7776         376 :                 extent_changeset_free(dio_data->data_reserved);
    7777             :         }
    7778             : 
    7779             :         return ret;
    7780             : }
    7781             : 
                       /*
                        * .iomap_end handler for btrfs direct I/O (see btrfs_dio_iomap_ops).
                        * Called for the file range [pos, pos + length).
                        *
                        * - Read from a hole: the whole range is unlocked in the inode's io_tree
                        *   and 0 is returned.
                        * - Fewer bytes submitted (dio_data->submitted) than @length: the
                        *   unsubmitted tail is released and -ENOTBLK is returned.  For writes the
                        *   tail of the ordered extent is finished with a final argument of false;
                        *   for reads the tail is unlocked in the io_tree.
                        * - Writes: the reference on dio_data->ordered is dropped, the pointer is
                        *   reset to NULL and dio_data->data_reserved is freed.
                        *
                        * @written is not used by this function.
                        *
                        * Return: 0, or -ENOTBLK when the range was only partially submitted.
                        */
    7782     1467235 : static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
    7783             :                 ssize_t written, unsigned int flags, struct iomap *iomap)
    7784             : {
    7785     1467235 :         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
    7786     1467235 :         struct btrfs_dio_data *dio_data = iter->private;
                               /* Byte count accumulated by btrfs_dio_submit_io(). */
    7787     1467235 :         size_t submitted = dio_data->submitted;
    7788     1467235 :         const bool write = !!(flags & IOMAP_WRITE);
    7789     1467235 :         int ret = 0;
    7790             : 
    7791     1467235 :         if (!write && (iomap->type == IOMAP_HOLE)) {
    7792             :                 /* If reading from a hole, unlock and return */
    7793      197392 :                 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
    7794             :                               NULL);
    7795      197392 :                 return 0;
    7796             :         }
    7797             : 
    7798     1269843 :         if (submitted < length) {
                                       /* Narrow [pos, pos + length) to the tail that was never submitted. */
    7799        1183 :                 pos += submitted;
    7800        1183 :                 length -= submitted;
    7801        1183 :                 if (write)
    7802          10 :                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
    7803             :                                                     pos, length, false);
    7804             :                 else
    7805        1173 :                         unlock_extent(&BTRFS_I(inode)->io_tree, pos,
    7806        1173 :                                       pos + length - 1, NULL);
    7807             :                 ret = -ENOTBLK;
    7808             :         }
    7809     1269850 :         if (write) {
    7810      856177 :                 btrfs_put_ordered_extent(dio_data->ordered);
    7811      856667 :                 dio_data->ordered = NULL;
    7812             :         }
    7813             : 
    7814     1270340 :         if (write)
    7815      856683 :                 extent_changeset_free(dio_data->data_reserved);
    7816             :         return ret;
    7817             : }
    7818             : 
                       /*
                        * Completion handler for direct I/O bios; btrfs_dio_submit_io() passes it
                        * to btrfs_bio_init() as the end_io callback.
                        *
                        * Logs a warning when the bio carries an error status.  For writes it
                        * finishes the ordered extent range starting at dip->file_offset and
                        * spanning dip->bytes, passing !bio->bi_status as the final argument.  For
                        * reads it unlocks that same range in the inode's io_tree.  Finally it
                        * restores bio->bi_private from bbio->private and hands the bio to
                        * iomap_dio_bio_end_io().
                        */
    7819     1273086 : static void btrfs_dio_end_io(struct btrfs_bio *bbio)
    7820             : {
    7821     1273086 :         struct btrfs_dio_private *dip =
    7822     1273086 :                 container_of(bbio, struct btrfs_dio_private, bbio);
    7823     1273086 :         struct btrfs_inode *inode = bbio->inode;
    7824     1273086 :         struct bio *bio = &bbio->bio;
    7825             : 
    7826     1273086 :         if (bio->bi_status) {
    7827           7 :                 btrfs_warn(inode->root->fs_info,
    7828             :                 "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
    7829             :                            btrfs_ino(inode), bio->bi_opf,
    7830             :                            dip->file_offset, dip->bytes, bio->bi_status);
    7831             :         }
    7832             : 
    7833     1273086 :         if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
    7834      860005 :                 btrfs_finish_ordered_extent(bbio->ordered, NULL,
    7835      860005 :                                             dip->file_offset, dip->bytes,
    7836      860005 :                                             !bio->bi_status);
    7837             :         } else {
    7838      413089 :                 unlock_extent(&inode->io_tree, dip->file_offset,
    7839      413089 :                               dip->file_offset + dip->bytes - 1, NULL);
    7840             :         }
    7841             : 
                               /* Put back the bi_private value that was saved in bbio->private. */
    7842     1273156 :         bbio->bio.bi_private = bbio->private;
    7843     1273156 :         iomap_dio_bio_end_io(bio);
    7844     1273141 : }
    7845             : 
                       /*
                        * .submit_io hook of btrfs_dio_ops: prepares and submits one direct I/O
                        * bio.
                        *
                        * @iter:        iomap iterator; iter->private is the struct btrfs_dio_data
                        * @bio:         bio to submit; its btrfs_bio is embedded in a
                        *               struct btrfs_dio_private
                        * @file_offset: file offset recorded for this bio
                        *
                        * Initializes the btrfs_bio with btrfs_dio_end_io() as completion handler,
                        * records the offset and size in the btrfs_dio_private, and adds the bio
                        * size to dio_data->submitted.  For writes the ordered extent is extracted
                        * first; if that fails, the bio is completed with the error and is not
                        * submitted.
                        */
    7846     1271485 : static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
    7847             :                                 loff_t file_offset)
    7848             : {
    7849     1271485 :         struct btrfs_bio *bbio = btrfs_bio(bio);
    7850     1271485 :         struct btrfs_dio_private *dip =
    7851     1271485 :                 container_of(bbio, struct btrfs_dio_private, bbio);
    7852     1271485 :         struct btrfs_dio_data *dio_data = iter->private;
    7853             : 
    7854     1271485 :         btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
    7855             :                        btrfs_dio_end_io, bio->bi_private);
    7856     1271358 :         bbio->inode = BTRFS_I(iter->inode);
    7857     1271358 :         bbio->file_offset = file_offset;
    7858             : 
    7859     1271358 :         dip->file_offset = file_offset;
    7860     1271358 :         dip->bytes = bio->bi_iter.bi_size;
    7861             : 
                               /* btrfs_dio_iomap_end() compares this running total with the range length. */
    7862     1271358 :         dio_data->submitted += bio->bi_iter.bi_size;
    7863             : 
    7864             :         /*
    7865             :          * Check if we are doing a partial write.  If we are, we need to split
    7866             :          * the ordered extent to match the submitted bio.  Hang on to the
    7867             :          * remaining unfinishable ordered_extent in dio_data so that it can be
    7868             :          * cancelled in iomap_end to avoid a deadlock wherein faulting the
    7869             :          * remaining pages is blocked on the outstanding ordered extent.
    7870             :          */
    7871     1271358 :         if (iter->flags & IOMAP_WRITE) {
    7872      859494 :                 int ret;
    7873             : 
    7874      859494 :                 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
    7875      859730 :                 if (ret) {
                                               /* ret is non-zero here, so !ret below evaluates to false. */
    7876           0 :                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
    7877           0 :                                                     file_offset, dip->bytes,
    7878             :                                                     !ret);
    7879           0 :                         bio->bi_status = errno_to_blk_status(ret);
    7880           0 :                         iomap_dio_bio_end_io(bio);
    7881           0 :                         return;
    7882             :                 }
    7883             :         }
    7884             : 
    7885     1271594 :         btrfs_submit_bio(bbio, 0);
    7886             : }
    7887             : 
                       /* iomap begin/end callbacks passed to the iomap direct I/O entry points below. */
    7888             : static const struct iomap_ops btrfs_dio_iomap_ops = {
    7889             :         .iomap_begin            = btrfs_dio_iomap_begin,
    7890             :         .iomap_end              = btrfs_dio_iomap_end,
    7891             : };
    7892             : 
                       /*
                        * Direct I/O hooks: bios are submitted through btrfs_dio_submit_io() and
                        * .bio_set points at btrfs_dio_bioset.
                        */
    7893             : static const struct iomap_dio_ops btrfs_dio_ops = {
    7894             :         .submit_io              = btrfs_dio_submit_io,
    7895             :         .bio_set                = &btrfs_dio_bioset,
    7896             : };
    7897             : 
                       /*
                        * Direct I/O read entry point.  Hands a zero-initialized, stack-allocated
                        * struct btrfs_dio_data to iomap_dio_rw() as the private pointer, together
                        * with the btrfs iomap/dio ops, IOMAP_DIO_PARTIAL and @done_before.
                        *
                        * Return: whatever iomap_dio_rw() returns.
                        */
    7898     1326923 : ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
    7899             : {
    7900     1326923 :         struct btrfs_dio_data data = { 0 };
    7901             : 
    7902     1326923 :         return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
    7903             :                             IOMAP_DIO_PARTIAL, &data, done_before);
    7904             : }
    7905             : 
                       /*
                        * Direct I/O write entry point.  Same arguments as btrfs_dio_read(), but it
                        * calls __iomap_dio_rw() and returns the struct iomap_dio pointer that call
                        * produces instead of a byte count.
                        *
                        * NOTE(review): &data refers to a stack local; this presumably relies on
                        * __iomap_dio_rw() not keeping the private pointer after it returns -
                        * confirm against the iomap implementation.
                        */
    7906      705512 : struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
    7907             :                                   size_t done_before)
    7908             : {
    7909      705512 :         struct btrfs_dio_data data = { 0 };
    7910             : 
    7911      705512 :         return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
    7912             :                             IOMAP_DIO_PARTIAL, &data, done_before);
    7913             : }
    7914             : 
                       /*
                        * fiemap implementation for btrfs inodes.
                        *
                        * Validates the request with fiemap_prep() (which may adjust @len), waits
                        * for ordered extents over the whole file when FIEMAP_FLAG_SYNC is set,
                        * then lets extent_fiemap() report the extents for [start, start + len).
                        *
                        * Return: a non-zero result of fiemap_prep() or btrfs_wait_ordered_range(),
                        * otherwise the result of extent_fiemap().
                        */
    7915       57701 : static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
    7916             :                         u64 start, u64 len)
    7917             : {
    7918       57701 :         int     ret;
    7919             : 
    7920       57701 :         ret = fiemap_prep(inode, fieinfo, start, &len, 0);
    7921       57701 :         if (ret)
    7922             :                 return ret;
    7923             : 
    7924             :         /*
    7925             :          * fiemap_prep() called filemap_write_and_wait() for the whole possible
    7926             :          * file range (0 to LLONG_MAX), but that is not enough if we have
    7927             :          * compression enabled. The first filemap_fdatawrite_range() only kicks
    7928             :          * in the compression of data (in an async thread) and will return
    7929             :          * before the compression is done and writeback is started. A second
    7930             :          * filemap_fdatawrite_range() is needed to wait for the compression to
    7931             :          * complete and writeback to start. We also need to wait for ordered
    7932             :          * extents to complete, because our fiemap implementation uses mainly
    7933             :          * file extent items to list the extents, searching for extent maps
    7934             :          * only for file ranges with holes or prealloc extents to figure out
    7935             :          * if we have delalloc in those ranges.
    7936             :          */
    7937       49031 :         if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
    7938       32034 :                 ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
    7939       32034 :                 if (ret)
    7940             :                         return ret;
    7941             :         }
    7942             : 
    7943       49031 :         return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
    7944             : }
    7945             : 
                       /* Thin wrapper: forwards @mapping and @wbc to extent_writepages() and returns its result. */
    7946     5596660 : static int btrfs_writepages(struct address_space *mapping,
    7947             :                             struct writeback_control *wbc)
    7948             : {
    7949     5596660 :         return extent_writepages(mapping, wbc);
    7950             : }
    7951             : 
                       /* Thin wrapper: forwards the readahead control @rac to extent_readahead(). */
    7952     1382152 : static void btrfs_readahead(struct readahead_control *rac)
    7953             : {
    7954     1382152 :         extent_readahead(rac);
    7955     1381920 : }
    7956             : 
    7957             : /*
    7958             :  * For release_folio() and invalidate_folio() we have a race window where
    7959             :  * folio_end_writeback() is called but the subpage spinlock is not yet released.
    7960             :  * If we continue to release/invalidate the page, we could cause use-after-free
    7961             :  * for subpage spinlock.  So this function is to spin and wait for subpage
    7962             :  * spinlock.
    7963             :  */
    7964   135912704 : static void wait_subpage_spinlock(struct page *page)
    7965             : {
    7966   135912704 :         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
    7967   135912704 :         struct btrfs_subpage *subpage;
    7968             : 
                               /* Nothing to wait for unless btrfs_is_subpage() reports a subpage setup. */
    7969   135912704 :         if (!btrfs_is_subpage(fs_info, page))
    7970             :                 return;
    7971             : 
                               /* In the subpage case page->private holds the struct btrfs_subpage pointer. */
    7972           0 :         ASSERT(PagePrivate(page) && page->private);
    7973           0 :         subpage = (struct btrfs_subpage *)page->private;
    7974             : 
    7975             :         /*
    7976             :          * This may look insane as we just acquire the spinlock and release it,
    7977             :          * without doing anything.  But we just want to make sure no one is
    7978             :          * still holding the subpage spinlock.
    7979             :          * And since the page is not dirty nor writeback, and we have page
    7980             :          * locked, the only possible way to hold a spinlock is from the endio
    7981             :          * function to clear page writeback.
    7982             :          *
    7983             :          * Here we just acquire the spinlock so that all existing callers
    7984             :          * should exit and we're safe to release/invalidate the page.
    7985             :          */
    7986           0 :         spin_lock_irq(&subpage->lock);
    7987           0 :         spin_unlock_irq(&subpage->lock);
    7988             : }
    7989             : 
                       /*
                        * Try to release the extent mapping of @folio's page.  When
                        * try_release_extent_mapping() returns 1, wait for the subpage spinlock
                        * and clear the page's extent-mapped state.
                        *
                        * Return: the int result of try_release_extent_mapping() converted to
                        * bool, i.e. true for any non-zero value.
                        */
    7990    50570927 : static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
    7991             : {
    7992    50570927 :         int ret = try_release_extent_mapping(&folio->page, gfp_flags);
    7993             : 
    7994    50568071 :         if (ret == 1) {
    7995    50453749 :                 wait_subpage_spinlock(&folio->page);
    7996    50453051 :                 clear_page_extent_mapped(&folio->page);
    7997             :         }
    7998    50568443 :         return ret;
    7999             : }
    8000             : 
                       /*
                        * Release @folio unless it is under writeback or dirty, in which case
                        * false is returned without touching it.  Otherwise the result of
                        * __btrfs_release_folio() is returned.
                        */
    8001    19139319 : static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
    8002             : {
    8003    38278566 :         if (folio_test_writeback(folio) || folio_test_dirty(folio))
    8004      722832 :                 return false;
    8005    18416416 :         return __btrfs_release_folio(folio, gfp_flags);
    8006             : }
    8007             : 
    8008             : #ifdef CONFIG_MIGRATION
                       /*
                        * Migrate @src to @dst via filemap_migrate_folio().  On success the Ordered
                        * flag, if set on @src, is cleared there and set on @dst.
                        *
                        * Return: MIGRATEPAGE_SUCCESS, or the non-success value returned by
                        * filemap_migrate_folio().
                        *
                        * Without CONFIG_MIGRATION the name is defined as NULL instead.
                        */
    8009      259073 : static int btrfs_migrate_folio(struct address_space *mapping,
    8010             :                              struct folio *dst, struct folio *src,
    8011             :                              enum migrate_mode mode)
    8012             : {
    8013      259073 :         int ret = filemap_migrate_folio(mapping, dst, src, mode);
    8014             : 
    8015      259073 :         if (ret != MIGRATEPAGE_SUCCESS)
    8016             :                 return ret;
    8017             : 
    8018      258756 :         if (folio_test_ordered(src)) {
    8019           0 :                 folio_clear_ordered(src);
    8020           0 :                 folio_set_ordered(dst);
    8021             :         }
    8022             : 
    8023             :         return MIGRATEPAGE_SUCCESS;
    8024             : }
    8025             : #else
    8026             : #define btrfs_migrate_folio NULL
    8027             : #endif
    8028             : 
                       /*
                        * Invalidate the byte range [@offset, @offset + @length) of @folio.
                        *
                        * Waits for writeback and for the subpage spinlock first.  If the range is
                        * not the whole folio, only btrfs_release_folio() is attempted and the
                        * function returns.  Otherwise it walks the folio in sub-ranges
                        * [cur, range_end], one per ordered extent or per gap between them:
                        *
                        * - ranges without an ordered extent get all extent state bits cleared;
                        * - ranges whose Ordered bit is already clear are left for
                        *   btrfs_finish_ordered_io() (no EXTENT_CLEAR_ALL_BITS);
                        * - ranges still marked Ordered are accounted here: the bit is cleared,
                        *   the ordered extent is flagged BTRFS_ORDERED_TRUNCATED with its
                        *   truncated_len lowered, and it is finished if this was the last
                        *   pending part.
                        *
                        * When the inode is being evicted (I_FREEING set in i_state) the extent
                        * range is not locked, the clear_extent_bit() calls are skipped and
                        * __btrfs_release_folio() is not called.
                        */
    8029    85466247 : static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
    8030             :                                  size_t length)
    8031             : {
    8032    85466247 :         struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
    8033    85466247 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    8034    85466247 :         struct extent_io_tree *tree = &inode->io_tree;
    8035    85466247 :         struct extent_state *cached_state = NULL;
    8036    85466247 :         u64 page_start = folio_pos(folio);
    8037    85466247 :         u64 page_end = page_start + folio_size(folio) - 1;
    8038    85463892 :         u64 cur;
    8039    85463892 :         int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
    8040             : 
    8041             :         /*
    8042             :          * We have folio locked so no new ordered extent can be created on this
    8043             :          * page, nor bio can be submitted for this folio.
    8044             :          *
    8045             :          * But already submitted bio can still be finished on this folio.
    8046             :          * Furthermore, endio function won't skip folio which has Ordered
    8047             :          * (Private2) already cleared, so it's possible for endio and
    8048             :          * invalidate_folio to do the same ordered extent accounting twice
    8049             :          * on one folio.
    8050             :          *
    8051             :          * So here we wait for any submitted bios to finish, so that we won't
    8052             :          * do double ordered extent accounting on the same folio.
    8053             :          */
    8054    85463892 :         folio_wait_writeback(folio);
    8055    85465792 :         wait_subpage_spinlock(&folio->page);
    8056             : 
    8057             :         /*
    8058             :          * For subpage case, we have call sites like
    8059             :          * btrfs_punch_hole_lock_range() which passes range not aligned to
    8060             :          * sectorsize.
    8061             :          * If the range doesn't cover the full folio, we don't need to and
    8062             :          * shouldn't clear page extent mapped, as folio->private can still
    8063             :          * record subpage dirty bits for other part of the range.
    8064             :          *
    8065             :          * For cases that invalidate the full folio even the range doesn't
    8066             :          * cover the full folio, like invalidating the last folio, we're
    8067             :          * still safe to wait for ordered extent to finish.
    8068             :          */
    8069   170858060 :         if (!(offset == 0 && length == folio_size(folio))) {
    8070       72186 :                 btrfs_release_folio(folio, GFP_NOFS);
    8071       72602 :                 return;
    8072             :         }
    8073             : 
    8074    85392536 :         if (!inode_evicting)
    8075    32151374 :                 lock_extent(tree, page_start, page_end, &cached_state);
    8076             : 
                               /* Each pass handles [cur, range_end] and then advances cur past it. */
    8077             :         cur = page_start;
    8078   170786818 :         while (cur < page_end) {
    8079    85391206 :                 struct btrfs_ordered_extent *ordered;
    8080    85391206 :                 u64 range_end;
    8081    85391206 :                 u32 range_len;
    8082    85391206 :                 u32 extra_flags = 0;
    8083             : 
    8084   170785850 :                 ordered = btrfs_lookup_first_ordered_range(inode, cur,
    8085    85391206 :                                                            page_end + 1 - cur);
    8086    85394644 :                 if (!ordered) {
    8087    85300933 :                         range_end = page_end;
    8088             :                         /*
    8089             :                          * No ordered extent covering this range, we are safe
    8090             :                          * to delete all extent states in the range.
    8091             :                          */
    8092    85300933 :                         extra_flags = EXTENT_CLEAR_ALL_BITS;
    8093    85300933 :                         goto next;
    8094             :                 }
    8095       93711 :                 if (ordered->file_offset > cur) {
    8096             :                         /*
    8097             :                          * There is a range between [cur, oe->file_offset) not
    8098             :                          * covered by any ordered extent.
    8099             :                          * We are safe to delete all extent states, and handle
    8100             :                          * the ordered extent in the next iteration.
    8101             :                          */
    8102           0 :                         range_end = ordered->file_offset - 1;
    8103           0 :                         extra_flags = EXTENT_CLEAR_ALL_BITS;
    8104           0 :                         goto next;
    8105             :                 }
    8106             : 
                                       /* Clamp the ordered extent's end to the end of this folio. */
    8107       93711 :                 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
    8108             :                                 page_end);
    8109       93711 :                 ASSERT(range_end + 1 - cur < U32_MAX);
    8110       93711 :                 range_len = range_end + 1 - cur;
    8111       93711 :                 if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
    8112             :                         /*
    8113             :                          * If Ordered (Private2) is cleared, it means endio has
    8114             :                          * already been executed for the range.
    8115             :                          * We can't delete the extent states as
    8116             :                          * btrfs_finish_ordered_io() may still use some of them.
    8117             :                          */
    8118       81246 :                         goto next;
    8119             :                 }
    8120       12465 :                 btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
    8121             : 
    8122             :                 /*
    8123             :                  * IO on this page will never be started, so we need to account
    8124             :                  * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
    8125             :                  * here, must leave that up for the ordered extent completion.
    8126             :                  *
    8127             :                  * This will also unlock the range for incoming
    8128             :                  * btrfs_finish_ordered_io().
    8129             :                  */
    8130       12465 :                 if (!inode_evicting)
    8131       12465 :                         clear_extent_bit(tree, cur, range_end,
    8132             :                                          EXTENT_DELALLOC |
    8133             :                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
    8134             :                                          EXTENT_DEFRAG, &cached_state);
    8135             : 
                                       /* Flag and truncated_len are updated under the ordered tree lock. */
    8136       12465 :                 spin_lock_irq(&inode->ordered_tree.lock);
    8137       12465 :                 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
    8138       12465 :                 ordered->truncated_len = min(ordered->truncated_len,
    8139             :                                              cur - ordered->file_offset);
    8140       12465 :                 spin_unlock_irq(&inode->ordered_tree.lock);
    8141             : 
    8142             :                 /*
    8143             :                  * If the ordered extent has finished, we're safe to delete all
    8144             :                  * the extent states of the range, otherwise
    8145             :                  * btrfs_finish_ordered_io() will get executed by endio for
    8146             :                  * other pages, so we can't delete extent states.
    8147             :                  */
    8148       12465 :                 if (btrfs_dec_test_ordered_pending(inode, &ordered,
    8149             :                                                    cur, range_end + 1 - cur)) {
    8150         226 :                         btrfs_finish_ordered_io(ordered);
    8151             :                         /*
    8152             :                          * The ordered extent has finished, now we're again
    8153             :                          * safe to delete all extent states of the range.
    8154             :                          */
    8155         226 :                         extra_flags = EXTENT_CLEAR_ALL_BITS;
    8156             :                 }
    8157       12239 : next:
    8158    85394644 :                 if (ordered)
    8159       93711 :                         btrfs_put_ordered_extent(ordered);
    8160             :                 /*
    8161             :                  * Qgroup reserved space handler
    8162             :                  * Sector(s) here will be either:
    8163             :                  *
    8164             :                  * 1) Already written to disk or bio already finished
    8165             :                  *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
    8166             :                  *    Qgroup will be handled by its qgroup_record then.
    8167             :                  *    btrfs_qgroup_free_data() call will do nothing here.
    8168             :                  *
    8169             :                  * 2) Not written to disk yet
    8170             :                  *    Then btrfs_qgroup_free_data() call will clear the
    8171             :                  *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
    8172             :                  *    reserved data space.
    8173             :                  *    Since the IO will never happen for this page.
    8174             :                  */
    8175    85394644 :                 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
    8176    85392444 :                 if (!inode_evicting) {
    8177    32151717 :                         clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
    8178             :                                  EXTENT_DELALLOC | EXTENT_UPTODATE |
    8179             :                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
    8180             :                                  extra_flags, &cached_state);
    8181             :                 }
    8182    85394977 :                 cur = range_end + 1;
    8183             :         }
    8184             :         /*
    8185             :          * We have iterated through all ordered extents of the page, the page
    8186             :          * should not have Ordered (Private2) anymore, or the above iteration
    8187             :          * did something wrong.
    8188             :          */
    8189    85395612 :         ASSERT(!folio_test_ordered(folio));
    8190    85394741 :         btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
    8191    85397541 :         if (!inode_evicting)
    8192    32155940 :                 __btrfs_release_folio(folio, GFP_NOFS);
                               /* Called unconditionally, including when the inode is being evicted. */
    8193    85394248 :         clear_page_extent_mapped(&folio->page);
    8194             : }
    8195             : 
    8196             : /*
    8197             :  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
    8198             :  * called from a page fault handler when a page is first dirtied. Hence we must
    8199             :  * be careful to check for EOF conditions here. We set the page up correctly
    8200             :  * for a written page which means we get ENOSPC checking when writing into
    8201             :  * holes and correct delalloc and unwritten extent mapping on filesystems that
    8202             :  * support these features.
    8203             :  *
    8204             :  * We are not allowed to take the i_mutex here so we have to play games to
    8205             :  * protect against truncate races as the page could now be beyond EOF.  Because
    8206             :  * truncate_setsize() writes the inode size before removing pages, once we have
    8207             :  * the page lock we can determine safely if the page is beyond EOF. If it is not
    8208             :  * beyond EOF, then the page is guaranteed safe against truncation until we
    8209             :  * unlock the page.
                        *
                        * Return: VM_FAULT_LOCKED on success, in which case the page is left locked
                        * for the caller.  VM_FAULT_NOPAGE if the page was truncated from under us
                        * (page unlocked, reservation released).  Otherwise the vm_fault_t that
                        * vmf_error() derives from the errno of the failing step.
    8210             :  */
    8211     8595354 : vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
    8212             : {
    8213     8595354 :         struct page *page = vmf->page;
    8214     8595354 :         struct inode *inode = file_inode(vmf->vma->vm_file);
    8215     8595354 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    8216     8595354 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
    8217     8595354 :         struct btrfs_ordered_extent *ordered;
    8218     8595354 :         struct extent_state *cached_state = NULL;
    8219     8595354 :         struct extent_changeset *data_reserved = NULL;
    8220     8595354 :         unsigned long zero_start;
    8221     8595354 :         loff_t size;
    8222     8595354 :         vm_fault_t ret;
    8223     8595354 :         int ret2;
                               /* Set once the delalloc reservation below has succeeded. */
    8224     8595354 :         int reserved = 0;
    8225     8595354 :         u64 reserved_space;
    8226     8595354 :         u64 page_start;
    8227     8595354 :         u64 page_end;
                               /* Last byte we keep reserved/dirty; shrinks for the EOF page. */
    8228     8595354 :         u64 end;
    8229             : 
    8230     8595354 :         reserved_space = PAGE_SIZE;
    8231             : 
    8232     8595354 :         sb_start_pagefault(inode->i_sb);
    8233     8596195 :         page_start = page_offset(page);
    8234     8596195 :         page_end = page_start + PAGE_SIZE - 1;
    8235     8596195 :         end = page_end;
    8236             : 
    8237             :         /*
    8238             :          * Reserving delalloc space after obtaining the page lock can lead to
    8239             :          * deadlock. For example, if a dirty page is locked by this function
    8240             :          * and the call to btrfs_delalloc_reserve_space() ends up triggering
    8241             :          * dirty page write out, then the btrfs_writepages() function could
    8242             :          * end up waiting indefinitely to get a lock on the page currently
    8243             :          * being processed by btrfs_page_mkwrite() function.
    8244             :          */
    8245     8596195 :         ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
    8246             :                                             page_start, reserved_space);
    8247     8615404 :         if (!ret2) {
    8248     8614498 :                 ret2 = file_update_time(vmf->vma->vm_file);
    8249     8614498 :                 reserved = 1;
    8250             :         }
    8251     8608316 :         if (ret2) {
    8252         499 :                 ret = vmf_error(ret2);
                                       /* Only undo the reservation if it was actually made. */
    8253         499 :                 if (reserved)
    8254          12 :                         goto out;
    8255         487 :                 goto out_noreserve;
    8256             :         }
    8257             : 
    8258             :         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
    8259     8608129 : again:
    8260     8608129 :         down_read(&BTRFS_I(inode)->i_mmap_lock);
    8261     8612353 :         lock_page(page);
    8262     8598657 :         size = i_size_read(inode);
    8263             : 
    8264     8598657 :         if ((page->mapping != inode->i_mapping) ||
    8265     8598635 :             (page_start >= size)) {
    8266             :                 /* page got truncated out from underneath us */
    8267          22 :                 goto out_unlock;
    8268             :         }
    8269     8598635 :         wait_on_page_writeback(page);
    8270             : 
    8271     8596032 :         lock_extent(io_tree, page_start, page_end, &cached_state);
    8272     8564080 :         ret2 = set_page_extent_mapped(page);
    8273     8571033 :         if (ret2 < 0) {
    8274           0 :                 ret = vmf_error(ret2);
    8275           0 :                 unlock_extent(io_tree, page_start, page_end, &cached_state);
    8276           0 :                 goto out_unlock;
    8277             :         }
    8278             : 
    8279             :         /*
    8280             :          * we can't set the delalloc bits if there are pending ordered
    8281             :          * extents.  Drop our locks and wait for them to finish
    8282             :          */
    8283     8571033 :         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
    8284             :                         PAGE_SIZE);
    8285     8579846 :         if (ordered) {
    8286         312 :                 unlock_extent(io_tree, page_start, page_end, &cached_state);
    8287         312 :                 unlock_page(page);
    8288         312 :                 up_read(&BTRFS_I(inode)->i_mmap_lock);
    8289         312 :                 btrfs_start_ordered_extent(ordered);
    8290         312 :                 btrfs_put_ordered_extent(ordered);
    8291         312 :                 goto again;
    8292             :         }
    8293             : 
    8294     8579534 :         if (page->index == ((size - 1) >> PAGE_SHIFT)) {
    8295      330569 :                 reserved_space = round_up(size - page_start,
    8296             :                                           fs_info->sectorsize);
    8297      330569 :                 if (reserved_space < PAGE_SIZE) {
                                               /*
                                                * This is the page containing EOF and i_size rounded up
                                                * to the sector size ends before the page does.  Keep the
                                                * reservation for [page_start, end] and give back the
                                                * surplus, which is the tail of the page starting at
                                                * end + 1 (not at page_start).
                                                */
    8298           0 :                         end = page_start + reserved_space - 1;
    8299           0 :                         btrfs_delalloc_release_space(BTRFS_I(inode),
    8300             :                                         data_reserved, end + 1,
    8301             :                                         PAGE_SIZE - reserved_space, true);
    8302             :                 }
    8303             :         }
    8304             : 
    8305             :         /*
    8306             :          * page_mkwrite gets called when the page is firstly dirtied after it's
    8307             :          * faulted in, but write(2) could also dirty a page and set delalloc
    8308             :          * bits, thus in this case for space account reason, we still need to
    8309             :          * clear any delalloc bits within this page range since we have to
    8310             :          * reserve data&meta space before lock_page() (see above comments).
    8311             :          */
    8312     8579534 :         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
    8313             :                           EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
    8314             :                           EXTENT_DEFRAG, &cached_state);
    8315             : 
    8316     8614273 :         ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
    8317             :                                         &cached_state);
    8318     8612626 :         if (ret2) {
    8319           0 :                 unlock_extent(io_tree, page_start, page_end, &cached_state);
                                       /* Map the errno like the other failure paths in this function. */
    8320           0 :                 ret = vmf_error(ret2);
    8321           0 :                 goto out_unlock;
    8322             :         }
    8323             : 
    8324             :         /* page is wholly or partially inside EOF */
    8325     8612626 :         if (page_start + PAGE_SIZE > size)
    8326      308241 :                 zero_start = offset_in_page(size);
    8327             :         else
    8328             :                 zero_start = PAGE_SIZE;
    8329             : 
                               /* Zero the part of the page that lies beyond i_size. */
    8330      308241 :         if (zero_start != PAGE_SIZE)
    8331      308241 :                 memzero_page(page, zero_start, PAGE_SIZE - zero_start);
    8332             : 
    8333     8612626 :         btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
    8334     8615047 :         btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
    8335     8609630 :         btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
    8336             : 
    8337     8614791 :         btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
    8338             : 
    8339     8572080 :         unlock_extent(io_tree, page_start, page_end, &cached_state);
    8340     8610419 :         up_read(&BTRFS_I(inode)->i_mmap_lock);
    8341             : 
    8342     8614294 :         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
    8343     8624147 :         sb_end_pagefault(inode->i_sb);
    8344     8623972 :         extent_changeset_free(data_reserved);
                               /* Success: the page is intentionally returned still locked. */
    8345     8623972 :         return VM_FAULT_LOCKED;
    8346             : 
    8347          22 : out_unlock:
    8348          22 :         unlock_page(page);
    8349          22 :         up_read(&BTRFS_I(inode)->i_mmap_lock);
    8350          34 : out:
                               /*
                                * reserved_space is whatever is still reserved at this point: the
                                * full page, or the reduced amount if the surplus was already
                                * released for the EOF page above.
                                */
    8351          34 :         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
    8352          34 :         btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
    8353             :                                      reserved_space, (ret != 0));
    8354         521 : out_noreserve:
    8355         521 :         sb_end_pagefault(inode->i_sb);
    8356         521 :         extent_changeset_free(data_reserved);
    8357         521 :         return ret;
    8358             : }
    8359             : 
                       /*
                        * Drop the file extent items of @inode that lie beyond its current
                        * vfs_inode.i_size and update the inode item accordingly.
                        *
                        * @inode:          inode being truncated; the target size is read from
                        *                  inode->vfs_inode.i_size.
                        * @skip_writeback: when false, first wait for ordered extents in the range
                        *                  from i_size rounded down to the sector size up to the
                        *                  end of the file.
                        *
                        * The work is done in a loop because btrfs_truncate_inode_items() may stop
                        * with -ENOSPC or -EAGAIN; each time that happens the inode is updated, the
                        * transaction is ended and a new one is started before retrying.
                        *
                        * Return: 0 on success or a negative errno.  -ENOMEM if the temporary block
                        * reservation cannot be allocated.
                        */
    8360      245187 : static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
    8361             : {
    8362      245187 :         struct btrfs_truncate_control control = {
    8363             :                 .inode = inode,
    8364             :                 .ino = btrfs_ino(inode),
    8365             :                 .min_type = BTRFS_EXTENT_DATA_KEY,
    8366             :                 .clear_extent_range = true,
    8367             :         };
    8368      245187 :         struct btrfs_root *root = inode->root;
    8369      245187 :         struct btrfs_fs_info *fs_info = root->fs_info;
    8370      245187 :         struct btrfs_block_rsv *rsv;
    8371      245187 :         int ret;
    8372      245187 :         struct btrfs_trans_handle *trans;
    8373      245187 :         u64 mask = fs_info->sectorsize - 1;
                               /* Size of one metadata unit; used for the truncate slack reservation. */
    8374      245187 :         const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
    8375             : 
    8376      245187 :         if (!skip_writeback) {
    8377      185160 :                 ret = btrfs_wait_ordered_range(&inode->vfs_inode,
    8378      185160 :                                                inode->vfs_inode.i_size & (~mask),
    8379             :                                                (u64)-1);
    8380      185162 :                 if (ret)
    8381             :                         return ret;
    8382             :         }
    8383             : 
    8384             :         /*
    8385             :          * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
    8386             :          * things going on here:
    8387             :          *
    8388             :          * 1) We need to reserve space to update our inode.
    8389             :          *
    8390             :          * 2) We need to have something to cache all the space that is going to
    8391             :          * be free'd up by the truncate operation, but also have some slack
    8392             :          * space reserved in case it uses space during the truncate (thank you
    8393             :          * very much snapshotting).
    8394             :          *
    8395             :          * And we need these to be separate.  The fact is we can use a lot of
    8396             :          * space doing the truncate, and we have no earthly idea how much space
    8397             :          * we will use, so we need the truncate reservation to be separate so it
    8398             :          * doesn't end up using space reserved for updating the inode.  We also
    8399             :          * need to be able to stop the transaction and start a new one, which
    8400             :          * means we need to be able to update the inode several times, and we
    8401             :          * have no way of knowing how many times that will be, so we can't just
    8402             :          * reserve 1 item for the entirety of the operation, so that has to be
    8403             :          * done separately as well.
    8404             :          *
    8405             :          * So that leaves us with
    8406             :          *
    8407             :          * 1) rsv - for the truncate reservation, which we will steal from the
    8408             :          * transaction reservation.
    8409             :          * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
    8410             :          * updating the inode.
    8411             :          */
    8412      245189 :         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
    8413      245197 :         if (!rsv)
    8414             :                 return -ENOMEM;
    8415      245197 :         rsv->size = min_size;
    8416      245197 :         rsv->failfast = true;
    8417             : 
    8418             :         /*
    8419             :          * 1 for the truncate slack space
    8420             :          * 1 for updating the inode.
    8421             :          */
    8422      245197 :         trans = btrfs_start_transaction(root, 2);
    8423      245207 :         if (IS_ERR(trans)) {
    8424         126 :                 ret = PTR_ERR(trans);
    8425         126 :                 goto out;
    8426             :         }
    8427             : 
    8428             :         /* Migrate the slack space for the truncate to our reserve */
    8429      245081 :         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
    8430             :                                       min_size, false);
    8431             :         /*
    8432             :          * We have reserved 2 metadata units when we started the transaction and
    8433             :          * min_size matches 1 unit, so this should never fail, but if it does,
    8434             :          * it's not critical we just fail truncation.
    8435             :          */
    8436      245081 :         if (WARN_ON(ret)) {
    8437           0 :                 btrfs_end_transaction(trans);
    8438           0 :                 goto out;
    8439             :         }
    8440             : 
    8441      245081 :         trans->block_rsv = rsv;
    8442             : 
    8443      251541 :         while (1) {
    8444      248311 :                 struct extent_state *cached_state = NULL;
    8445      248311 :                 const u64 new_size = inode->vfs_inode.i_size;
    8446      248311 :                 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
    8447             : 
    8448      248311 :                 control.new_size = new_size;
    8449      248311 :                 lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
    8450             :                 /*
    8451             :                  * We want to drop from the next block forward in case this new
    8452             :                  * size is not block aligned since we will be keeping the last
    8453             :                  * block of the extent just the way it is.
    8454             :                  */
    8455      248302 :                 btrfs_drop_extent_map_range(inode,
    8456      248302 :                                             ALIGN(new_size, fs_info->sectorsize),
    8457             :                                             (u64)-1, false);
    8458             : 
    8459      248290 :                 ret = btrfs_truncate_inode_items(trans, root, &control);
    8460             : 
                                       /* Apply what this pass reported, whether or not it finished. */
    8461      248314 :                 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
    8462      248314 :                 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
    8463             : 
    8464      248314 :                 unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
    8465             : 
                                       /* The inode update below must use the transaction's own reservation. */
    8466      248314 :                 trans->block_rsv = &fs_info->trans_block_rsv;
                                       /* Anything other than -ENOSPC/-EAGAIN ends the loop (done or hard error). */
    8467      248314 :                 if (ret != -ENOSPC && ret != -EAGAIN)
    8468             :                         break;
    8469             : 
    8470        3230 :                 ret = btrfs_update_inode(trans, root, inode);
    8471        3230 :                 if (ret)
    8472             :                         break;
    8473             : 
    8474        3230 :                 btrfs_end_transaction(trans);
    8475        3230 :                 btrfs_btree_balance_dirty(fs_info);
    8476             : 
    8477        3230 :                 trans = btrfs_start_transaction(root, 2);
    8478        3230 :                 if (IS_ERR(trans)) {
    8479           0 :                         ret = PTR_ERR(trans);
                                               /* NULL tells the code after the loop there is no handle to end. */
    8480           0 :                         trans = NULL;
    8481           0 :                         break;
    8482             :                 }
    8483             : 
                                       /*
                                        * NOTE(review): the -1 size presumably means "release everything
                                        * still held in rsv" before it is refilled below; confirm against
                                        * btrfs_block_rsv_release().
                                        */
    8484        3230 :                 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
    8485        3230 :                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
    8486             :                                               rsv, min_size, false);
    8487             :                 /*
    8488             :                  * We have reserved 2 metadata units when we started the
    8489             :                  * transaction and min_size matches 1 unit, so this should never
    8490             :                  * fail, but if it does, it's not critical we just fail truncation.
    8491             :                  */
    8492        3230 :                 if (WARN_ON(ret))
    8493             :                         break;
    8494             : 
    8495        3230 :                 trans->block_rsv = rsv;
    8496             :         }
    8497             : 
    8498             :         /*
    8499             :          * We can't call btrfs_truncate_block inside a trans handle as we could
    8500             :          * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
    8501             :          * know we've truncated everything except the last little bit, and can
    8502             :          * do btrfs_truncate_block and then update the disk_i_size.
    8503             :          */
    8504      245084 :         if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
    8505           1 :                 btrfs_end_transaction(trans);
    8506           1 :                 btrfs_btree_balance_dirty(fs_info);
    8507             : 
    8508           1 :                 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
    8509           1 :                 if (ret)
    8510           0 :                         goto out;
    8511           1 :                 trans = btrfs_start_transaction(root, 1);
    8512           1 :                 if (IS_ERR(trans)) {
    8513           0 :                         ret = PTR_ERR(trans);
    8514           0 :                         goto out;
    8515             :                 }
    8516           1 :                 btrfs_inode_safe_disk_i_size_write(inode, 0);
    8517             :         }
    8518             : 
    8519      245084 :         if (trans) {
    8520      245084 :                 int ret2;
    8521             : 
                                       /* Final inode update; keep the first error seen in ret. */
    8522      245084 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    8523      245084 :                 ret2 = btrfs_update_inode(trans, root, inode);
    8524      245084 :                 if (ret2 && !ret)
    8525           0 :                         ret = ret2;
    8526             : 
    8527      245084 :                 ret2 = btrfs_end_transaction(trans);
    8528      245084 :                 if (ret2 && !ret)
    8529           0 :                         ret = ret2;
    8530      245084 :                 btrfs_btree_balance_dirty(fs_info);
    8531             :         }
    8532           0 : out:
    8533      245205 :         btrfs_free_block_rsv(fs_info, rsv);
    8534             :         /*
    8535             :          * So if we truncate and then write and fsync we normally would just
    8536             :          * write the extents that changed, which is a problem if we need to
    8537             :          * first truncate that entire inode.  So set this flag so we write out
    8538             :          * all of the extents in the inode to the sync log so we're completely
    8539             :          * safe.
    8540             :          *
    8541             :          * If no extents were dropped or trimmed we don't need to force the next
    8542             :          * fsync to truncate all the inode's items from the log and re-log them
    8543             :          * all. This means the truncate operation did not change the file size,
    8544             :          * or changed it to a smaller size but there was only an implicit hole
    8545             :          * between the old i_size and the new i_size, and there were no prealloc
    8546             :          * extents beyond i_size to drop.
    8547             :          */
    8548      245209 :         if (control.extents_found > 0)
    8549      159391 :                 btrfs_set_inode_full_sync(inode);
    8550             : 
    8551             :         return ret;
    8552             : }
    8553             : 
                       /*
                        * Allocate a new VFS inode on @dir's superblock and set it up as a
                        * directory using the btrfs directory inode/file operations.
                        *
                        * @idmap: idmap handed to inode_init_owner().
                        * @dir:   directory whose superblock the new inode is allocated on.
                        *
                        * The mode is S_IFDIR plus the rwx bits left after applying the caller's
                        * umask.
                        *
                        * Return: the new inode, or NULL if new_inode() failed.
                        */
    8554         252 : struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
    8555             :                                      struct inode *dir)
    8556             : {
    8557         252 :         struct inode *inode;
    8558             : 
    8559         252 :         inode = new_inode(dir->i_sb);
    8560         252 :         if (inode) {
    8561             :                 /*
    8562             :                  * Subvolumes don't inherit the sgid bit or the parent's gid if
    8563             :                  * the parent's sgid bit is set. This is probably a bug.
                                        * (That is the effect of passing NULL rather than @dir below.)
    8564             :                  */
    8565         504 :                 inode_init_owner(idmap, inode, NULL,
    8566         252 :                                  S_IFDIR | (~current_umask() & S_IRWXUGO));
    8567         252 :                 inode->i_op = &btrfs_dir_inode_operations;
    8568         252 :                 inode->i_fop = &btrfs_dir_file_operations;
    8569             :         }
    8570         252 :         return inode;
    8571             : }
    8572             : 
                       /*
                        * Allocate a struct btrfs_inode from btrfs_inode_cachep and reset its
                        * btrfs-specific state to the defaults for a fresh inode.
                        *
                        * @sb: superblock the inode is allocated for.
                        *
                        * Return: pointer to the embedded VFS inode, or NULL if the allocation
                        * failed.
                        */
    8573     3846997 : struct inode *btrfs_alloc_inode(struct super_block *sb)
    8574             : {
    8575     3846997 :         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
    8576     3846997 :         struct btrfs_inode *ei;
    8577     3846997 :         struct inode *inode;
    8578             : 
    8579     3846997 :         ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
    8580     3857548 :         if (!ei)
    8581             :                 return NULL;
    8582             : 
    8583     3857548 :         ei->root = NULL;
    8584     3857548 :         ei->generation = 0;
    8585     3857548 :         ei->last_trans = 0;
    8586     3857548 :         ei->last_sub_trans = 0;
    8587     3857548 :         ei->logged_trans = 0;
    8588     3857548 :         ei->delalloc_bytes = 0;
    8589     3857548 :         ei->new_delalloc_bytes = 0;
    8590     3857548 :         ei->defrag_bytes = 0;
    8591     3857548 :         ei->disk_i_size = 0;
    8592     3857548 :         ei->flags = 0;
    8593     3857548 :         ei->ro_flags = 0;
    8594     3857548 :         ei->csum_bytes = 0;
                               /* Unlike the other counters, index_cnt starts at (u64)-1 rather than 0. */
    8595     3857548 :         ei->index_cnt = (u64)-1;
    8596     3857548 :         ei->dir_index = 0;
    8597     3857548 :         ei->last_unlink_trans = 0;
    8598     3857548 :         ei->last_reflink_trans = 0;
    8599     3857548 :         ei->last_log_commit = 0;
    8600             : 
    8601     3857548 :         spin_lock_init(&ei->lock);
    8602     3857432 :         ei->outstanding_extents = 0;
                               /* The metadata block reservation is not set up for BTRFS_TEST_MAGIC superblocks. */
    8603     3857432 :         if (sb->s_magic != BTRFS_TEST_MAGIC)
    8604     3863228 :                 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
    8605             :                                               BTRFS_BLOCK_RSV_DELALLOC);
    8606     3854177 :         ei->runtime_flags = 0;
    8607     3854177 :         ei->prop_compress = BTRFS_COMPRESS_NONE;
    8608     3854177 :         ei->defrag_compress = BTRFS_COMPRESS_NONE;
    8609             : 
    8610     3854177 :         ei->delayed_node = NULL;
    8611             : 
    8612     3854177 :         ei->i_otime.tv_sec = 0;
    8613     3854177 :         ei->i_otime.tv_nsec = 0;
    8614             : 
    8615     3854177 :         inode = &ei->vfs_inode;
    8616     3854177 :         extent_map_tree_init(&ei->extent_tree);
    8617     3857618 :         extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
                               /* Give the io tree a back pointer to its owning inode. */
    8618     3855681 :         ei->io_tree.inode = ei;
    8619     3855681 :         extent_io_tree_init(fs_info, &ei->file_extent_tree,
    8620             :                             IO_TREE_INODE_FILE_EXTENT);
    8621     3857215 :         mutex_init(&ei->log_mutex);
    8622     3858504 :         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
    8623     3853026 :         INIT_LIST_HEAD(&ei->delalloc_inodes);
    8624     3853026 :         INIT_LIST_HEAD(&ei->delayed_iput);
    8625     3853026 :         RB_CLEAR_NODE(&ei->rb_node);
    8626     3853026 :         init_rwsem(&ei->i_mmap_lock);
    8627             : 
    8628     3853026 :         return inode;
    8629             : }
    8630             : 
    8631             : #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
                       /*
                        * Inode teardown used only when the sanity tests are built in: drop every
                        * extent map of @inode and hand the containing btrfs_inode straight back to
                        * btrfs_inode_cachep.
                        */
    8632             : void btrfs_test_destroy_inode(struct inode *inode)
    8633             : {
    8634             :         btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
    8635             :         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
    8636             : }
    8637             : #endif
    8638             : 
                       /*
                        * Return the btrfs_inode that embeds @inode to btrfs_inode_cachep.  No other
                        * cleanup is done here.
                        */
    8639     3856690 : void btrfs_free_inode(struct inode *inode)
    8640             : {
    8641     3856690 :         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
    8642     3862595 : }
    8643             : 
                       /*
                        * Tear down the btrfs-specific state of @vfs_inode.
                        *
                        * Warns if the inode still has dentries, cached pages, reserved space,
                        * outstanding extents or pending byte counters.  For an inode that has a
                        * root it then removes any ordered extents that are still attached (logging
                        * an error for each), checks for leaked qgroup reservations, removes the
                        * inode from the inode tree, drops its extent maps and file extent range,
                        * and finally drops the reference on the root.
                        *
                        * The memory itself is not freed here.
                        */
    8644     3867009 : void btrfs_destroy_inode(struct inode *vfs_inode)
    8645             : {
    8646     3867009 :         struct btrfs_ordered_extent *ordered;
    8647     3867009 :         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
    8648     3867009 :         struct btrfs_root *root = inode->root;
    8649     3867009 :         bool freespace_inode;
    8650             : 
    8651     3867009 :         WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
    8652     3867009 :         WARN_ON(vfs_inode->i_data.nrpages);
    8653     3867009 :         WARN_ON(inode->block_rsv.reserved);
    8654     3867009 :         WARN_ON(inode->block_rsv.size);
    8655     3867009 :         WARN_ON(inode->outstanding_extents);
                               /* The delalloc counters are only checked for non-directories. */
    8656     3867009 :         if (!S_ISDIR(vfs_inode->i_mode)) {
    8657     3698411 :                 WARN_ON(inode->delalloc_bytes);
    8658     3698411 :                 WARN_ON(inode->new_delalloc_bytes);
    8659             :         }
    8660     3867009 :         WARN_ON(inode->csum_bytes);
    8661     3867009 :         WARN_ON(inode->defrag_bytes);
    8662             : 
    8663             :         /*
    8664             :          * This can happen where we create an inode, but somebody else also
    8665             :          * created the same inode and we need to destroy the one we already
    8666             :          * created.
    8667             :          */
    8668     3867009 :         if (!root)
    8669             :                 return;
    8670             : 
    8671             :         /*
    8672             :          * If this is a free space inode do not take the ordered extents lockdep
    8673             :          * map.
    8674             :          */
    8675     3865331 :         freespace_inode = btrfs_is_free_space_inode(inode);
    8676             : 
                               /* Ordered extents should all be gone by now; clean up any that are not. */
    8677     3865331 :         while (1) {
    8678     3865331 :                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
    8679     3865321 :                 if (!ordered)
    8680             :                         break;
    8681             :                 else {
    8682           0 :                         btrfs_err(root->fs_info,
    8683             :                                   "found ordered extent %llu %llu on inode cleanup",
    8684             :                                   ordered->file_offset, ordered->num_bytes);
    8685             : 
    8686           0 :                         if (!freespace_inode)
    8687           0 :                                 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
    8688             : 
    8689           0 :                         btrfs_remove_ordered_extent(inode, ordered);
                                               /*
                                                * NOTE(review): two puts - presumably one for the
                                                * reference taken by the lookup above and one for the
                                                * reference the ordered tree held; confirm.
                                                */
    8690           0 :                         btrfs_put_ordered_extent(ordered);
    8691           0 :                         btrfs_put_ordered_extent(ordered);
    8692             :                 }
    8693             :         }
    8694     3865321 :         btrfs_qgroup_check_reserved_leak(inode);
    8695     3865244 :         inode_tree_del(inode);
    8696     3865379 :         btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
    8697     3865355 :         btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
    8698     3865333 :         btrfs_put_root(inode->root);
    8699             : }
    8700             : 
                       /*
                        * Decide whether @inode should be dropped.
                        *
                        * Return: 1 if the inode has no root or its root has no references left
                        * (the subvolume/snapshot is being deleted); otherwise the result of
                        * generic_drop_inode().
                        *
                        * NOTE(review): a non-zero return presumably tells the VFS to evict the
                        * inode instead of caching it - confirm against the ->drop_inode contract.
                        */
    8701     4255618 : int btrfs_drop_inode(struct inode *inode)
    8702             : {
    8703     4255618 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    8704             : 
    8705     4255618 :         if (root == NULL)
    8706             :                 return 1;
    8707             : 
    8708             :         /* the snap/subvol tree is on deleting */
    8709     4253920 :         if (btrfs_root_refs(&root->root_item) == 0)
    8710             :                 return 1;
    8711             :         else
    8712     6715367 :                 return generic_drop_inode(inode);
    8713             : }
    8714             : 
                       /*
                        * Constructor passed to kmem_cache_create() for btrfs_inode_cachep.
                        * @foo is the struct btrfs_inode being constructed; only its embedded VFS
                        * inode is initialised here.
                        */
    8715     2649387 : static void init_once(void *foo)
    8716             : {
    8717     2649387 :         struct btrfs_inode *ei = foo;
    8718             : 
    8719     2649387 :         inode_init_once(&ei->vfs_inode);
    8720     2649332 : }
    8721             : 
                       /*
                        * Release what btrfs_init_cachep() set up: the direct I/O bioset and the
                        * btrfs inode slab cache.  Also used as the failure path of
                        * btrfs_init_cachep(), so it may run when only part of the setup succeeded.
                        */
    8722           0 : void __cold btrfs_destroy_cachep(void)
    8723             : {
    8724             :         /*
    8725             :          * Make sure all delayed rcu free inodes are flushed before we
    8726             :          * destroy cache.
    8727             :          */
    8728           0 :         rcu_barrier();
    8729           0 :         bioset_exit(&btrfs_dio_bioset);
    8730           0 :         kmem_cache_destroy(btrfs_inode_cachep);
    8731           0 : }
    8732             : 
                       /*
                        * Create the slab cache for struct btrfs_inode (constructed with
                        * init_once()) and initialise the bioset used for direct I/O.
                        *
                        * Return: 0 on success.  If either step fails, everything is torn down
                        * again through btrfs_destroy_cachep() and -ENOMEM is returned.
                        */
    8733          11 : int __init btrfs_init_cachep(void)
    8734             : {
    8735          11 :         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
    8736             :                         sizeof(struct btrfs_inode), 0,
    8737             :                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
    8738             :                         init_once);
    8739          11 :         if (!btrfs_inode_cachep)
    8740           0 :                 goto fail;
    8741             : 
                               /* Front padding places the bio at bbio.bio inside struct btrfs_dio_private. */
    8742          11 :         if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
    8743             :                         offsetof(struct btrfs_dio_private, bbio.bio),
    8744             :                         BIOSET_NEED_BVECS))
    8745           0 :                 goto fail;
    8746             : 
    8747             :         return 0;
    8748           0 : fail:
    8749           0 :         btrfs_destroy_cachep();
    8750           0 :         return -ENOMEM;
    8751             : }
    8752             : 
    8753    17663781 : static int btrfs_getattr(struct mnt_idmap *idmap,
    8754             :                          const struct path *path, struct kstat *stat,
    8755             :                          u32 request_mask, unsigned int flags)
    8756             : {
    8757    17663781 :         u64 delalloc_bytes;
    8758    17663781 :         u64 inode_bytes;
    8759    17663781 :         struct inode *inode = d_inode(path->dentry);
    8760    17663781 :         u32 blocksize = inode->i_sb->s_blocksize;
    8761    17663781 :         u32 bi_flags = BTRFS_I(inode)->flags;
    8762    17663781 :         u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
    8763             : 
    8764    17663781 :         stat->result_mask |= STATX_BTIME;
    8765    17663781 :         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
    8766    17663781 :         stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
    8767    17663781 :         if (bi_flags & BTRFS_INODE_APPEND)
    8768         155 :                 stat->attributes |= STATX_ATTR_APPEND;
    8769    17663781 :         if (bi_flags & BTRFS_INODE_COMPRESS)
    8770         258 :                 stat->attributes |= STATX_ATTR_COMPRESSED;
    8771    17663781 :         if (bi_flags & BTRFS_INODE_IMMUTABLE)
    8772         174 :                 stat->attributes |= STATX_ATTR_IMMUTABLE;
    8773    17663781 :         if (bi_flags & BTRFS_INODE_NODUMP)
    8774          66 :                 stat->attributes |= STATX_ATTR_NODUMP;
    8775    17663781 :         if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
    8776           0 :                 stat->attributes |= STATX_ATTR_VERITY;
    8777             : 
    8778    17663781 :         stat->attributes_mask |= (STATX_ATTR_APPEND |
    8779             :                                   STATX_ATTR_COMPRESSED |
    8780             :                                   STATX_ATTR_IMMUTABLE |
    8781             :                                   STATX_ATTR_NODUMP);
    8782             : 
    8783    17663781 :         generic_fillattr(idmap, inode, stat);
    8784    17660766 :         stat->dev = BTRFS_I(inode)->root->anon_dev;
    8785             : 
    8786    17660766 :         spin_lock(&BTRFS_I(inode)->lock);
    8787    17678805 :         delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
    8788    17678805 :         inode_bytes = inode_get_bytes(inode);
    8789    17681901 :         spin_unlock(&BTRFS_I(inode)->lock);
    8790    17683016 :         stat->blocks = (ALIGN(inode_bytes, blocksize) +
    8791    17683016 :                         ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
    8792    17683016 :         return 0;
    8793             : }
    8794             : 
    8795       19949 : static int btrfs_rename_exchange(struct inode *old_dir,
    8796             :                               struct dentry *old_dentry,
    8797             :                               struct inode *new_dir,
    8798             :                               struct dentry *new_dentry)
    8799             : {
    8800       19949 :         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
    8801       19949 :         struct btrfs_trans_handle *trans;
    8802       19949 :         unsigned int trans_num_items;
    8803       19949 :         struct btrfs_root *root = BTRFS_I(old_dir)->root;
    8804       19949 :         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
    8805       19949 :         struct inode *new_inode = new_dentry->d_inode;
    8806       19949 :         struct inode *old_inode = old_dentry->d_inode;
    8807       19949 :         struct timespec64 ctime = current_time(old_inode);
    8808       19949 :         struct btrfs_rename_ctx old_rename_ctx;
    8809       19949 :         struct btrfs_rename_ctx new_rename_ctx;
    8810       19949 :         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
    8811       19949 :         u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
    8812       19949 :         u64 old_idx = 0;
    8813       19949 :         u64 new_idx = 0;
    8814       19949 :         int ret;
    8815       19949 :         int ret2;
    8816       19949 :         bool need_abort = false;
    8817       19949 :         struct fscrypt_name old_fname, new_fname;
    8818       19949 :         struct fscrypt_str *old_name, *new_name;
    8819             : 
    8820             :         /*
    8821             :          * For non-subvolumes allow exchange only within one subvolume, in the
    8822             :          * same inode namespace. Two subvolumes (represented as directory) can
    8823             :          * be exchanged as they're a logical link and have a fixed inode number.
    8824             :          */
    8825       19949 :         if (root != dest &&
    8826          18 :             (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
    8827          18 :              new_ino != BTRFS_FIRST_FREE_OBJECTID))
    8828             :                 return -EXDEV;
    8829             : 
    8830       19932 :         ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
    8831       19932 :         if (ret)
    8832             :                 return ret;
    8833             : 
    8834       19932 :         ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
    8835       19932 :         if (ret) {
    8836             :                 fscrypt_free_filename(&old_fname);
    8837             :                 return ret;
    8838             :         }
    8839             : 
    8840       19932 :         old_name = &old_fname.disk_name;
    8841       19932 :         new_name = &new_fname.disk_name;
    8842             : 
    8843             :         /* close the race window with snapshot create/destroy ioctl */
    8844       19932 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
    8845       19932 :             new_ino == BTRFS_FIRST_FREE_OBJECTID)
    8846           1 :                 down_read(&fs_info->subvol_sem);
    8847             : 
    8848             :         /*
    8849             :          * For each inode:
    8850             :          * 1 to remove old dir item
    8851             :          * 1 to remove old dir index
    8852             :          * 1 to add new dir item
    8853             :          * 1 to add new dir index
    8854             :          * 1 to update parent inode
    8855             :          *
    8856             :          * If the parents are the same, we only need to account for one
    8857             :          */
    8858       19932 :         trans_num_items = (old_dir == new_dir ? 9 : 10);
    8859       19932 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
    8860             :                 /*
    8861             :                  * 1 to remove old root ref
    8862             :                  * 1 to remove old root backref
    8863             :                  * 1 to add new root ref
    8864             :                  * 1 to add new root backref
    8865             :                  */
    8866           1 :                 trans_num_items += 4;
    8867             :         } else {
    8868             :                 /*
    8869             :                  * 1 to update inode item
    8870             :                  * 1 to remove old inode ref
    8871             :                  * 1 to add new inode ref
    8872             :                  */
    8873       19931 :                 trans_num_items += 3;
    8874             :         }
    8875       19932 :         if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
    8876           1 :                 trans_num_items += 4;
    8877             :         else
    8878       19931 :                 trans_num_items += 3;
    8879       19932 :         trans = btrfs_start_transaction(root, trans_num_items);
    8880       19932 :         if (IS_ERR(trans)) {
    8881         248 :                 ret = PTR_ERR(trans);
    8882         248 :                 goto out_notrans;
    8883             :         }
    8884             : 
    8885       19684 :         if (dest != root) {
    8886           1 :                 ret = btrfs_record_root_in_trans(trans, dest);
    8887           1 :                 if (ret)
    8888           0 :                         goto out_fail;
    8889             :         }
    8890             : 
    8891             :         /*
    8892             :          * We need to find a free sequence number both in the source and
    8893             :          * in the destination directory for the exchange.
    8894             :          */
    8895       19684 :         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
    8896       19684 :         if (ret)
    8897           0 :                 goto out_fail;
    8898       19684 :         ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
    8899       19684 :         if (ret)
    8900           0 :                 goto out_fail;
    8901             : 
    8902       19684 :         BTRFS_I(old_inode)->dir_index = 0ULL;
    8903       19684 :         BTRFS_I(new_inode)->dir_index = 0ULL;
    8904             : 
    8905             :         /* Reference for the source. */
    8906       19684 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
    8907             :                 /* force full log commit if subvolume involved. */
    8908           1 :                 btrfs_set_log_full_commit(trans);
    8909             :         } else {
    8910       19683 :                 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
    8911             :                                              btrfs_ino(BTRFS_I(new_dir)),
    8912             :                                              old_idx);
    8913       19683 :                 if (ret)
    8914           0 :                         goto out_fail;
    8915             :                 need_abort = true;
    8916             :         }
    8917             : 
    8918             :         /* And now for the dest. */
    8919       19684 :         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
    8920             :                 /* force full log commit if subvolume involved. */
    8921           1 :                 btrfs_set_log_full_commit(trans);
    8922             :         } else {
    8923       19683 :                 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
    8924             :                                              btrfs_ino(BTRFS_I(old_dir)),
    8925             :                                              new_idx);
    8926       19683 :                 if (ret) {
    8927           0 :                         if (need_abort)
    8928           0 :                                 btrfs_abort_transaction(trans, ret);
    8929           0 :                         goto out_fail;
    8930             :                 }
    8931             :         }
    8932             : 
    8933             :         /* Update inode version and ctime/mtime. */
    8934       19684 :         inode_inc_iversion(old_dir);
    8935       19684 :         inode_inc_iversion(new_dir);
    8936       19684 :         inode_inc_iversion(old_inode);
    8937       19684 :         inode_inc_iversion(new_inode);
    8938       19684 :         old_dir->i_mtime = ctime;
    8939       19684 :         old_dir->i_ctime = ctime;
    8940       19684 :         new_dir->i_mtime = ctime;
    8941       19684 :         new_dir->i_ctime = ctime;
    8942       19684 :         old_inode->i_ctime = ctime;
    8943       19684 :         new_inode->i_ctime = ctime;
    8944             : 
    8945       19684 :         if (old_dentry->d_parent != new_dentry->d_parent) {
    8946       18416 :                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
    8947             :                                         BTRFS_I(old_inode), true);
    8948       18416 :                 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
    8949             :                                         BTRFS_I(new_inode), true);
    8950             :         }
    8951             : 
    8952             :         /* src is a subvolume */
    8953       19684 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
    8954           1 :                 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
    8955             :         } else { /* src is an inode */
    8956       19683 :                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
    8957       19683 :                                            BTRFS_I(old_dentry->d_inode),
    8958             :                                            old_name, &old_rename_ctx);
    8959       19683 :                 if (!ret)
    8960       19683 :                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
    8961             :         }
    8962       19684 :         if (ret) {
    8963           0 :                 btrfs_abort_transaction(trans, ret);
    8964           0 :                 goto out_fail;
    8965             :         }
    8966             : 
    8967             :         /* dest is a subvolume */
    8968       19684 :         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
    8969           1 :                 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
    8970             :         } else { /* dest is an inode */
    8971       19683 :                 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
    8972       19683 :                                            BTRFS_I(new_dentry->d_inode),
    8973             :                                            new_name, &new_rename_ctx);
    8974       19683 :                 if (!ret)
    8975       19683 :                         ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
    8976             :         }
    8977       19684 :         if (ret) {
    8978           0 :                 btrfs_abort_transaction(trans, ret);
    8979           0 :                 goto out_fail;
    8980             :         }
    8981             : 
    8982       19684 :         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
    8983             :                              new_name, 0, old_idx);
    8984       19684 :         if (ret) {
    8985           0 :                 btrfs_abort_transaction(trans, ret);
    8986           0 :                 goto out_fail;
    8987             :         }
    8988             : 
    8989       19684 :         ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
    8990             :                              old_name, 0, new_idx);
    8991       19684 :         if (ret) {
    8992           0 :                 btrfs_abort_transaction(trans, ret);
    8993           0 :                 goto out_fail;
    8994             :         }
    8995             : 
    8996       19684 :         if (old_inode->i_nlink == 1)
    8997       17097 :                 BTRFS_I(old_inode)->dir_index = old_idx;
    8998       19684 :         if (new_inode->i_nlink == 1)
    8999       17073 :                 BTRFS_I(new_inode)->dir_index = new_idx;
    9000             : 
    9001             :         /*
    9002             :          * Now pin the logs of the roots. We do it to ensure that no other task
    9003             :          * can sync the logs while we are in progress with the rename, because
    9004             :          * that could result in an inconsistency in case any of the inodes that
    9005             :          * are part of this rename operation were logged before.
    9006             :          */
    9007       19684 :         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
    9008       19683 :                 btrfs_pin_log_trans(root);
    9009       19684 :         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
    9010       19683 :                 btrfs_pin_log_trans(dest);
    9011             : 
    9012             :         /* Do the log updates for all inodes. */
    9013       19684 :         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
    9014       19683 :                 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
    9015             :                                    old_rename_ctx.index, new_dentry->d_parent);
    9016       19684 :         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
    9017       19683 :                 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
    9018             :                                    new_rename_ctx.index, old_dentry->d_parent);
    9019             : 
    9020             :         /* Now unpin the logs. */
    9021       19684 :         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
    9022       19683 :                 btrfs_end_log_trans(root);
    9023       19684 :         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
    9024       19683 :                 btrfs_end_log_trans(dest);
    9025           1 : out_fail:
    9026       19684 :         ret2 = btrfs_end_transaction(trans);
    9027       19684 :         ret = ret ? ret : ret2;
    9028       19932 : out_notrans:
    9029       19932 :         if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
    9030             :             old_ino == BTRFS_FIRST_FREE_OBJECTID)
    9031           1 :                 up_read(&fs_info->subvol_sem);
    9032             : 
    9033             :         fscrypt_free_filename(&new_fname);
    9034             :         fscrypt_free_filename(&old_fname);
    9035             :         return ret;
    9036             : }
    9037             : 
    9038       35445 : static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
    9039             :                                         struct inode *dir)
    9040             : {
    9041       35445 :         struct inode *inode;
    9042             : 
    9043       35445 :         inode = new_inode(dir->i_sb);
    9044       35446 :         if (inode) {
    9045       35446 :                 inode_init_owner(idmap, inode, dir,
    9046             :                                  S_IFCHR | WHITEOUT_MODE);
    9047       35443 :                 inode->i_op = &btrfs_special_inode_operations;
    9048       35443 :                 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
    9049             :         }
    9050       35442 :         return inode;
    9051             : }
    9052             : 
    9053      424351 : static int btrfs_rename(struct mnt_idmap *idmap,
    9054             :                         struct inode *old_dir, struct dentry *old_dentry,
    9055             :                         struct inode *new_dir, struct dentry *new_dentry,
    9056             :                         unsigned int flags)
    9057             : {
    9058      424351 :         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
    9059      424351 :         struct btrfs_new_inode_args whiteout_args = {
    9060             :                 .dir = old_dir,
    9061             :                 .dentry = old_dentry,
    9062             :         };
    9063      424351 :         struct btrfs_trans_handle *trans;
    9064      424351 :         unsigned int trans_num_items;
    9065      424351 :         struct btrfs_root *root = BTRFS_I(old_dir)->root;
    9066      424351 :         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
    9067      424351 :         struct inode *new_inode = d_inode(new_dentry);
    9068      424351 :         struct inode *old_inode = d_inode(old_dentry);
    9069      424351 :         struct btrfs_rename_ctx rename_ctx;
    9070      424351 :         u64 index = 0;
    9071      424351 :         int ret;
    9072      424351 :         int ret2;
    9073      424351 :         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
    9074      424351 :         struct fscrypt_name old_fname, new_fname;
    9075             : 
    9076      424351 :         if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
    9077             :                 return -EPERM;
    9078             : 
    9079             :         /* we only allow rename subvolume link between subvolumes */
    9080      424351 :         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
    9081             :                 return -EXDEV;
    9082             : 
    9083      424336 :         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
    9084       38685 :             (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
    9085             :                 return -ENOTEMPTY;
    9086             : 
    9087      424336 :         if (S_ISDIR(old_inode->i_mode) && new_inode &&
    9088         182 :             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
    9089             :                 return -ENOTEMPTY;
    9090             : 
    9091      424327 :         ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
    9092      424324 :         if (ret)
    9093             :                 return ret;
    9094             : 
    9095      424327 :         ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
    9096      424323 :         if (ret) {
    9097             :                 fscrypt_free_filename(&old_fname);
    9098             :                 return ret;
    9099             :         }
    9100             : 
    9101             :         /* check for collisions, even if the  name isn't there */
    9102      424327 :         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
    9103      424326 :         if (ret) {
    9104       38676 :                 if (ret == -EEXIST) {
    9105             :                         /* we shouldn't get
    9106             :                          * eexist without a new_inode */
    9107       38676 :                         if (WARN_ON(!new_inode)) {
    9108           0 :                                 goto out_fscrypt_names;
    9109             :                         }
    9110             :                 } else {
    9111             :                         /* maybe -EOVERFLOW */
    9112           0 :                         goto out_fscrypt_names;
    9113             :                 }
    9114             :         }
    9115      424326 :         ret = 0;
    9116             : 
    9117             :         /*
    9118             :          * we're using rename to replace one file with another.  Start IO on it
    9119             :          * now so  we don't add too much work to the end of the transaction
    9120             :          */
    9121      424326 :         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
    9122       33064 :                 filemap_flush(old_inode->i_mapping);
    9123             : 
    9124      424326 :         if (flags & RENAME_WHITEOUT) {
    9125       35446 :                 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
    9126       35442 :                 if (!whiteout_args.inode) {
    9127           0 :                         ret = -ENOMEM;
    9128           0 :                         goto out_fscrypt_names;
    9129             :                 }
    9130       35442 :                 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
    9131       35444 :                 if (ret)
    9132           0 :                         goto out_whiteout_inode;
    9133             :         } else {
    9134             :                 /* 1 to update the old parent inode. */
    9135      388880 :                 trans_num_items = 1;
    9136             :         }
    9137             : 
    9138      424324 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
    9139             :                 /* Close the race window with snapshot create/destroy ioctl */
    9140           4 :                 down_read(&fs_info->subvol_sem);
    9141             :                 /*
    9142             :                  * 1 to remove old root ref
    9143             :                  * 1 to remove old root backref
    9144             :                  * 1 to add new root ref
    9145             :                  * 1 to add new root backref
    9146             :                  */
    9147           4 :                 trans_num_items += 4;
    9148             :         } else {
    9149             :                 /*
    9150             :                  * 1 to update inode
    9151             :                  * 1 to remove old inode ref
    9152             :                  * 1 to add new inode ref
    9153             :                  */
    9154      424320 :                 trans_num_items += 3;
    9155             :         }
    9156             :         /*
    9157             :          * 1 to remove old dir item
    9158             :          * 1 to remove old dir index
    9159             :          * 1 to add new dir item
    9160             :          * 1 to add new dir index
    9161             :          */
    9162      424324 :         trans_num_items += 4;
    9163             :         /* 1 to update new parent inode if it's not the same as the old parent */
    9164      424324 :         if (new_dir != old_dir)
    9165      122031 :                 trans_num_items++;
    9166      424324 :         if (new_inode) {
    9167             :                 /*
    9168             :                  * 1 to update inode
    9169             :                  * 1 to remove inode ref
    9170             :                  * 1 to remove dir item
    9171             :                  * 1 to remove dir index
    9172             :                  * 1 to possibly add orphan item
    9173             :                  */
    9174       38672 :                 trans_num_items += 5;
    9175             :         }
    9176      424324 :         trans = btrfs_start_transaction(root, trans_num_items);
    9177      424330 :         if (IS_ERR(trans)) {
    9178         839 :                 ret = PTR_ERR(trans);
    9179         839 :                 goto out_notrans;
    9180             :         }
    9181             : 
    9182      423491 :         if (dest != root) {
    9183           3 :                 ret = btrfs_record_root_in_trans(trans, dest);
    9184           3 :                 if (ret)
    9185           0 :                         goto out_fail;
    9186             :         }
    9187             : 
    9188      423491 :         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
    9189      423489 :         if (ret)
    9190           0 :                 goto out_fail;
    9191             : 
    9192      423489 :         BTRFS_I(old_inode)->dir_index = 0ULL;
    9193      423489 :         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
    9194             :                 /* force full log commit if subvolume involved. */
    9195           4 :                 btrfs_set_log_full_commit(trans);
    9196             :         } else {
    9197      423485 :                 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
    9198             :                                              old_ino, btrfs_ino(BTRFS_I(new_dir)),
    9199             :                                              index);
    9200      423487 :                 if (ret)
    9201           0 :                         goto out_fail;
    9202             :         }
    9203             : 
    9204      423491 :         inode_inc_iversion(old_dir);
    9205      423491 :         inode_inc_iversion(new_dir);
    9206      423491 :         inode_inc_iversion(old_inode);
    9207      423491 :         old_dir->i_mtime = current_time(old_dir);
    9208      423491 :         old_dir->i_ctime = old_dir->i_mtime;
    9209      423491 :         new_dir->i_mtime = old_dir->i_mtime;
    9210      423491 :         new_dir->i_ctime = old_dir->i_mtime;
    9211      423491 :         old_inode->i_ctime = old_dir->i_mtime;
    9212             : 
    9213      423491 :         if (old_dentry->d_parent != new_dentry->d_parent)
    9214      121316 :                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
    9215             :                                         BTRFS_I(old_inode), true);
    9216             : 
    9217      423491 :         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
    9218           4 :                 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
    9219             :         } else {
    9220      423487 :                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
    9221             :                                            BTRFS_I(d_inode(old_dentry)),
    9222             :                                            &old_fname.disk_name, &rename_ctx);
    9223      423487 :                 if (!ret)
    9224      423487 :                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
    9225             :         }
    9226      423491 :         if (ret) {
    9227           0 :                 btrfs_abort_transaction(trans, ret);
    9228           0 :                 goto out_fail;
    9229             :         }
    9230             : 
    9231      423491 :         if (new_inode) {
    9232       38676 :                 inode_inc_iversion(new_inode);
    9233       38676 :                 new_inode->i_ctime = current_time(new_inode);
    9234       38676 :                 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
    9235             :                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
    9236           0 :                         ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
    9237           0 :                         BUG_ON(new_inode->i_nlink == 0);
    9238             :                 } else {
    9239       38676 :                         ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
    9240             :                                                  BTRFS_I(d_inode(new_dentry)),
    9241             :                                                  &new_fname.disk_name);
    9242             :                 }
    9243       38676 :                 if (!ret && new_inode->i_nlink == 0)
    9244       38663 :                         ret = btrfs_orphan_add(trans,
    9245             :                                         BTRFS_I(d_inode(new_dentry)));
    9246       38676 :                 if (ret) {
    9247           0 :                         btrfs_abort_transaction(trans, ret);
    9248           0 :                         goto out_fail;
    9249             :                 }
    9250             :         }
    9251             : 
    9252      423491 :         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
    9253             :                              &new_fname.disk_name, 0, index);
    9254      423491 :         if (ret) {
    9255           0 :                 btrfs_abort_transaction(trans, ret);
    9256           0 :                 goto out_fail;
    9257             :         }
    9258             : 
    9259      423491 :         if (old_inode->i_nlink == 1)
    9260      414367 :                 BTRFS_I(old_inode)->dir_index = index;
    9261             : 
    9262      423491 :         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
    9263      423487 :                 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
    9264             :                                    rename_ctx.index, new_dentry->d_parent);
    9265             : 
    9266      423491 :         if (flags & RENAME_WHITEOUT) {
    9267       35176 :                 ret = btrfs_create_new_inode(trans, &whiteout_args);
    9268       35176 :                 if (ret) {
    9269           0 :                         btrfs_abort_transaction(trans, ret);
    9270           0 :                         goto out_fail;
    9271             :                 } else {
    9272       35176 :                         unlock_new_inode(whiteout_args.inode);
    9273       35176 :                         iput(whiteout_args.inode);
    9274       35176 :                         whiteout_args.inode = NULL;
    9275             :                 }
    9276             :         }
    9277      388315 : out_fail:
    9278      423491 :         ret2 = btrfs_end_transaction(trans);
    9279      423488 :         ret = ret ? ret : ret2;
    9280      424327 : out_notrans:
    9281      424327 :         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
    9282           4 :                 up_read(&fs_info->subvol_sem);
    9283      424327 :         if (flags & RENAME_WHITEOUT)
    9284       35444 :                 btrfs_new_inode_args_destroy(&whiteout_args);
    9285      388883 : out_whiteout_inode:
    9286      424329 :         if (flags & RENAME_WHITEOUT)
    9287       35445 :                 iput(whiteout_args.inode);
    9288      388884 : out_fscrypt_names:
    9289             :         fscrypt_free_filename(&old_fname);
    9290             :         fscrypt_free_filename(&new_fname);
    9291             :         return ret;
    9292             : }
    9293             : 
    9294      444301 : static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
    9295             :                          struct dentry *old_dentry, struct inode *new_dir,
    9296             :                          struct dentry *new_dentry, unsigned int flags)
    9297             : {
    9298      444301 :         int ret;
    9299             : 
    9300      444301 :         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
    9301             :                 return -EINVAL;
    9302             : 
    9303      444301 :         if (flags & RENAME_EXCHANGE)
    9304       19949 :                 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
    9305             :                                             new_dentry);
    9306             :         else
    9307      424352 :                 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
    9308             :                                    new_dentry, flags);
    9309             : 
    9310      444300 :         btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
    9311             : 
    9312      444300 :         return ret;
    9313             : }
    9314             : 
    9315             : struct btrfs_delalloc_work {
    9316             :         struct inode *inode;
    9317             :         struct completion completion;
    9318             :         struct list_head list;
    9319             :         struct btrfs_work work;
    9320             : };
    9321             : 
    9322       20374 : static void btrfs_run_delalloc_work(struct btrfs_work *work)
    9323             : {
    9324       20374 :         struct btrfs_delalloc_work *delalloc_work;
    9325       20374 :         struct inode *inode;
    9326             : 
    9327       20374 :         delalloc_work = container_of(work, struct btrfs_delalloc_work,
    9328             :                                      work);
    9329       20374 :         inode = delalloc_work->inode;
    9330       20374 :         filemap_flush(inode->i_mapping);
    9331       20363 :         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
    9332             :                                 &BTRFS_I(inode)->runtime_flags))
    9333           5 :                 filemap_flush(inode->i_mapping);
    9334             : 
    9335       20363 :         iput(inode);
    9336       20361 :         complete(&delalloc_work->completion);
    9337       20346 : }
    9338             : 
    9339       20374 : static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
    9340             : {
    9341       20374 :         struct btrfs_delalloc_work *work;
    9342             : 
    9343       20374 :         work = kmalloc(sizeof(*work), GFP_NOFS);
    9344       20374 :         if (!work)
    9345             :                 return NULL;
    9346             : 
    9347       20374 :         init_completion(&work->completion);
    9348       20374 :         INIT_LIST_HEAD(&work->list);
    9349       20374 :         work->inode = inode;
    9350       20374 :         btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
    9351             : 
    9352       20374 :         return work;
    9353             : }
    9354             : 
    9355             : /*
    9356             :  * some fairly slow code that needs optimization. This walks the list
    9357             :  * of all the inodes with pending delalloc and forces them to disk.
    9358             :  */
    9359       36375 : static int start_delalloc_inodes(struct btrfs_root *root,
    9360             :                                  struct writeback_control *wbc, bool snapshot,
    9361             :                                  bool in_reclaim_context)
    9362             : {
    9363       36375 :         struct btrfs_inode *binode;
    9364       36375 :         struct inode *inode;
    9365       36375 :         struct btrfs_delalloc_work *work, *next;
    9366       36375 :         struct list_head works;
    9367       36375 :         struct list_head splice;
    9368       36375 :         int ret = 0;
    9369       36375 :         bool full_flush = wbc->nr_to_write == LONG_MAX;
    9370             : 
    9371       36375 :         INIT_LIST_HEAD(&works);
    9372       36375 :         INIT_LIST_HEAD(&splice);
    9373             : 
    9374       36375 :         mutex_lock(&root->delalloc_mutex);
    9375       36374 :         spin_lock(&root->delalloc_lock);
    9376       36375 :         list_splice_init(&root->delalloc_inodes, &splice);
    9377     1560874 :         while (!list_empty(&splice)) {
    9378     1552950 :                 binode = list_entry(splice.next, struct btrfs_inode,
    9379             :                                     delalloc_inodes);
    9380             : 
    9381     1552950 :                 list_move_tail(&binode->delalloc_inodes,
    9382             :                                &root->delalloc_inodes);
    9383             : 
    9384     3086963 :                 if (in_reclaim_context &&
    9385     1534014 :                     test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
    9386           0 :                         continue;
    9387             : 
    9388     1552949 :                 inode = igrab(&binode->vfs_inode);
    9389     1552949 :                 if (!inode) {
    9390          18 :                         cond_resched_lock(&root->delalloc_lock);
    9391          18 :                         continue;
    9392             :                 }
    9393     1552931 :                 spin_unlock(&root->delalloc_lock);
    9394             : 
    9395     1552931 :                 if (snapshot)
    9396       20241 :                         set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
    9397             :                                 &binode->runtime_flags);
    9398     1552931 :                 if (full_flush) {
    9399       20374 :                         work = btrfs_alloc_delalloc_work(inode);
    9400       20374 :                         if (!work) {
    9401           0 :                                 iput(inode);
    9402           0 :                                 ret = -ENOMEM;
    9403           0 :                                 goto out;
    9404             :                         }
    9405       20374 :                         list_add_tail(&work->list, &works);
    9406       20374 :                         btrfs_queue_work(root->fs_info->flush_workers,
    9407             :                                          &work->work);
    9408             :                 } else {
    9409     1532557 :                         ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
    9410     1532557 :                         btrfs_add_delayed_iput(BTRFS_I(inode));
    9411     1532557 :                         if (ret || wbc->nr_to_write <= 0)
    9412       28450 :                                 goto out;
    9413             :                 }
    9414     1524481 :                 cond_resched();
    9415     1524481 :                 spin_lock(&root->delalloc_lock);
    9416             :         }
    9417        7924 :         spin_unlock(&root->delalloc_lock);
    9418             : 
    9419       36375 : out:
    9420       56749 :         list_for_each_entry_safe(work, next, &works, list) {
    9421       20374 :                 list_del_init(&work->list);
    9422       20374 :                 wait_for_completion(&work->completion);
    9423       20374 :                 kfree(work);
    9424             :         }
    9425             : 
    9426       36375 :         if (!list_empty(&splice)) {
    9427         757 :                 spin_lock(&root->delalloc_lock);
    9428         757 :                 list_splice_tail(&splice, &root->delalloc_inodes);
    9429         757 :                 spin_unlock(&root->delalloc_lock);
    9430             :         }
    9431       36375 :         mutex_unlock(&root->delalloc_mutex);
    9432       36373 :         return ret;
    9433             : }
    9434             : 
    9435        7060 : int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
    9436             : {
    9437        7060 :         struct writeback_control wbc = {
    9438             :                 .nr_to_write = LONG_MAX,
    9439             :                 .sync_mode = WB_SYNC_NONE,
    9440             :                 .range_start = 0,
    9441             :                 .range_end = LLONG_MAX,
    9442             :         };
    9443        7060 :         struct btrfs_fs_info *fs_info = root->fs_info;
    9444             : 
    9445        7060 :         if (BTRFS_FS_ERROR(fs_info))
    9446             :                 return -EROFS;
    9447             : 
    9448        7060 :         return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
    9449             : }
    9450             : 
    9451       32217 : int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
    9452             :                                bool in_reclaim_context)
    9453             : {
    9454       32217 :         struct writeback_control wbc = {
    9455             :                 .nr_to_write = nr,
    9456             :                 .sync_mode = WB_SYNC_NONE,
    9457             :                 .range_start = 0,
    9458             :                 .range_end = LLONG_MAX,
    9459             :         };
    9460       32217 :         struct btrfs_root *root;
    9461       32217 :         struct list_head splice;
    9462       32217 :         int ret;
    9463             : 
    9464       32217 :         if (BTRFS_FS_ERROR(fs_info))
    9465             :                 return -EROFS;
    9466             : 
    9467       32217 :         INIT_LIST_HEAD(&splice);
    9468             : 
    9469       32217 :         mutex_lock(&fs_info->delalloc_root_mutex);
    9470       32217 :         spin_lock(&fs_info->delalloc_root_lock);
    9471       32217 :         list_splice_init(&fs_info->delalloc_roots, &splice);
    9472       33081 :         while (!list_empty(&splice)) {
    9473             :                 /*
    9474             :                  * Reset nr_to_write here so we know that we're doing a full
    9475             :                  * flush.
    9476             :                  */
    9477       29314 :                 if (nr == LONG_MAX)
    9478           5 :                         wbc.nr_to_write = LONG_MAX;
    9479             : 
    9480       29314 :                 root = list_first_entry(&splice, struct btrfs_root,
    9481             :                                         delalloc_root);
    9482       29314 :                 root = btrfs_grab_root(root);
    9483       29314 :                 BUG_ON(!root);
    9484       29314 :                 list_move_tail(&root->delalloc_root,
    9485             :                                &fs_info->delalloc_roots);
    9486       29314 :                 spin_unlock(&fs_info->delalloc_root_lock);
    9487             : 
    9488       29314 :                 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
    9489       29314 :                 btrfs_put_root(root);
    9490       29314 :                 if (ret < 0 || wbc.nr_to_write <= 0)
    9491       28450 :                         goto out;
    9492         864 :                 spin_lock(&fs_info->delalloc_root_lock);
    9493             :         }
    9494        3767 :         spin_unlock(&fs_info->delalloc_root_lock);
    9495             : 
    9496        3767 :         ret = 0;
    9497       32217 : out:
    9498       32217 :         if (!list_empty(&splice)) {
    9499           2 :                 spin_lock(&fs_info->delalloc_root_lock);
    9500           2 :                 list_splice_tail(&splice, &fs_info->delalloc_roots);
    9501           2 :                 spin_unlock(&fs_info->delalloc_root_lock);
    9502             :         }
    9503       32217 :         mutex_unlock(&fs_info->delalloc_root_mutex);
    9504       32217 :         return ret;
    9505             : }
    9506             : 
    9507       34982 : static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
    9508             :                          struct dentry *dentry, const char *symname)
    9509             : {
    9510       34982 :         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
    9511       34982 :         struct btrfs_trans_handle *trans;
    9512       34982 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    9513       34982 :         struct btrfs_path *path;
    9514       34982 :         struct btrfs_key key;
    9515       34982 :         struct inode *inode;
    9516       34982 :         struct btrfs_new_inode_args new_inode_args = {
    9517             :                 .dir = dir,
    9518             :                 .dentry = dentry,
    9519             :         };
    9520       34982 :         unsigned int trans_num_items;
    9521       34982 :         int err;
    9522       34982 :         int name_len;
    9523       34982 :         int datasize;
    9524       34982 :         unsigned long ptr;
    9525       34982 :         struct btrfs_file_extent_item *ei;
    9526       34982 :         struct extent_buffer *leaf;
    9527             : 
    9528       34982 :         name_len = strlen(symname);
    9529       34982 :         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
    9530             :                 return -ENAMETOOLONG;
    9531             : 
    9532       34982 :         inode = new_inode(dir->i_sb);
    9533       34981 :         if (!inode)
    9534             :                 return -ENOMEM;
    9535       34981 :         inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
    9536       34980 :         inode->i_op = &btrfs_symlink_inode_operations;
    9537       34980 :         inode_nohighmem(inode);
    9538       34980 :         inode->i_mapping->a_ops = &btrfs_aops;
    9539       34980 :         btrfs_i_size_write(BTRFS_I(inode), name_len);
    9540       34980 :         inode_set_bytes(inode, name_len);
    9541             : 
    9542       34980 :         new_inode_args.inode = inode;
    9543       34980 :         err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
    9544       34980 :         if (err)
    9545           0 :                 goto out_inode;
    9546             :         /* 1 additional item for the inline extent */
    9547       34980 :         trans_num_items++;
    9548             : 
    9549       34980 :         trans = btrfs_start_transaction(root, trans_num_items);
    9550       34982 :         if (IS_ERR(trans)) {
    9551         280 :                 err = PTR_ERR(trans);
    9552         280 :                 goto out_new_inode_args;
    9553             :         }
    9554             : 
    9555       34702 :         err = btrfs_create_new_inode(trans, &new_inode_args);
    9556       34702 :         if (err)
    9557           0 :                 goto out;
    9558             : 
    9559       34702 :         path = btrfs_alloc_path();
    9560       34701 :         if (!path) {
    9561           0 :                 err = -ENOMEM;
    9562           0 :                 btrfs_abort_transaction(trans, err);
    9563           0 :                 discard_new_inode(inode);
    9564           0 :                 inode = NULL;
    9565           0 :                 goto out;
    9566             :         }
    9567       34701 :         key.objectid = btrfs_ino(BTRFS_I(inode));
    9568       34701 :         key.offset = 0;
    9569       34701 :         key.type = BTRFS_EXTENT_DATA_KEY;
    9570       34701 :         datasize = btrfs_file_extent_calc_inline_size(name_len);
    9571       34701 :         err = btrfs_insert_empty_item(trans, root, path, &key,
    9572             :                                       datasize);
    9573       34702 :         if (err) {
    9574           0 :                 btrfs_abort_transaction(trans, err);
    9575           0 :                 btrfs_free_path(path);
    9576           0 :                 discard_new_inode(inode);
    9577           0 :                 inode = NULL;
    9578           0 :                 goto out;
    9579             :         }
    9580       34702 :         leaf = path->nodes[0];
    9581       34702 :         ei = btrfs_item_ptr(leaf, path->slots[0],
    9582             :                             struct btrfs_file_extent_item);
    9583       34702 :         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
    9584       34702 :         btrfs_set_file_extent_type(leaf, ei,
    9585             :                                    BTRFS_FILE_EXTENT_INLINE);
    9586       34702 :         btrfs_set_file_extent_encryption(leaf, ei, 0);
    9587       34702 :         btrfs_set_file_extent_compression(leaf, ei, 0);
    9588       34702 :         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
    9589       34702 :         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
    9590             : 
    9591       34702 :         ptr = btrfs_file_extent_inline_start(ei);
    9592       34702 :         write_extent_buffer(leaf, symname, ptr, name_len);
    9593       34702 :         btrfs_mark_buffer_dirty(leaf);
    9594       34702 :         btrfs_free_path(path);
    9595             : 
    9596       34702 :         d_instantiate_new(dentry, inode);
    9597       34702 :         err = 0;
    9598       34702 : out:
    9599       34702 :         btrfs_end_transaction(trans);
    9600       34702 :         btrfs_btree_balance_dirty(fs_info);
    9601       34982 : out_new_inode_args:
    9602       34982 :         btrfs_new_inode_args_destroy(&new_inode_args);
    9603       34981 : out_inode:
    9604       34981 :         if (err)
    9605         280 :                 iput(inode);
    9606             :         return err;
    9607             : }
    9608             : 
    9609      439577 : static struct btrfs_trans_handle *insert_prealloc_file_extent(
    9610             :                                        struct btrfs_trans_handle *trans_in,
    9611             :                                        struct btrfs_inode *inode,
    9612             :                                        struct btrfs_key *ins,
    9613             :                                        u64 file_offset)
    9614             : {
    9615      439577 :         struct btrfs_file_extent_item stack_fi;
    9616      439577 :         struct btrfs_replace_extent_info extent_info;
    9617      439577 :         struct btrfs_trans_handle *trans = trans_in;
    9618      439577 :         struct btrfs_path *path;
    9619      439577 :         u64 start = ins->objectid;
    9620      439577 :         u64 len = ins->offset;
    9621      439577 :         int qgroup_released;
    9622      439577 :         int ret;
    9623             : 
    9624      439577 :         memset(&stack_fi, 0, sizeof(stack_fi));
    9625             : 
    9626      439577 :         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
    9627      439577 :         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
    9628      439577 :         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
    9629      439577 :         btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
    9630      439577 :         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
    9631      439577 :         btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
    9632             :         /* Encryption and other encoding is reserved and all 0 */
    9633             : 
    9634      439577 :         qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
    9635      439575 :         if (qgroup_released < 0)
    9636           0 :                 return ERR_PTR(qgroup_released);
    9637             : 
    9638      439575 :         if (trans) {
    9639          16 :                 ret = insert_reserved_file_extent(trans, inode,
    9640             :                                                   file_offset, &stack_fi,
    9641             :                                                   true, qgroup_released);
    9642          16 :                 if (ret)
    9643           0 :                         goto free_qgroup;
    9644          16 :                 return trans;
    9645             :         }
    9646             : 
    9647      439559 :         extent_info.disk_offset = start;
    9648      439559 :         extent_info.disk_len = len;
    9649      439559 :         extent_info.data_offset = 0;
    9650      439559 :         extent_info.data_len = len;
    9651      439559 :         extent_info.file_offset = file_offset;
    9652      439559 :         extent_info.extent_buf = (char *)&stack_fi;
    9653      439559 :         extent_info.is_new_extent = true;
    9654      439559 :         extent_info.update_times = true;
    9655      439559 :         extent_info.qgroup_reserved = qgroup_released;
    9656      439559 :         extent_info.insertions = 0;
    9657             : 
    9658      439559 :         path = btrfs_alloc_path();
    9659      439562 :         if (!path) {
    9660           0 :                 ret = -ENOMEM;
    9661           0 :                 goto free_qgroup;
    9662             :         }
    9663             : 
    9664      439562 :         ret = btrfs_replace_file_extents(inode, path, file_offset,
    9665      439562 :                                      file_offset + len - 1, &extent_info,
    9666             :                                      &trans);
    9667      439562 :         btrfs_free_path(path);
    9668      439562 :         if (ret)
    9669          10 :                 goto free_qgroup;
    9670      439552 :         return trans;
    9671             : 
    9672          10 : free_qgroup:
    9673             :         /*
    9674             :          * We have released qgroup data range at the beginning of the function,
    9675             :          * and normally qgroup_released bytes will be freed when committing
    9676             :          * transaction.
    9677             :          * But if we error out early, we have to free what we have released
    9678             :          * or we leak qgroup data reservation.
    9679             :          */
    9680          10 :         btrfs_qgroup_free_refroot(inode->root->fs_info,
    9681          10 :                         inode->root->root_key.objectid, qgroup_released,
    9682             :                         BTRFS_QGROUP_RSV_DATA);
    9683          10 :         return ERR_PTR(ret);
    9684             : }
    9685             : 
    9686      437029 : static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
    9687             :                                        u64 start, u64 num_bytes, u64 min_size,
    9688             :                                        loff_t actual_len, u64 *alloc_hint,
    9689             :                                        struct btrfs_trans_handle *trans)
    9690             : {
    9691      437029 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    9692      437029 :         struct extent_map *em;
    9693      437029 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    9694      437029 :         struct btrfs_key ins;
    9695      437029 :         u64 cur_offset = start;
    9696      437029 :         u64 clear_offset = start;
    9697      437029 :         u64 i_size;
    9698      437029 :         u64 cur_bytes;
    9699      437029 :         u64 last_alloc = (u64)-1;
    9700      437029 :         int ret = 0;
    9701      437029 :         bool own_trans = true;
    9702      437029 :         u64 end = start + num_bytes - 1;
    9703             : 
    9704      437029 :         if (trans)
    9705          16 :                 own_trans = false;
    9706      876597 :         while (num_bytes > 0) {
    9707      439578 :                 cur_bytes = min_t(u64, num_bytes, SZ_256M);
    9708      439578 :                 cur_bytes = max(cur_bytes, min_size);
    9709             :                 /*
    9710             :                  * If we are severely fragmented we could end up with really
    9711             :                  * small allocations, so if the allocator is returning small
    9712             :                  * chunks lets make its job easier by only searching for those
    9713             :                  * sized chunks.
    9714             :                  */
    9715      439578 :                 cur_bytes = min(cur_bytes, last_alloc);
    9716      439578 :                 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
    9717             :                                 min_size, 0, *alloc_hint, &ins, 1, 0);
    9718      439578 :                 if (ret)
    9719             :                         break;
    9720             : 
    9721             :                 /*
    9722             :                  * We've reserved this space, and thus converted it from
    9723             :                  * ->bytes_may_use to ->bytes_reserved.  Any error that happens
    9724             :                  * from here on out we will only need to clear our reservation
    9725             :                  * for the remaining unreserved area, so advance our
    9726             :                  * clear_offset by our extent size.
    9727             :                  */
    9728      439578 :                 clear_offset += ins.offset;
    9729             : 
    9730      439578 :                 last_alloc = ins.offset;
    9731      439578 :                 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
    9732             :                                                     &ins, cur_offset);
    9733             :                 /*
    9734             :                  * Now that we inserted the prealloc extent we can finally
    9735             :                  * decrement the number of reservations in the block group.
    9736             :                  * If we did it before, we could race with relocation and have
    9737             :                  * relocation miss the reserved extent, making it fail later.
    9738             :                  */
    9739      439578 :                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
    9740      439578 :                 if (IS_ERR(trans)) {
    9741          10 :                         ret = PTR_ERR(trans);
    9742          10 :                         btrfs_free_reserved_extent(fs_info, ins.objectid,
    9743             :                                                    ins.offset, 0);
    9744          10 :                         break;
    9745             :                 }
    9746             : 
    9747      439568 :                 em = alloc_extent_map();
    9748      439568 :                 if (!em) {
    9749           0 :                         btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
    9750           0 :                                             cur_offset + ins.offset - 1, false);
    9751           0 :                         btrfs_set_inode_full_sync(BTRFS_I(inode));
    9752           0 :                         goto next;
    9753             :                 }
    9754             : 
    9755      439568 :                 em->start = cur_offset;
    9756      439568 :                 em->orig_start = cur_offset;
    9757      439568 :                 em->len = ins.offset;
    9758      439568 :                 em->block_start = ins.objectid;
    9759      439568 :                 em->block_len = ins.offset;
    9760      439568 :                 em->orig_block_len = ins.offset;
    9761      439568 :                 em->ram_bytes = ins.offset;
    9762      439568 :                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
    9763      439568 :                 em->generation = trans->transid;
    9764             : 
    9765      439568 :                 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
    9766      439567 :                 free_extent_map(em);
    9767      439568 : next:
    9768      439568 :                 num_bytes -= ins.offset;
    9769      439568 :                 cur_offset += ins.offset;
    9770      439568 :                 *alloc_hint = ins.objectid + ins.offset;
    9771             : 
    9772      439568 :                 inode_inc_iversion(inode);
    9773      439568 :                 inode->i_ctime = current_time(inode);
    9774      439567 :                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
    9775      439567 :                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
    9776      244223 :                     (actual_len > inode->i_size) &&
    9777      122446 :                     (cur_offset > inode->i_size)) {
    9778      118050 :                         if (cur_offset > actual_len)
    9779             :                                 i_size = actual_len;
    9780             :                         else
    9781             :                                 i_size = cur_offset;
    9782      118050 :                         i_size_write(inode, i_size);
    9783      118050 :                         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
    9784             :                 }
    9785             : 
    9786      439567 :                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    9787             : 
    9788      439568 :                 if (ret) {
    9789           0 :                         btrfs_abort_transaction(trans, ret);
    9790           0 :                         if (own_trans)
    9791           0 :                                 btrfs_end_transaction(trans);
    9792             :                         break;
    9793             :                 }
    9794             : 
    9795      439568 :                 if (own_trans) {
    9796      439552 :                         btrfs_end_transaction(trans);
    9797      439552 :                         trans = NULL;
    9798             :                 }
    9799             :         }
    9800      437029 :         if (clear_offset < end)
    9801           0 :                 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
    9802           0 :                         end - clear_offset + 1);
    9803      437029 :         return ret;
    9804             : }
    9805             : 
    9806      437013 : int btrfs_prealloc_file_range(struct inode *inode, int mode,
    9807             :                               u64 start, u64 num_bytes, u64 min_size,
    9808             :                               loff_t actual_len, u64 *alloc_hint)
    9809             : {
    9810      437013 :         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
    9811             :                                            min_size, actual_len, alloc_hint,
    9812             :                                            NULL);
    9813             : }
    9814             : 
    9815          16 : int btrfs_prealloc_file_range_trans(struct inode *inode,
    9816             :                                     struct btrfs_trans_handle *trans, int mode,
    9817             :                                     u64 start, u64 num_bytes, u64 min_size,
    9818             :                                     loff_t actual_len, u64 *alloc_hint)
    9819             : {
    9820          16 :         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
    9821             :                                            min_size, actual_len, alloc_hint, trans);
    9822             : }
    9823             : 
    9824    99642560 : static int btrfs_permission(struct mnt_idmap *idmap,
    9825             :                             struct inode *inode, int mask)
    9826             : {
    9827    99642560 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    9828    99642560 :         umode_t mode = inode->i_mode;
    9829             : 
    9830    99642560 :         if (mask & MAY_WRITE &&
    9831     9648880 :             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
    9832     9640042 :                 if (btrfs_root_readonly(root))
    9833             :                         return -EROFS;
    9834     9640036 :                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
    9835             :                         return -EACCES;
    9836             :         }
    9837    99642553 :         return generic_permission(idmap, inode, mask);
    9838             : }
    9839             : 
    9840      397271 : static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
    9841             :                          struct file *file, umode_t mode)
    9842             : {
    9843      397271 :         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
    9844      397271 :         struct btrfs_trans_handle *trans;
    9845      397271 :         struct btrfs_root *root = BTRFS_I(dir)->root;
    9846      397271 :         struct inode *inode;
    9847      397271 :         struct btrfs_new_inode_args new_inode_args = {
    9848             :                 .dir = dir,
    9849      397271 :                 .dentry = file->f_path.dentry,
    9850             :                 .orphan = true,
    9851             :         };
    9852      397271 :         unsigned int trans_num_items;
    9853      397271 :         int ret;
    9854             : 
    9855      397271 :         inode = new_inode(dir->i_sb);
    9856      404041 :         if (!inode)
    9857             :                 return -ENOMEM;
    9858      404041 :         inode_init_owner(idmap, inode, dir, mode);
    9859      403770 :         inode->i_fop = &btrfs_file_operations;
    9860      403770 :         inode->i_op = &btrfs_file_inode_operations;
    9861      403770 :         inode->i_mapping->a_ops = &btrfs_aops;
    9862             : 
    9863      403770 :         new_inode_args.inode = inode;
    9864      403770 :         ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
    9865      395093 :         if (ret)
    9866           0 :                 goto out_inode;
    9867             : 
    9868      395093 :         trans = btrfs_start_transaction(root, trans_num_items);
    9869      404438 :         if (IS_ERR(trans)) {
    9870           0 :                 ret = PTR_ERR(trans);
    9871           0 :                 goto out_new_inode_args;
    9872             :         }
    9873             : 
    9874      404438 :         ret = btrfs_create_new_inode(trans, &new_inode_args);
    9875             : 
    9876             :         /*
    9877             :          * We set number of links to 0 in btrfs_create_new_inode(), and here we
    9878             :          * set it to 1 because d_tmpfile() will issue a warning if the count is
    9879             :          * 0, through:
    9880             :          *
    9881             :          *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
    9882             :          */
    9883      404444 :         set_nlink(inode, 1);
    9884             : 
    9885      404446 :         if (!ret) {
    9886      404446 :                 d_tmpfile(file, inode);
    9887      404437 :                 unlock_new_inode(inode);
    9888      404438 :                 mark_inode_dirty(inode);
    9889             :         }
    9890             : 
    9891      404441 :         btrfs_end_transaction(trans);
    9892      404404 :         btrfs_btree_balance_dirty(fs_info);
    9893      404282 : out_new_inode_args:
    9894      404282 :         btrfs_new_inode_args_destroy(&new_inode_args);
    9895      404186 : out_inode:
    9896      404186 :         if (ret)
    9897           0 :                 iput(inode);
    9898      404186 :         return finish_open_simple(file, ret);
    9899             : }
    9900             : 
    9901    50232253 : void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
    9902             : {
    9903    50232253 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    9904    50232253 :         unsigned long index = start >> PAGE_SHIFT;
    9905    50232253 :         unsigned long end_index = end >> PAGE_SHIFT;
    9906    50232253 :         struct page *page;
    9907    50232253 :         u32 len;
    9908             : 
    9909    50232253 :         ASSERT(end + 1 - start <= U32_MAX);
    9910    50232253 :         len = end + 1 - start;
    9911   100463394 :         while (index <= end_index) {
    9912    50232316 :                 page = find_get_page(inode->vfs_inode.i_mapping, index);
    9913    50230766 :                 ASSERT(page); /* Pages should be in the extent_io_tree */
    9914             : 
    9915    50230766 :                 btrfs_page_set_writeback(fs_info, page, start, len);
    9916    50229778 :                 put_page(page);
    9917    50231141 :                 index++;
    9918             :         }
    9919    50231078 : }
    9920             : 
    9921           0 : int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
    9922             :                                              int compress_type)
    9923             : {
    9924           0 :         switch (compress_type) {
    9925             :         case BTRFS_COMPRESS_NONE:
    9926             :                 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
    9927           0 :         case BTRFS_COMPRESS_ZLIB:
    9928           0 :                 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
    9929           0 :         case BTRFS_COMPRESS_LZO:
    9930             :                 /*
    9931             :                  * The LZO format depends on the sector size. 64K is the maximum
    9932             :                  * sector size that we support.
    9933             :                  */
    9934           0 :                 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
    9935             :                         return -EINVAL;
    9936           0 :                 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
    9937           0 :                        (fs_info->sectorsize_bits - 12);
    9938           0 :         case BTRFS_COMPRESS_ZSTD:
    9939           0 :                 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
    9940           0 :         default:
    9941           0 :                 return -EUCLEAN;
    9942             :         }
    9943             : }
    9944             : 
    9945           0 : static ssize_t btrfs_encoded_read_inline(
    9946             :                                 struct kiocb *iocb,
    9947             :                                 struct iov_iter *iter, u64 start,
    9948             :                                 u64 lockend,
    9949             :                                 struct extent_state **cached_state,
    9950             :                                 u64 extent_start, size_t count,
    9951             :                                 struct btrfs_ioctl_encoded_io_args *encoded,
    9952             :                                 bool *unlocked)
    9953             : {
    9954           0 :         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
    9955           0 :         struct btrfs_root *root = inode->root;
    9956           0 :         struct btrfs_fs_info *fs_info = root->fs_info;
    9957           0 :         struct extent_io_tree *io_tree = &inode->io_tree;
    9958           0 :         struct btrfs_path *path;
    9959           0 :         struct extent_buffer *leaf;
    9960           0 :         struct btrfs_file_extent_item *item;
    9961           0 :         u64 ram_bytes;
    9962           0 :         unsigned long ptr;
    9963           0 :         void *tmp;
    9964           0 :         ssize_t ret;
    9965             : 
    9966           0 :         path = btrfs_alloc_path();
    9967           0 :         if (!path) {
    9968           0 :                 ret = -ENOMEM;
    9969           0 :                 goto out;
    9970             :         }
    9971           0 :         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
    9972             :                                        extent_start, 0);
    9973           0 :         if (ret) {
    9974           0 :                 if (ret > 0) {
    9975             :                         /* The extent item disappeared? */
    9976           0 :                         ret = -EIO;
    9977             :                 }
    9978           0 :                 goto out;
    9979             :         }
    9980           0 :         leaf = path->nodes[0];
    9981           0 :         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
    9982             : 
    9983           0 :         ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
    9984           0 :         ptr = btrfs_file_extent_inline_start(item);
    9985             : 
    9986           0 :         encoded->len = min_t(u64, extent_start + ram_bytes,
    9987           0 :                              inode->vfs_inode.i_size) - iocb->ki_pos;
    9988           0 :         ret = btrfs_encoded_io_compression_from_extent(fs_info,
    9989             :                                  btrfs_file_extent_compression(leaf, item));
    9990           0 :         if (ret < 0)
    9991           0 :                 goto out;
    9992           0 :         encoded->compression = ret;
    9993           0 :         if (encoded->compression) {
    9994           0 :                 size_t inline_size;
    9995             : 
    9996           0 :                 inline_size = btrfs_file_extent_inline_item_len(leaf,
    9997             :                                                                 path->slots[0]);
    9998           0 :                 if (inline_size > count) {
    9999           0 :                         ret = -ENOBUFS;
   10000           0 :                         goto out;
   10001             :                 }
   10002           0 :                 count = inline_size;
   10003           0 :                 encoded->unencoded_len = ram_bytes;
   10004           0 :                 encoded->unencoded_offset = iocb->ki_pos - extent_start;
   10005             :         } else {
   10006           0 :                 count = min_t(u64, count, encoded->len);
   10007           0 :                 encoded->len = count;
   10008           0 :                 encoded->unencoded_len = count;
   10009           0 :                 ptr += iocb->ki_pos - extent_start;
   10010             :         }
   10011             : 
   10012           0 :         tmp = kmalloc(count, GFP_NOFS);
   10013           0 :         if (!tmp) {
   10014           0 :                 ret = -ENOMEM;
   10015           0 :                 goto out;
   10016             :         }
   10017           0 :         read_extent_buffer(leaf, tmp, ptr, count);
   10018           0 :         btrfs_release_path(path);
   10019           0 :         unlock_extent(io_tree, start, lockend, cached_state);
   10020           0 :         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   10021           0 :         *unlocked = true;
   10022             : 
   10023           0 :         ret = copy_to_iter(tmp, count, iter);
   10024           0 :         if (ret != count)
   10025           0 :                 ret = -EFAULT;
   10026           0 :         kfree(tmp);
   10027           0 : out:
   10028           0 :         btrfs_free_path(path);
   10029           0 :         return ret;
   10030             : }
   10031             : 
   10032             : struct btrfs_encoded_read_private {
   10033             :         wait_queue_head_t wait;
   10034             :         atomic_t pending;
   10035             :         blk_status_t status;
   10036             : };
   10037             : 
   10038           0 : static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
   10039             : {
   10040           0 :         struct btrfs_encoded_read_private *priv = bbio->private;
   10041             : 
   10042           0 :         if (bbio->bio.bi_status) {
   10043             :                 /*
   10044             :                  * The memory barrier implied by the atomic_dec_return() here
   10045             :                  * pairs with the memory barrier implied by the
   10046             :                  * atomic_dec_return() or io_wait_event() in
   10047             :                  * btrfs_encoded_read_regular_fill_pages() to ensure that this
   10048             :                  * write is observed before the load of status in
   10049             :                  * btrfs_encoded_read_regular_fill_pages().
   10050             :                  */
   10051           0 :                 WRITE_ONCE(priv->status, bbio->bio.bi_status);
   10052             :         }
   10053           0 :         if (!atomic_dec_return(&priv->pending))
   10054           0 :                 wake_up(&priv->wait);
   10055           0 :         bio_put(&bbio->bio);
   10056           0 : }
   10057             : 
   10058           0 : int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
   10059             :                                           u64 file_offset, u64 disk_bytenr,
   10060             :                                           u64 disk_io_size, struct page **pages)
   10061             : {
   10062           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
   10063           0 :         struct btrfs_encoded_read_private priv = {
   10064             :                 .pending = ATOMIC_INIT(1),
   10065             :         };
   10066           0 :         unsigned long i = 0;
   10067           0 :         struct btrfs_bio *bbio;
   10068             : 
   10069           0 :         init_waitqueue_head(&priv.wait);
   10070             : 
   10071           0 :         bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
   10072             :                                btrfs_encoded_read_endio, &priv);
   10073           0 :         bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
   10074           0 :         bbio->inode = inode;
   10075             : 
   10076           0 :         do {
   10077           0 :                 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
   10078             : 
   10079           0 :                 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
   10080           0 :                         atomic_inc(&priv.pending);
   10081           0 :                         btrfs_submit_bio(bbio, 0);
   10082             : 
   10083           0 :                         bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
   10084             :                                                btrfs_encoded_read_endio, &priv);
   10085           0 :                         bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
   10086           0 :                         bbio->inode = inode;
   10087           0 :                         continue;
   10088             :                 }
   10089             : 
   10090           0 :                 i++;
   10091           0 :                 disk_bytenr += bytes;
   10092           0 :                 disk_io_size -= bytes;
   10093           0 :         } while (disk_io_size);
   10094             : 
   10095           0 :         atomic_inc(&priv.pending);
   10096           0 :         btrfs_submit_bio(bbio, 0);
   10097             : 
   10098           0 :         if (atomic_dec_return(&priv.pending))
   10099           0 :                 io_wait_event(priv.wait, !atomic_read(&priv.pending));
   10100             :         /* See btrfs_encoded_read_endio() for ordering. */
   10101           0 :         return blk_status_to_errno(READ_ONCE(priv.status));
   10102             : }
   10103             : 
   10104           0 : static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
   10105             :                                           struct iov_iter *iter,
   10106             :                                           u64 start, u64 lockend,
   10107             :                                           struct extent_state **cached_state,
   10108             :                                           u64 disk_bytenr, u64 disk_io_size,
   10109             :                                           size_t count, bool compressed,
   10110             :                                           bool *unlocked)
   10111             : {
   10112           0 :         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
   10113           0 :         struct extent_io_tree *io_tree = &inode->io_tree;
   10114           0 :         struct page **pages;
   10115           0 :         unsigned long nr_pages, i;
   10116           0 :         u64 cur;
   10117           0 :         size_t page_offset;
   10118           0 :         ssize_t ret;
   10119             : 
   10120           0 :         nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
   10121           0 :         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
   10122           0 :         if (!pages)
   10123             :                 return -ENOMEM;
   10124           0 :         ret = btrfs_alloc_page_array(nr_pages, pages);
   10125           0 :         if (ret) {
   10126           0 :                 ret = -ENOMEM;
   10127           0 :                 goto out;
   10128             :                 }
   10129             : 
   10130           0 :         ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
   10131             :                                                     disk_io_size, pages);
   10132           0 :         if (ret)
   10133           0 :                 goto out;
   10134             : 
   10135           0 :         unlock_extent(io_tree, start, lockend, cached_state);
   10136           0 :         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   10137           0 :         *unlocked = true;
   10138             : 
   10139           0 :         if (compressed) {
   10140             :                 i = 0;
   10141             :                 page_offset = 0;
   10142             :         } else {
   10143           0 :                 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
   10144           0 :                 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
   10145             :         }
   10146             :         cur = 0;
   10147           0 :         while (cur < count) {
   10148           0 :                 size_t bytes = min_t(size_t, count - cur,
   10149             :                                      PAGE_SIZE - page_offset);
   10150             : 
   10151           0 :                 if (copy_page_to_iter(pages[i], page_offset, bytes,
   10152             :                                       iter) != bytes) {
   10153           0 :                         ret = -EFAULT;
   10154           0 :                         goto out;
   10155             :                 }
   10156           0 :                 i++;
   10157           0 :                 cur += bytes;
   10158           0 :                 page_offset = 0;
   10159             :         }
   10160           0 :         ret = count;
   10161           0 : out:
   10162           0 :         for (i = 0; i < nr_pages; i++) {
   10163           0 :                 if (pages[i])
   10164           0 :                         __free_page(pages[i]);
   10165             :         }
   10166           0 :         kfree(pages);
   10167           0 :         return ret;
   10168             : }
   10169             : 
   10170           0 : ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
   10171             :                            struct btrfs_ioctl_encoded_io_args *encoded)
   10172             : {
   10173           0 :         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
   10174           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
   10175           0 :         struct extent_io_tree *io_tree = &inode->io_tree;
   10176           0 :         ssize_t ret;
   10177           0 :         size_t count = iov_iter_count(iter);
   10178           0 :         u64 start, lockend, disk_bytenr, disk_io_size;
   10179           0 :         struct extent_state *cached_state = NULL;
   10180           0 :         struct extent_map *em;
   10181           0 :         bool unlocked = false;
   10182             : 
   10183           0 :         file_accessed(iocb->ki_filp);
   10184             : 
   10185           0 :         btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
   10186             : 
   10187           0 :         if (iocb->ki_pos >= inode->vfs_inode.i_size) {
   10188           0 :                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   10189           0 :                 return 0;
   10190             :         }
   10191           0 :         start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
   10192             :         /*
   10193             :          * We don't know how long the extent containing iocb->ki_pos is, but if
   10194             :          * it's compressed we know that it won't be longer than this.
   10195             :          */
   10196           0 :         lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
   10197             : 
   10198           0 :         for (;;) {
   10199           0 :                 struct btrfs_ordered_extent *ordered;
   10200             : 
   10201           0 :                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
   10202             :                                                lockend - start + 1);
   10203           0 :                 if (ret)
   10204           0 :                         goto out_unlock_inode;
   10205           0 :                 lock_extent(io_tree, start, lockend, &cached_state);
   10206           0 :                 ordered = btrfs_lookup_ordered_range(inode, start,
   10207             :                                                      lockend - start + 1);
   10208           0 :                 if (!ordered)
   10209             :                         break;
   10210           0 :                 btrfs_put_ordered_extent(ordered);
   10211           0 :                 unlock_extent(io_tree, start, lockend, &cached_state);
   10212           0 :                 cond_resched();
   10213             :         }
   10214             : 
   10215           0 :         em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
   10216           0 :         if (IS_ERR(em)) {
   10217           0 :                 ret = PTR_ERR(em);
   10218           0 :                 goto out_unlock_extent;
   10219             :         }
   10220             : 
   10221           0 :         if (em->block_start == EXTENT_MAP_INLINE) {
   10222           0 :                 u64 extent_start = em->start;
   10223             : 
   10224             :                 /*
   10225             :                  * For inline extents we get everything we need out of the
   10226             :                  * extent item.
   10227             :                  */
   10228           0 :                 free_extent_map(em);
   10229           0 :                 em = NULL;
   10230           0 :                 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
   10231             :                                                 &cached_state, extent_start,
   10232             :                                                 count, encoded, &unlocked);
   10233           0 :                 goto out;
   10234             :         }
   10235             : 
   10236             :         /*
   10237             :          * We only want to return up to EOF even if the extent extends beyond
   10238             :          * that.
   10239             :          */
   10240           0 :         encoded->len = min_t(u64, extent_map_end(em),
   10241           0 :                              inode->vfs_inode.i_size) - iocb->ki_pos;
   10242           0 :         if (em->block_start == EXTENT_MAP_HOLE ||
   10243           0 :             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
   10244           0 :                 disk_bytenr = EXTENT_MAP_HOLE;
   10245           0 :                 count = min_t(u64, count, encoded->len);
   10246           0 :                 encoded->len = count;
   10247           0 :                 encoded->unencoded_len = count;
   10248           0 :         } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
   10249           0 :                 disk_bytenr = em->block_start;
   10250             :                 /*
   10251             :                  * Bail if the buffer isn't large enough to return the whole
   10252             :                  * compressed extent.
   10253             :                  */
   10254           0 :                 if (em->block_len > count) {
   10255           0 :                         ret = -ENOBUFS;
   10256           0 :                         goto out_em;
   10257             :                 }
   10258           0 :                 disk_io_size = em->block_len;
   10259           0 :                 count = em->block_len;
   10260           0 :                 encoded->unencoded_len = em->ram_bytes;
   10261           0 :                 encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
   10262           0 :                 ret = btrfs_encoded_io_compression_from_extent(fs_info,
   10263           0 :                                                              em->compress_type);
   10264           0 :                 if (ret < 0)
   10265           0 :                         goto out_em;
   10266           0 :                 encoded->compression = ret;
   10267             :         } else {
   10268           0 :                 disk_bytenr = em->block_start + (start - em->start);
   10269           0 :                 if (encoded->len > count)
   10270           0 :                         encoded->len = count;
   10271             :                 /*
   10272             :                  * Don't read beyond what we locked. This also limits the page
   10273             :                  * allocations that we'll do.
   10274             :                  */
   10275           0 :                 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
   10276           0 :                 count = start + disk_io_size - iocb->ki_pos;
   10277           0 :                 encoded->len = count;
   10278           0 :                 encoded->unencoded_len = count;
   10279           0 :                 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
   10280             :         }
   10281           0 :         free_extent_map(em);
   10282           0 :         em = NULL;
   10283             : 
   10284           0 :         if (disk_bytenr == EXTENT_MAP_HOLE) {
   10285           0 :                 unlock_extent(io_tree, start, lockend, &cached_state);
   10286           0 :                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   10287           0 :                 unlocked = true;
   10288           0 :                 ret = iov_iter_zero(count, iter);
   10289           0 :                 if (ret != count)
   10290             :                         ret = -EFAULT;
   10291             :         } else {
   10292           0 :                 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
   10293             :                                                  &cached_state, disk_bytenr,
   10294             :                                                  disk_io_size, count,
   10295           0 :                                                  encoded->compression,
   10296             :                                                  &unlocked);
   10297             :         }
   10298             : 
   10299           0 : out:
   10300           0 :         if (ret >= 0)
   10301           0 :                 iocb->ki_pos += encoded->len;
   10302           0 : out_em:
   10303           0 :         free_extent_map(em);
   10304           0 : out_unlock_extent:
   10305           0 :         if (!unlocked)
   10306           0 :                 unlock_extent(io_tree, start, lockend, &cached_state);
   10307           0 : out_unlock_inode:
   10308           0 :         if (!unlocked)
   10309           0 :                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
   10310             :         return ret;
   10311             : }
   10312             : /*
                     :  * Write one caller-supplied, already compressed ("encoded") extent to the
                     :  * file referenced by @iocb.
                     :  *
                     :  * @iocb:    supplies the target file (ki_filp) and the file offset (ki_pos)
                     :  * @from:    iterator holding the encoded bytes; its initial byte count is the
                     :  *           encoded data length before rounding up to the sector size
                     :  * @encoded: describes the data: compression, encryption, len, unencoded_len
                     :  *           and unencoded_offset are read here
                     :  *
                     :  * The arguments are validated first, then the data is copied into freshly
                     :  * allocated pages, the target range is locked once it has no ordered extent
                     :  * and no page cache page, and data/qgroup/metadata space is reserved. An
                     :  * inline extent is attempted where the arguments allow it; otherwise a disk
                     :  * extent is reserved, an extent map and an ordered extent are created and
                     :  * the pages are handed to btrfs_submit_compressed_write().
                     :  *
                     :  * Returns the initial iov_iter_count(@from) on success, in which case
                     :  * iocb->ki_pos is advanced by encoded->len, or a negative errno on failure.
                     :  *
                     :  * NOTE(review): data_reserved is only passed to btrfs_qgroup_reserve_data()
                     :  * and btrfs_qgroup_free_data() in this function and is never released with
                     :  * an explicit free call here - confirm whether it can be allocated by the
                     :  * reserve call and therefore leaked.
                     :  */
   10313           0 : ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
   10314             :                                const struct btrfs_ioctl_encoded_io_args *encoded)
   10315             : {
   10316           0 :         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
   10317           0 :         struct btrfs_root *root = inode->root;
   10318           0 :         struct btrfs_fs_info *fs_info = root->fs_info;
   10319           0 :         struct extent_io_tree *io_tree = &inode->io_tree;
   10320           0 :         struct extent_changeset *data_reserved = NULL;
   10321           0 :         struct extent_state *cached_state = NULL;
   10322           0 :         struct btrfs_ordered_extent *ordered;
   10323           0 :         int compression;
   10324           0 :         size_t orig_count;
   10325           0 :         u64 start, end;
   10326           0 :         u64 num_bytes, ram_bytes, disk_num_bytes;
   10327           0 :         unsigned long nr_pages, i;
   10328           0 :         struct page **pages;
   10329           0 :         struct btrfs_key ins;
   10330           0 :         bool extent_reserved = false;
   10331           0 :         struct extent_map *em;
   10332           0 :         ssize_t ret;
   10333             :         /* Translate encoded->compression (BTRFS_ENCODED_IO_COMPRESSION_*) into a BTRFS_COMPRESS_* value. */
   10334           0 :         switch (encoded->compression) {
   10335             :         case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
   10336             :                 compression = BTRFS_COMPRESS_ZLIB;
   10337             :                 break;
   10338           0 :         case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
   10339           0 :                 compression = BTRFS_COMPRESS_ZSTD;
   10340           0 :                 break;
   10341           0 :         case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
   10342             :         case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
   10343             :         case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
   10344             :         case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
   10345             :         case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
   10346             :                 /*
                     :                  * The sector size must match for LZO. LZO_4K corresponds to
                     :                  * sectorsize_bits == 12; the check assumes each following
                     :                  * LZO_* constant is one larger than the previous one.
                     :                  */
   10347           0 :                 if (encoded->compression -
   10348           0 :                     BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
   10349           0 :                     fs_info->sectorsize_bits)
   10350             :                         return -EINVAL;
   10351             :                 compression = BTRFS_COMPRESS_LZO;
   10352             :                 break;
   10353             :         default:
   10354             :                 return -EINVAL;
   10355             :         }
   10356           0 :         if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
   10357             :                 return -EINVAL; /* only unencrypted data is accepted */
   10358             : 
   10359           0 :         orig_count = iov_iter_count(from);
   10360             : 
   10361             :         /* The extent size must be sane. */
   10362           0 :         if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
   10363           0 :             orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
   10364             :                 return -EINVAL;
   10365             : 
   10366             :         /*
   10367             :          * The compressed data must be smaller than the decompressed data.
   10368             :          *
   10369             :          * It's of course possible for data to compress to larger or the same
   10370             :          * size, but the buffered I/O path falls back to no compression for such
   10371             :          * data, and we don't want to break any assumptions by creating these
   10372             :          * extents.
   10373             :          *
   10374             :          * Note that this is less strict than the current check we have that the
   10375             :          * compressed data must be at least one sector smaller than the
   10376             :          * decompressed data. We only want to enforce the weaker requirement
   10377             :          * from old kernels that it is at least one byte smaller.
   10378             :          */
   10379           0 :         if (orig_count >= encoded->unencoded_len)
   10380             :                 return -EINVAL;
   10381             : 
   10382             :         /* The extent must start on a sector boundary. */
   10383           0 :         start = iocb->ki_pos;
   10384           0 :         if (!IS_ALIGNED(start, fs_info->sectorsize))
   10385             :                 return -EINVAL;
   10386             : 
   10387             :         /*
   10388             :          * The extent must end on a sector boundary. However, we allow a write
   10389             :          * which ends at or extends i_size to have an unaligned length; we round
   10390             :          * up the extent size and set i_size to the unaligned end.
   10391             :          */
   10392           0 :         if (start + encoded->len < inode->vfs_inode.i_size &&
   10393           0 :             !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
   10394             :                 return -EINVAL;
   10395             : 
   10396             :         /* Finally, the offset in the unencoded data must be sector-aligned. */
   10397           0 :         if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
   10398             :                 return -EINVAL;
   10399             :         /* Sector-aligned sizes of the file range and the unencoded data; end is inclusive. */
   10400           0 :         num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
   10401           0 :         ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
   10402           0 :         end = start + num_bytes - 1;
   10403             : 
   10404             :         /*
   10405             :          * If the extent cannot be inline, the compressed data on disk must be
   10406             :          * sector-aligned. For convenience, we extend it with zeroes if it
   10407             :          * isn't.
   10408             :          */
   10409           0 :         disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
   10410           0 :         nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
   10411           0 :         pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
   10412           0 :         if (!pages)
   10413             :                 return -ENOMEM;
   10414           0 :         for (i = 0; i < nr_pages; i++) {
   10415           0 :                 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
   10416           0 :                 char *kaddr;
   10417             :                 /* Copy up to one page from @from and zero-fill the rest of the page. */
   10418           0 :                 pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
   10419           0 :                 if (!pages[i]) {
   10420           0 :                         ret = -ENOMEM;
   10421           0 :                         goto out_pages;
   10422             :                 }
   10423           0 :                 kaddr = kmap_local_page(pages[i]);
   10424           0 :                 if (copy_from_iter(kaddr, bytes, from) != bytes) {
   10425           0 :                         kunmap_local(kaddr);
   10426           0 :                         ret = -EFAULT;
   10427           0 :                         goto out_pages;
   10428             :                 }
   10429           0 :                 if (bytes < PAGE_SIZE)
   10430           0 :                         memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
   10431           0 :                 kunmap_local(kaddr);
   10432             :         }
   10433             :         /*
                     :          * Retry until the extent range is locked while it contains neither
                     :          * an ordered extent nor a page cache page; the loop is left with
                     :          * the range still locked.
                     :          */
   10434           0 :         for (;;) {
   10435           0 :                 struct btrfs_ordered_extent *ordered; /* shadows the function-scope 'ordered' */
   10436             : 
   10437           0 :                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
   10438           0 :                 if (ret)
   10439           0 :                         goto out_pages;
   10440           0 :                 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
   10441           0 :                                                     start >> PAGE_SHIFT,
   10442           0 :                                                     end >> PAGE_SHIFT);
   10443           0 :                 if (ret)
   10444           0 :                         goto out_pages;
   10445           0 :                 lock_extent(io_tree, start, end, &cached_state);
   10446           0 :                 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
   10447           0 :                 if (!ordered &&
   10448           0 :                     !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
   10449             :                         break;
   10450           0 :                 if (ordered)
   10451           0 :                         btrfs_put_ordered_extent(ordered);
   10452           0 :                 unlock_extent(io_tree, start, end, &cached_state);
   10453           0 :                 cond_resched();
   10454             :         }
   10455             : 
   10456             :         /*
   10457             :          * We don't use the higher-level delalloc space functions because our
   10458             :          * num_bytes and disk_num_bytes are different.
   10459             :          */
   10460           0 :         ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
   10461           0 :         if (ret)
   10462           0 :                 goto out_unlock;
   10463           0 :         ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
   10464           0 :         if (ret)
   10465           0 :                 goto out_free_data_space;
   10466           0 :         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
   10467             :                                               false);
   10468           0 :         if (ret)
   10469           0 :                 goto out_qgroup_free_data;
   10470             : 
   10471             :         /*
                     :          * Try an inline extent first. This is only attempted for a write at
                     :          * offset 0 whose unencoded data is used in full (unencoded_len ==
                     :          * len and unencoded_offset == 0). A return of 0 is reported as
                     :          * success, a negative value as an error; a positive value continues
                     :          * with a regular extent below (presumably "cannot be inlined").
                     :          */
   10472           0 :         if (start == 0 && encoded->unencoded_len == encoded->len &&
   10473           0 :             encoded->unencoded_offset == 0) {
   10474           0 :                 ret = cow_file_range_inline(inode, encoded->len, orig_count,
   10475             :                                             compression, pages, true);
   10476           0 :                 if (ret <= 0) {
   10477           0 :                         if (ret == 0)
   10478           0 :                                 ret = orig_count;
   10479           0 :                         goto out_delalloc_release;
   10480             :                 }
   10481             :         }
   10482             :         /* Reserve disk space for the encoded data; the result is returned in ins. */
   10483           0 :         ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
   10484             :                                    disk_num_bytes, 0, 0, &ins, 1, 1);
   10485           0 :         if (ret)
   10486           0 :                 goto out_delalloc_release;
   10487           0 :         extent_reserved = true;
   10488             : 
   10489           0 :         em = create_io_em(inode, start, num_bytes,
   10490           0 :                           start - encoded->unencoded_offset, ins.objectid,
   10491             :                           ins.offset, ins.offset, ram_bytes, compression,
   10492             :                           BTRFS_ORDERED_COMPRESSED);
   10493           0 :         if (IS_ERR(em)) {
   10494           0 :                 ret = PTR_ERR(em);
   10495           0 :                 goto out_free_reserved;
   10496             :         }
   10497           0 :         free_extent_map(em);
   10498             : 
   10499           0 :         ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
   10500             :                                        ins.objectid, ins.offset,
   10501           0 :                                        encoded->unencoded_offset,
   10502             :                                        (1 << BTRFS_ORDERED_ENCODED) |
   10503             :                                        (1 << BTRFS_ORDERED_COMPRESSED),
   10504             :                                        compression);
   10505           0 :         if (IS_ERR(ordered)) {
   10506           0 :                 btrfs_drop_extent_map_range(inode, start, end, false);
   10507           0 :                 ret = PTR_ERR(ordered);
   10508           0 :                 goto out_free_reserved;
   10509             :         }
   10510           0 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   10511             :         /* Extend i_size if the write ends beyond it. */
   10512           0 :         if (start + encoded->len > inode->vfs_inode.i_size)
   10513           0 :                 i_size_write(&inode->vfs_inode, start + encoded->len);
   10514             : 
   10515           0 :         unlock_extent(io_tree, start, end, &cached_state);
   10516             : 
   10517           0 :         btrfs_delalloc_release_extents(inode, num_bytes);
   10518             :         /*
                     :          * The jump to "out" below skips out_pages, so the pages are not
                     :          * freed by this function on this path; presumably the submitted
                     :          * compressed write now owns them - confirm.
                     :          */
   10519           0 :         btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
   10520           0 :         ret = orig_count;
   10521           0 :         goto out;
   10522             :         /* Unwind labels; the inline-extent success path also enters at out_delalloc_release. */
   10523           0 : out_free_reserved:
   10524           0 :         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
   10525           0 :         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
   10526           0 : out_delalloc_release:
   10527           0 :         btrfs_delalloc_release_extents(inode, num_bytes);
   10528           0 :         btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
   10529           0 : out_qgroup_free_data:
   10530           0 :         if (ret < 0)
   10531           0 :                 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
   10532           0 : out_free_data_space:
   10533             :         /*
   10534             :          * If btrfs_reserve_extent() succeeded, then we already decremented
   10535             :          * bytes_may_use.
   10536             :          */
   10537           0 :         if (!extent_reserved)
   10538           0 :                 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
   10539           0 : out_unlock:
   10540           0 :         unlock_extent(io_tree, start, end, &cached_state);
   10541           0 : out_pages:
   10542           0 :         for (i = 0; i < nr_pages; i++) {
   10543           0 :                 if (pages[i])
   10544           0 :                         __free_page(pages[i]);
   10545             :         }
   10546           0 :         kvfree(pages);
   10547           0 : out:
   10548           0 :         if (ret >= 0)
   10549           0 :                 iocb->ki_pos += encoded->len;
   10550             :         return ret;
   10551             : }
   10552             : 
   10553             : #ifdef CONFIG_SWAP
   10554             : /*
   10555             :  * Add an entry indicating a block group or device which is pinned by a
   10556             :  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
   10557             :  * negative errno on failure.
                     :  *
                     :  * @inode:          swapfile inode recorded in the entry
                     :  * @ptr:            the block group or device being pinned, stored untyped
                     :  * @is_block_group: true if @ptr is a block group
                     :  *
                     :  * Entries live in the fs_info->swapfile_pins rbtree, ordered by (ptr, inode)
                     :  * and protected by fs_info->swapfile_pins_lock. If an equal entry already
                     :  * exists, the newly allocated one is freed and, for block groups only, the
                     :  * existing entry's bg_extent_count is incremented.
   10558             :  */
   10559         593 : static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
   10560             :                                   bool is_block_group)
   10561             : {
   10562         593 :         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
   10563         593 :         struct btrfs_swapfile_pin *sp, *entry;
   10564         593 :         struct rb_node **p;
   10565         593 :         struct rb_node *parent = NULL;
   10566             : 
   10567         593 :         sp = kmalloc(sizeof(*sp), GFP_NOFS);
   10568         593 :         if (!sp)
   10569             :                 return -ENOMEM;
   10570         593 :         sp->ptr = ptr;
   10571         593 :         sp->inode = inode;
   10572         593 :         sp->is_block_group = is_block_group;
   10573         593 :         sp->bg_extent_count = 1;
   10574             :         /* Descend the tree comparing by ptr first, then by inode. */
   10575         593 :         spin_lock(&fs_info->swapfile_pins_lock);
   10576         593 :         p = &fs_info->swapfile_pins.rb_node;
   10577        1146 :         while (*p) {
   10578        1066 :                 parent = *p;
   10579        1066 :                 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
   10580        1066 :                 if (sp->ptr < entry->ptr ||
   10581         513 :                     (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
   10582         534 :                         p = &(*p)->rb_left;
   10583         532 :                 } else if (sp->ptr > entry->ptr ||
   10584         513 :                            (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
   10585          19 :                         p = &(*p)->rb_right;
   10586             :                 } else { /* same ptr and same inode: entry already present */
   10587         513 :                         if (is_block_group)
   10588         513 :                                 entry->bg_extent_count++;
   10589         513 :                         spin_unlock(&fs_info->swapfile_pins_lock);
   10590         513 :                         kfree(sp);
   10591         513 :                         return 1;
   10592             :                 }
   10593             :         }
   10594          80 :         rb_link_node(&sp->node, parent, p);
   10595          80 :         rb_insert_color(&sp->node, &fs_info->swapfile_pins);
   10596          80 :         spin_unlock(&fs_info->swapfile_pins_lock);
   10597          80 :         return 0;
   10598             : }
   10599             : 
   10600             : /*
                     :  * Free all of the entries pinned by this swapfile.
                     :  *
                     :  * Walks the whole fs_info->swapfile_pins tree under swapfile_pins_lock and
                     :  * erases and frees every entry whose inode is @inode. For block group
                     :  * entries, btrfs_dec_block_group_swap_extents() is called with the entry's
                     :  * bg_extent_count, followed by btrfs_put_block_group().
                     :  */
   10601          43 : static void btrfs_free_swapfile_pins(struct inode *inode)
   10602             : {
   10603          43 :         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
   10604          43 :         struct btrfs_swapfile_pin *sp;
   10605          43 :         struct rb_node *node, *next;
   10606             : 
   10607          43 :         spin_lock(&fs_info->swapfile_pins_lock);
   10608          43 :         node = rb_first(&fs_info->swapfile_pins);
   10609         123 :         while (node) {
   10610          80 :                 next = rb_next(node); /* fetch the successor before node may be erased and freed */
   10611          80 :                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
   10612          80 :                 if (sp->inode == inode) {
   10613          80 :                         rb_erase(&sp->node, &fs_info->swapfile_pins);
   10614          80 :                         if (sp->is_block_group) {
   10615          40 :                                 btrfs_dec_block_group_swap_extents(sp->ptr,
   10616             :                                                            sp->bg_extent_count);
   10617          40 :                                 btrfs_put_block_group(sp->ptr);
   10618             :                         }
   10619          80 :                         kfree(sp);
   10620             :                 }
   10621             :                 node = next;
   10622             :         }
   10623          43 :         spin_unlock(&fs_info->swapfile_pins_lock);
   10624          43 : }
   10625             : /*
                     :  * State handed to btrfs_add_swap_extent(): the extent to add plus running
                     :  * totals that btrfs_add_swap_extent() updates.
                     :  */
   10626             : struct btrfs_swap_info {
   10627             :         u64 start;              /* presumably the file offset of the extent - confirm; only tested against 0 */
   10628             :         u64 block_start;        /* byte address of the extent, converted to page numbers when added */
   10629             :         u64 block_len;          /* length of the extent in bytes */
   10630             :         u64 lowest_ppage;       /* lowest page number recorded so far */
   10631             :         u64 highest_ppage;      /* highest page number recorded so far */
   10632             :         unsigned long nr_pages; /* total pages added so far */
   10633             :         int nr_extents;         /* sum of the add_swap_extent() return values */
   10634             : };
   10635             : /*
                     :  * Pass the extent described by bsi->block_start and bsi->block_len to
                     :  * add_swap_extent(), trimmed to whole pages and to at most sis->max pages
                     :  * in total, and update the running totals in @bsi (lowest_ppage,
                     :  * highest_ppage, nr_extents, nr_pages).
                     :  *
                     :  * Returns 0 on success or when there is nothing to add, or the negative
                     :  * value returned by add_swap_extent() on failure.
                     :  */
   10636          39 : static int btrfs_add_swap_extent(struct swap_info_struct *sis,
   10637             :                                  struct btrfs_swap_info *bsi)
   10638             : {
   10639          39 :         unsigned long nr_pages;
   10640          39 :         unsigned long max_pages;
   10641          39 :         u64 first_ppage, first_ppage_reported, next_ppage;
   10642          39 :         int ret;
   10643             : 
   10644             :         /*
   10645             :          * Our swapfile may have had its size extended after the swap header was
   10646             :          * written. In that case activating the swapfile should not go beyond
   10647             :          * the max size set in the swap header.
   10648             :          */
   10649          39 :         if (bsi->nr_pages >= sis->max)
   10650             :                 return 0;
   10651             :         /* Only whole pages inside the extent count: round the start up and the end down. */
   10652          39 :         max_pages = sis->max - bsi->nr_pages;
   10653          39 :         first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
   10654          39 :         next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
   10655             :         /* Nothing to add if the extent does not contain a whole page. */
   10656          39 :         if (first_ppage >= next_ppage)
   10657             :                 return 0;
   10658          39 :         nr_pages = next_ppage - first_ppage;
   10659          39 :         nr_pages = min(nr_pages, max_pages);
   10660             :         /*
                     :          * When bsi->start is 0 the first page is left out of the
                     :          * lowest_ppage tracking (presumably it is the swap header page -
                     :          * confirm). Note highest_ppage uses next_ppage as computed before
                     :          * nr_pages was limited to max_pages.
                     :          */
   10661          39 :         first_ppage_reported = first_ppage;
   10662          39 :         if (bsi->start == 0)
   10663          39 :                 first_ppage_reported++;
   10664          39 :         if (bsi->lowest_ppage > first_ppage_reported)
   10665          39 :                 bsi->lowest_ppage = first_ppage_reported;
   10666          39 :         if (bsi->highest_ppage < (next_ppage - 1))
   10667          39 :                 bsi->highest_ppage = next_ppage - 1;
   10668             : 
   10669          39 :         ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
   10670          39 :         if (ret < 0)
   10671             :                 return ret;
   10672          39 :         bsi->nr_extents += ret;
   10673          39 :         bsi->nr_pages += nr_pages;
   10674          39 :         return 0;
   10675             : }
   10676             : /* Free every pin recorded for this swapfile's inode and decrement its root's nr_swapfiles counter. */
   10677          43 : static void btrfs_swap_deactivate(struct file *file)
   10678             : {
   10679          43 :         struct inode *inode = file_inode(file);
   10680             : 
   10681          43 :         btrfs_free_swapfile_pins(inode);
   10682          43 :         atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
   10683          43 : }
   10684             : 
   10685          44 : static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
   10686             :                                sector_t *span)
   10687             : {
   10688          44 :         struct inode *inode = file_inode(file);
   10689          44 :         struct btrfs_root *root = BTRFS_I(inode)->root;
   10690          44 :         struct btrfs_fs_info *fs_info = root->fs_info;
   10691          44 :         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
   10692          44 :         struct extent_state *cached_state = NULL;
   10693          44 :         struct extent_map *em = NULL;
   10694          44 :         struct btrfs_device *device = NULL;
   10695          44 :         struct btrfs_swap_info bsi = {
   10696             :                 .lowest_ppage = (sector_t)-1ULL,
   10697             :         };
   10698          44 :         int ret = 0;
   10699          44 :         u64 isize;
   10700          44 :         u64 start;
   10701             : 
   10702             :         /*
   10703             :          * If the swap file was just created, make sure delalloc is done. If the
   10704             :          * file changes again after this, the user is doing something stupid and
   10705             :          * we don't really care.
   10706             :          */
   10707          44 :         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
   10708          44 :         if (ret)
   10709             :                 return ret;
   10710             : 
   10711             :         /*
   10712             :          * The inode is locked, so these flags won't change after we check them.
   10713             :          */
   10714          44 :         if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
   10715           0 :                 btrfs_warn(fs_info, "swapfile must not be compressed");
   10716           0 :                 return -EINVAL;
   10717             :         }
   10718          44 :         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
   10719           1 :                 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
   10720           1 :                 return -EINVAL;
   10721             :         }
   10722          43 :         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
   10723           0 :                 btrfs_warn(fs_info, "swapfile must not be checksummed");
   10724           0 :                 return -EINVAL;
   10725             :         }
   10726             : 
   10727             :         /*
   10728             :          * Balance or device remove/replace/resize can move stuff around from
   10729             :          * under us. The exclop protection makes sure they aren't running/won't
   10730             :          * run concurrently while we are mapping the swap extents, and
   10731             :          * fs_info->swapfile_pins prevents them from running while the swap
   10732             :          * file is active and moving the extents. Note that this also prevents
   10733             :          * a concurrent device add which isn't actually necessary, but it's not
   10734             :          * really worth the trouble to allow it.
   10735             :          */
   10736          43 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
   10737           0 :                 btrfs_warn(fs_info,
   10738             :            "cannot activate swapfile while exclusive operation is running");
   10739           0 :                 return -EBUSY;
   10740             :         }
   10741             : 
   10742             :         /*
   10743             :          * Prevent snapshot creation while we are activating the swap file.
   10744             :          * We do not want to race with snapshot creation. If snapshot creation
   10745             :          * already started before we bumped nr_swapfiles from 0 to 1 and
   10746             :          * completes before the first write into the swap file after it is
   10747             :          * activated, then that write would fall back to COW.
   10748             :          */
   10749          43 :         if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
   10750           0 :                 btrfs_exclop_finish(fs_info);
   10751           0 :                 btrfs_warn(fs_info,
   10752             :            "cannot activate swapfile because snapshot creation is in progress");
   10753           0 :                 return -EINVAL;
   10754             :         }
   10755             :         /*
   10756             :          * Snapshots can create extents which require COW even if NODATACOW is
   10757             :          * set. We use this counter to prevent snapshots. We must increment it
   10758             :          * before walking the extents because we don't want a concurrent
   10759             :          * snapshot to run after we've already checked the extents.
   10760             :          *
   10761             :          * It is possible that subvolume is marked for deletion but still not
   10762             :          * removed yet. To prevent this race, we check the root status before
   10763             :          * activating the swapfile.
   10764             :          */
   10765          43 :         spin_lock(&root->root_item_lock);
   10766          43 :         if (btrfs_root_dead(root)) {
   10767           0 :                 spin_unlock(&root->root_item_lock);
   10768             : 
   10769           0 :                 btrfs_exclop_finish(fs_info);
   10770           0 :                 btrfs_warn(fs_info,
   10771             :                 "cannot activate swapfile because subvolume %llu is being deleted",
   10772             :                         root->root_key.objectid);
   10773           0 :                 return -EPERM;
   10774             :         }
   10775          43 :         atomic_inc(&root->nr_swapfiles);
   10776          43 :         spin_unlock(&root->root_item_lock);
   10777             : 
   10778          43 :         isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
   10779             : 
   10780          43 :         lock_extent(io_tree, 0, isize - 1, &cached_state);
   10781          43 :         start = 0;
   10782         596 :         while (start < isize) {
   10783         557 :                 u64 logical_block_start, physical_block_start;
   10784         557 :                 struct btrfs_block_group *bg;
   10785         557 :                 u64 len = isize - start;
   10786             : 
   10787         557 :                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
   10788         557 :                 if (IS_ERR(em)) {
   10789           0 :                         ret = PTR_ERR(em);
   10790           4 :                         goto out;
   10791             :                 }
   10792             : 
   10793         557 :                 if (em->block_start == EXTENT_MAP_HOLE) {
   10794           1 :                         btrfs_warn(fs_info, "swapfile must not have holes");
   10795           1 :                         ret = -EINVAL;
   10796           1 :                         goto out;
   10797             :                 }
   10798         556 :                 if (em->block_start == EXTENT_MAP_INLINE) {
   10799             :                         /*
   10800             :                          * It's unlikely we'll ever actually find ourselves
   10801             :                          * here, as a file small enough to fit inline won't be
   10802             :                          * big enough to store more than the swap header, but in
   10803             :                          * case something changes in the future, let's catch it
   10804             :                          * here rather than later.
   10805             :                          */
   10806           0 :                         btrfs_warn(fs_info, "swapfile must not be inline");
   10807           0 :                         ret = -EINVAL;
   10808           0 :                         goto out;
   10809             :                 }
   10810        1112 :                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
   10811           0 :                         btrfs_warn(fs_info, "swapfile must not be compressed");
   10812           0 :                         ret = -EINVAL;
   10813           0 :                         goto out;
   10814             :                 }
   10815             : 
   10816         556 :                 logical_block_start = em->block_start + (start - em->start);
   10817         556 :                 len = min(len, em->len - (start - em->start));
   10818         556 :                 free_extent_map(em);
   10819         556 :                 em = NULL;
   10820             : 
   10821         556 :                 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
   10822         556 :                 if (ret < 0) {
   10823           0 :                         goto out;
   10824         556 :                 } else if (ret) {
   10825         553 :                         ret = 0;
   10826             :                 } else {
   10827           3 :                         btrfs_warn(fs_info,
   10828             :                                    "swapfile must not be copy-on-write");
   10829           3 :                         ret = -EINVAL;
   10830           3 :                         goto out;
   10831             :                 }
   10832             : 
   10833         553 :                 em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
   10834         553 :                 if (IS_ERR(em)) {
   10835           0 :                         ret = PTR_ERR(em);
   10836           0 :                         goto out;
   10837             :                 }
   10838             : 
   10839         553 :                 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
   10840           0 :                         btrfs_warn(fs_info,
   10841             :                                    "swapfile must have single data profile");
   10842           0 :                         ret = -EINVAL;
   10843           0 :                         goto out;
   10844             :                 }
   10845             : 
   10846         553 :                 if (device == NULL) {
   10847          40 :                         device = em->map_lookup->stripes[0].dev;
   10848          40 :                         ret = btrfs_add_swapfile_pin(inode, device, false);
   10849          40 :                         if (ret == 1)
   10850             :                                 ret = 0;
   10851          40 :                         else if (ret)
   10852           0 :                                 goto out;
   10853         513 :                 } else if (device != em->map_lookup->stripes[0].dev) {
   10854           0 :                         btrfs_warn(fs_info, "swapfile must be on one device");
   10855           0 :                         ret = -EINVAL;
   10856           0 :                         goto out;
   10857             :                 }
   10858             : 
   10859         553 :                 physical_block_start = (em->map_lookup->stripes[0].physical +
   10860         553 :                                         (logical_block_start - em->start));
   10861         553 :                 len = min(len, em->len - (logical_block_start - em->start));
   10862         553 :                 free_extent_map(em);
   10863         553 :                 em = NULL;
   10864             : 
   10865         553 :                 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
   10866         553 :                 if (!bg) {
   10867           0 :                         btrfs_warn(fs_info,
   10868             :                            "could not find block group containing swapfile");
   10869           0 :                         ret = -EINVAL;
   10870           0 :                         goto out;
   10871             :                 }
   10872             : 
   10873         553 :                 if (!btrfs_inc_block_group_swap_extents(bg)) {
   10874           0 :                         btrfs_warn(fs_info,
   10875             :                            "block group for swapfile at %llu is read-only%s",
   10876             :                            bg->start,
   10877             :                            atomic_read(&fs_info->scrubs_running) ?
   10878             :                                        " (scrub running)" : "");
   10879           0 :                         btrfs_put_block_group(bg);
   10880           0 :                         ret = -EINVAL;
   10881           0 :                         goto out;
   10882             :                 }
   10883             : 
   10884         553 :                 ret = btrfs_add_swapfile_pin(inode, bg, true);
   10885         553 :                 if (ret) {
   10886         513 :                         btrfs_put_block_group(bg);
   10887         513 :                         if (ret == 1)
   10888             :                                 ret = 0;
   10889             :                         else
   10890           0 :                                 goto out;
   10891             :                 }
   10892             : 
   10893         553 :                 if (bsi.block_len &&
   10894         513 :                     bsi.block_start + bsi.block_len == physical_block_start) {
   10895         513 :                         bsi.block_len += len;
   10896             :                 } else {
   10897          40 :                         if (bsi.block_len) {
   10898           0 :                                 ret = btrfs_add_swap_extent(sis, &bsi);
   10899           0 :                                 if (ret)
   10900           0 :                                         goto out;
   10901             :                         }
   10902          40 :                         bsi.start = start;
   10903          40 :                         bsi.block_start = physical_block_start;
   10904          40 :                         bsi.block_len = len;
   10905             :                 }
   10906             : 
   10907         553 :                 start += len;
   10908             :         }
   10909             : 
   10910          39 :         if (bsi.block_len)
   10911          39 :                 ret = btrfs_add_swap_extent(sis, &bsi);
   10912             : 
   10913           0 : out:
   10914          44 :         if (!IS_ERR_OR_NULL(em))
   10915           1 :                 free_extent_map(em);
   10916             : 
   10917          43 :         unlock_extent(io_tree, 0, isize - 1, &cached_state);
   10918             : 
   10919          43 :         if (ret)
   10920           4 :                 btrfs_swap_deactivate(file);
   10921             : 
   10922          43 :         btrfs_drew_write_unlock(&root->snapshot_lock);
   10923             : 
   10924          43 :         btrfs_exclop_finish(fs_info);
   10925             : 
   10926          43 :         if (ret)
   10927             :                 return ret;
   10928             : 
   10929          39 :         if (device)
   10930          39 :                 sis->bdev = device->bdev;
   10931          39 :         *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
   10932          39 :         sis->max = bsi.nr_pages;
   10933          39 :         sis->pages = bsi.nr_pages - 1;
   10934          39 :         sis->highest_bit = bsi.nr_pages - 1;
   10935          39 :         return bsi.nr_extents;
   10936             : }
   10937             : #else
   10938             : static void btrfs_swap_deactivate(struct file *file) /* No-op stub in the #else branch; the guarding condition is outside this chunk (presumably CONFIG_SWAP -- TODO confirm). */
   10939             : {
   10940             : }
   10941             : 
   10942             : static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
   10943             :                                sector_t *span) /* Stub in the #else branch: none of the arguments are read or written. */
   10944             : {
   10945             :         return -EOPNOTSUPP; /* Unconditionally reports that activation is not supported in this configuration. */
   10946             : }
   10947             : #endif
   10948             : 
   10949             : /*
   10950             :  * Update the number of bytes used in the VFS' inode. When we replace extents in
   10951             :  * a range (clone, dedupe, fallocate's zero range), we must update the number of
   10952             :  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
   10953             :  * always get a correct value. Nothing is touched when @add_bytes equals @del_bytes.
   10954             :  */
   10955    20215905 : void btrfs_update_inode_bytes(struct btrfs_inode *inode,
   10956             :                               const u64 add_bytes,
   10957             :                               const u64 del_bytes)
   10958             : {
   10959    20215905 :         if (add_bytes == del_bytes) /* Net change is zero: return without taking the lock. */
   10960             :                 return;
   10961             : 
   10962    18466517 :         spin_lock(&inode->lock); /* Both adjustments below are made while holding inode->lock. */
   10963    18466533 :         if (del_bytes > 0) /* The subtraction is applied before the addition. */
   10964     2758636 :                 inode_sub_bytes(&inode->vfs_inode, del_bytes);
   10965    18466540 :         if (add_bytes > 0)
   10966    15913802 :                 inode_add_bytes(&inode->vfs_inode, add_bytes);
   10967    18466542 :         spin_unlock(&inode->lock);
   10968             : }
   10969             : 
   10970             : /*
   10971             :  * Verify that there are no ordered extents for a given file range.
   10972             :  *
   10973             :  * @inode:   The target inode.
   10974             :  * @start:   Start offset of the file range, should be sector size aligned.
   10975             :  * @end:     End offset (inclusive) of the file range, its value +1 should be
   10976             :  *           sector size aligned.
   10977             :  *
   10978             :  * This should typically be used for cases where we locked an inode's VFS lock in
   10979             :  * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
   10980             :  * we have flushed all delalloc in the range, we have waited for all ordered
   10981             :  * extents in the range to complete and finally we have locked the file range in
   10982             :  * the inode's io_tree. The whole check is skipped unless CONFIG_BTRFS_ASSERT is enabled.
   10983             :  */
   10984     5877549 : void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
   10985             : {
   10986     5877549 :         struct btrfs_root *root = inode->root;
   10987     5877549 :         struct btrfs_ordered_extent *ordered;
   10988             : 
   10989     5877549 :         if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) /* Debug-only check: return immediately when assertions are configured out. */
   10990     5877549 :                 return;
   10991             : 
   10992             :         ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); /* @end is inclusive, so the length passed is end + 1 - start. */
   10993             :         if (ordered) { /* Log which ordered extent was found before the ASSERT below trips. */
   10994             :                 btrfs_err(root->fs_info,
   10995             : "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
   10996             :                           start, end, btrfs_ino(inode), root->root_key.objectid,
   10997             :                           ordered->file_offset,
   10998             :                           ordered->file_offset + ordered->num_bytes - 1);
   10999             :                 btrfs_put_ordered_extent(ordered); /* presumably drops the reference returned by the lookup -- TODO confirm */
   11000             :         }
   11001             : 
   11002     5877549 :         ASSERT(ordered == NULL); /* Only the pointer value is tested here; ordered is not dereferenced after the put above. */
   11003             : }
   11004             : 
   11005             : static const struct inode_operations btrfs_dir_inode_operations = { /* inode_operations table named for directories; where it is installed is outside this chunk. */
   11006             :         .getattr        = btrfs_getattr,
   11007             :         .lookup         = btrfs_lookup,
   11008             :         .create         = btrfs_create,
   11009             :         .unlink         = btrfs_unlink,
   11010             :         .link           = btrfs_link,
   11011             :         .mkdir          = btrfs_mkdir,
   11012             :         .rmdir          = btrfs_rmdir,
   11013             :         .rename         = btrfs_rename2, /* Note the handler name: btrfs_rename2. */
   11014             :         .symlink        = btrfs_symlink,
   11015             :         .setattr        = btrfs_setattr,
   11016             :         .mknod          = btrfs_mknod,
   11017             :         .listxattr      = btrfs_listxattr,
   11018             :         .permission     = btrfs_permission,
   11019             :         .get_inode_acl  = btrfs_get_acl,
   11020             :         .set_acl        = btrfs_set_acl,
   11021             :         .update_time    = btrfs_update_time,
   11022             :         .tmpfile        = btrfs_tmpfile,
   11023             :         .fileattr_get   = btrfs_fileattr_get,
   11024             :         .fileattr_set   = btrfs_fileattr_set,
   11025             : };
   11026             : 
   11027             : static const struct file_operations btrfs_dir_file_operations = { /* file_operations table named for directories; where it is installed is outside this chunk. */
   11028             :         .llseek         = generic_file_llseek,
   11029             :         .read           = generic_read_dir, /* Not a btrfs-specific handler. */
   11030             :         .iterate_shared = btrfs_real_readdir,
   11031             :         .open           = btrfs_opendir,
   11032             :         .unlocked_ioctl = btrfs_ioctl,
   11033             : #ifdef CONFIG_COMPAT
   11034             :         .compat_ioctl   = btrfs_compat_ioctl, /* Only set when CONFIG_COMPAT is defined. */
   11035             : #endif
   11036             :         .release        = btrfs_release_file,
   11037             :         .fsync          = btrfs_sync_file,
   11038             : };
   11039             : 
   11040             : /*
   11041             :  * Btrfs doesn't support the bmap operation because swapfiles
   11042             :  * use bmap to make a mapping of extents in the file.  They assume
   11043             :  * these extents won't change over the life of the file and they
   11044             :  * use the bmap result to do IO directly to the drive.
   11045             :  *
   11046             :  * A btrfs bmap call would return logical addresses that aren't
   11047             :  * suitable for IO, and they would also change frequently as COW
   11048             :  * operations happen.  So, swapfile + btrfs == corruption.
   11049             :  *
   11050             :  * For now we're avoiding this by dropping bmap: no .bmap hook is set below, while .swap_activate and .swap_deactivate are.
   11051             :  */
   11052             : static const struct address_space_operations btrfs_aops = {
   11053             :         .read_folio     = btrfs_read_folio,
   11054             :         .writepages     = btrfs_writepages,
   11055             :         .readahead      = btrfs_readahead,
   11056             :         .invalidate_folio = btrfs_invalidate_folio,
   11057             :         .release_folio  = btrfs_release_folio,
   11058             :         .migrate_folio  = btrfs_migrate_folio,
   11059             :         .dirty_folio    = filemap_dirty_folio,
   11060             :         .error_remove_page = generic_error_remove_page,
   11061             :         .swap_activate  = btrfs_swap_activate, /* Either the real implementation or the -EOPNOTSUPP stub, depending on the preprocessor branch compiled. */
   11062             :         .swap_deactivate = btrfs_swap_deactivate,
   11063             : };
   11064             : 
   11065             : static const struct inode_operations btrfs_file_inode_operations = { /* inode_operations table named for files; the only table in this chunk that sets .fiemap. */
   11066             :         .getattr        = btrfs_getattr,
   11067             :         .setattr        = btrfs_setattr,
   11068             :         .listxattr      = btrfs_listxattr,
   11069             :         .permission     = btrfs_permission,
   11070             :         .fiemap         = btrfs_fiemap,
   11071             :         .get_inode_acl  = btrfs_get_acl,
   11072             :         .set_acl        = btrfs_set_acl,
   11073             :         .update_time    = btrfs_update_time,
   11074             :         .fileattr_get   = btrfs_fileattr_get,
   11075             :         .fileattr_set   = btrfs_fileattr_set,
   11076             : };
   11077             : static const struct inode_operations btrfs_special_inode_operations = { /* inode_operations table named for "special" inodes (presumably device nodes, FIFOs and sockets -- TODO confirm against the code that installs it). */
   11078             :         .getattr        = btrfs_getattr,
   11079             :         .setattr        = btrfs_setattr,
   11080             :         .permission     = btrfs_permission,
   11081             :         .listxattr      = btrfs_listxattr,
   11082             :         .get_inode_acl  = btrfs_get_acl,
   11083             :         .set_acl        = btrfs_set_acl,
   11084             :         .update_time    = btrfs_update_time,
   11085             : };
   11086             : static const struct inode_operations btrfs_symlink_inode_operations = { /* inode_operations table named for symlinks; it sets no ACL hooks (.get_inode_acl / .set_acl). */
   11087             :         .get_link       = page_get_link, /* Not a btrfs-specific handler. */
   11088             :         .getattr        = btrfs_getattr,
   11089             :         .setattr        = btrfs_setattr,
   11090             :         .permission     = btrfs_permission,
   11091             :         .listxattr      = btrfs_listxattr,
   11092             :         .update_time    = btrfs_update_time,
   11093             : };
   11094             : 
   11095             : const struct dentry_operations btrfs_dentry_operations = { /* Not static, unlike the operation tables defined before it in this chunk, so it has external linkage. */
   11096             :         .d_delete       = btrfs_dentry_delete, /* The only dentry hook set by this table. */
   11097             : };

Generated by: LCOV version 1.14