LCOV - code coverage report
Current view: top level - fs/btrfs - file.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023 Lines: 1583 1834 86.3 %
Date: 2023-07-31 20:08:22 Functions: 44 46 95.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2007 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/fs.h>
       7             : #include <linux/pagemap.h>
       8             : #include <linux/time.h>
       9             : #include <linux/init.h>
      10             : #include <linux/string.h>
      11             : #include <linux/backing-dev.h>
      12             : #include <linux/falloc.h>
      13             : #include <linux/writeback.h>
      14             : #include <linux/compat.h>
      15             : #include <linux/slab.h>
      16             : #include <linux/btrfs.h>
      17             : #include <linux/uio.h>
      18             : #include <linux/iversion.h>
      19             : #include <linux/fsverity.h>
      20             : #include "ctree.h"
      21             : #include "disk-io.h"
      22             : #include "transaction.h"
      23             : #include "btrfs_inode.h"
      24             : #include "print-tree.h"
      25             : #include "tree-log.h"
      26             : #include "locking.h"
      27             : #include "volumes.h"
      28             : #include "qgroup.h"
      29             : #include "compression.h"
      30             : #include "delalloc-space.h"
      31             : #include "reflink.h"
      32             : #include "subpage.h"
      33             : #include "fs.h"
      34             : #include "accessors.h"
      35             : #include "extent-tree.h"
      36             : #include "file-item.h"
      37             : #include "ioctl.h"
      38             : #include "file.h"
      39             : #include "super.h"
      40             : 
      41             : /* Simple helper to fault in pages and copy.  This should go away
      42             :  * and be replaced with calls into generic code.
                     :  *
                     :  * Copies up to @write_bytes bytes from the iterator @i into
                     :  * @prepared_pages, starting at the in-page offset of @pos within the
                     :  * first page and moving to the next page each time a page is filled.
                     :  *
                     :  * Returns the total number of bytes copied, which can be less than
                     :  * @write_bytes (including 0) when a copy from @i comes up short.
      43             :  */
      44    27356946 : static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
      45             :                                          struct page **prepared_pages,
      46             :                                          struct iov_iter *i)
      47             : {
      48    27356946 :         size_t copied = 0;
      49    27356946 :         size_t total_copied = 0;
      50    27356946 :         int pg = 0;
      51    27356946 :         int offset = offset_in_page(pos);
      52             : 
      53    96953371 :         while (write_bytes > 0) {
      54    69536922 :                 size_t count = min_t(size_t,
      55             :                                      PAGE_SIZE - offset, write_bytes);
      56    69536922 :                 struct page *page = prepared_pages[pg];
      57             :                 /*
      58             :                  * Copy data from userspace to the current page; copied is
                     :                  * the number of bytes actually transferred, at most count.
      59             :                  */
      60    69536922 :                 copied = copy_page_from_iter_atomic(page, offset, count, i);
      61             : 
      62             :                 /* Flush processor's dcache for this page */
      63    69596425 :                 flush_dcache_page(page);
      64             : 
      65             :                 /*
      66             :                  * If we get a partial write, we can end up with
      67             :                  * partially up to date pages.  These add
      68             :                  * a lot of complexity, so make sure they don't
      69             :                  * happen by forcing this copy to be retried: for a page
                     :                  * that is not uptodate, the iterator is rewound by what
                     :                  * was copied and the copy is counted as 0 bytes, which
                     :                  * ends the loop.  A short but non-zero copy into an
                     :                  * uptodate page is kept and accounted as usual.
      70             :                  *
      71             :                  * The rest of the btrfs_file_write code will fall
      72             :                  * back to page at a time copies after we return 0.
      73             :                  */
      74    69596425 :                 if (unlikely(copied < count)) {
      75           0 :                         if (!PageUptodate(page)) {
      76           0 :                                 iov_iter_revert(i, copied);
      77           0 :                                 copied = 0;
      78             :                         }
      79           0 :                         if (!copied)
      80             :                                 break;
      81             :                 }
      82             : 
      83    69596425 :                 write_bytes -= copied;
      84    69596425 :                 total_copied += copied;
      85    69596425 :                 offset += copied;
      86    69596425 :                 if (offset == PAGE_SIZE) {
      87    58636706 :                         pg++;
      88    58636706 :                         offset = 0;
      89             :                 }
      90             :         }
      91    27416449 :         return total_copied;
      92             : }
      93             : 
      94             : /*
      95             :  * Unlocks pages after btrfs_file_write is done with them.
                     :  *
                     :  * For each of the @num_pages entries in @pages this calls
                     :  * btrfs_page_clamp_clear_checked() on the sectorsize-aligned range that
                     :  * covers [@pos, @pos + @copied), then unlocks the page and drops a
                     :  * reference on it with put_page().
      96             :  */
      97    27437960 : static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
      98             :                              struct page **pages, size_t num_pages,
      99             :                              u64 pos, u64 copied)
     100             : {
     101    27437960 :         size_t i;
     102    27437960 :         u64 block_start = round_down(pos, fs_info->sectorsize);
     103    27437960 :         u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
     104             : 
     105    27437960 :         ASSERT(block_len <= U32_MAX);
     106    97059897 :         for (i = 0; i < num_pages; i++) {
     107             :                 /* Page checked is some magic around finding pages that
     108             :                  * have been modified without going through
     109             :                  * btrfs_set_page_dirty; clear it here. There should be no
     110             :                  * need to mark the pages accessed, as prepare_pages should
     111             :                  * have marked them accessed via find_or_create_page().
     112             :                  */
     113    69620094 :                 btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
     114             :                                                block_len);
     115    69628640 :                 unlock_page(pages[i]);
     116    69621213 :                 put_page(pages[i]);
     117             :         }
     118    27439803 : }
     119             : 
     120             : /*
     121             :  * After btrfs_copy_from_user(), update the following things for delalloc:
     122             :  * - Mark newly dirtied pages as DELALLOC in the io tree.
     123             :  *   Used to advise which range is to be written back.
     124             :  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
     125             :  * - Update inode size for past EOF write
                     :  *
                     :  * The range operated on is [@pos, @pos + @write_bytes) expanded outwards to
                     :  * sectorsize boundaries.  If @noreserve is true, EXTENT_NORESERVE is passed
                     :  * as an extra bit to btrfs_set_extent_delalloc().  @cached is handed to
                     :  * both clear_extent_bit() and btrfs_set_extent_delalloc().
                     :  *
                     :  * Returns 0 on success or when @write_bytes is 0, otherwise the error
                     :  * returned by btrfs_set_extent_delalloc(); in that case the pages are not
                     :  * touched and the inode size is not updated.
     126             :  */
     127    27349144 : int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
     128             :                       size_t num_pages, loff_t pos, size_t write_bytes,
     129             :                       struct extent_state **cached, bool noreserve)
     130             : {
     131    27349144 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     132    27349144 :         int err = 0;
     133    27349144 :         int i;
     134    27349144 :         u64 num_bytes;
     135    27349144 :         u64 start_pos;
     136    27349144 :         u64 end_of_last_block;
     137    27349144 :         u64 end_pos = pos + write_bytes;
     138    27349144 :         loff_t isize = i_size_read(&inode->vfs_inode);
     139    27349144 :         unsigned int extra_bits = 0;
     140             : 
     141    27349144 :         if (write_bytes == 0)
     142             :                 return 0;
     143             : 
     144    27349144 :         if (noreserve)
     145        5282 :                 extra_bits |= EXTENT_NORESERVE;
     146             : 
     147    27349144 :         start_pos = round_down(pos, fs_info->sectorsize);
     148    27349144 :         num_bytes = round_up(write_bytes + pos - start_pos,
     149             :                              fs_info->sectorsize);
     150    27349144 :         ASSERT(num_bytes <= U32_MAX);
     151             : 
     152    27349144 :         end_of_last_block = start_pos + num_bytes - 1;
     153             : 
     154             :         /*
     155             :          * The pages may have already been dirty; clear out old accounting so
     156             :          * we can set things up properly.
     157             :          */
     158    27349144 :         clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
     159             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
     160             :                          cached);
     161             : 
     162    27431872 :         err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
     163             :                                         extra_bits, cached);
     164    27421162 :         if (err)
     165             :                 return err;
     166             : 
     167    96991490 :         for (i = 0; i < num_pages; i++) {
     168    69600034 :                 struct page *p = pages[i];
     169             : 
     170    69600034 :                 btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
     171    69620512 :                 btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
     172    69620038 :                 btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
     173             :         }
     174             : 
     175             :         /*
     176             :          * We've only changed i_size in ram, and we haven't updated
     177             :          * the disk i_size.  There is no need to log the inode
     178             :          * at this time.
     179             :          */
     180    27391456 :         if (end_pos > isize)
     181    20087727 :                 i_size_write(&inode->vfs_inode, end_pos);
     182             :         return 0;
     183             : }
     184             : 
     185             : /*
     186             :  * This is very complex, but the basic idea is to drop all extents in the
     187             :  * range [args->start, args->end).  hint_block is filled in with a block number
     188             :  * that would be a good hint to the block allocator for this file.
     189             :  *
     190             :  * If an extent intersects the range but is not entirely inside the range
     191             :  * it is either truncated or split.  Anything entirely inside the range
     192             :  * is deleted from the tree.
     193             :  *
     194             :  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
     195             :  * to deal with that. We set the field 'bytes_found' of the arguments structure
     196             :  * with the number of allocated bytes found in the target range, so that the
     197             :  * caller can update the inode's number of bytes in an atomic way when
     198             :  * replacing extents in a range to avoid races with stat(2).
     199             :  */
     200    22966706 : int btrfs_drop_extents(struct btrfs_trans_handle *trans,
     201             :                        struct btrfs_root *root, struct btrfs_inode *inode,
     202             :                        struct btrfs_drop_extents_args *args)
     203             : {
     204    22966706 :         struct btrfs_fs_info *fs_info = root->fs_info;
     205    22966706 :         struct extent_buffer *leaf;
     206    22966706 :         struct btrfs_file_extent_item *fi;
     207    22966706 :         struct btrfs_ref ref = { 0 };
     208    22966706 :         struct btrfs_key key;
     209    22966706 :         struct btrfs_key new_key;
     210    22966706 :         u64 ino = btrfs_ino(inode);
     211    22966706 :         u64 search_start = args->start;
     212    22966706 :         u64 disk_bytenr = 0;
     213    22966706 :         u64 num_bytes = 0;
     214    22966706 :         u64 extent_offset = 0;
     215    22966706 :         u64 extent_end = 0;
     216    22966706 :         u64 last_end = args->start;
     217    22966706 :         int del_nr = 0;
     218    22966706 :         int del_slot = 0;
     219    22966706 :         int extent_type;
     220    22966706 :         int recow;
     221    22966706 :         int ret;
     222    22966706 :         int modify_tree = -1;
     223    22966706 :         int update_refs;
     224    22966706 :         int found = 0;
     225    22966706 :         struct btrfs_path *path = args->path;
     226             : 
     227    22966706 :         args->bytes_found = 0;
     228    22966706 :         args->extent_inserted = false;
     229             : 
     230             :         /* Must always have a path if ->replace_extent is true */
     231    22966706 :         ASSERT(!(args->replace_extent && !args->path));
     232             : 
     233    22966706 :         if (!path) {
     234      105195 :                 path = btrfs_alloc_path();
     235      105195 :                 if (!path) {
     236           0 :                         ret = -ENOMEM;
     237           0 :                         goto out;
     238             :                 }
     239             :         }
     240             : 
     241    22966706 :         if (args->drop_cache)
     242    19434925 :                 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
     243             : 
     244    22966794 :         if (args->start >= inode->disk_i_size && !args->replace_extent)
     245    13926072 :                 modify_tree = 0;
     246             : 
     247    22966794 :         update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
     248    22993763 :         while (1) {
     249    22993763 :                 recow = 0;
     250    22993763 :                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
     251             :                                                search_start, modify_tree);
     252    22994727 :                 if (ret < 0)
     253             :                         break;
     254    22994718 :                 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
     255    21296593 :                         leaf = path->nodes[0];
     256    21296593 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
     257    21296578 :                         if (key.objectid == ino &&
     258    21296527 :                             key.type == BTRFS_EXTENT_DATA_KEY)
     259    19497303 :                                 path->slots[0]--;
     260             :                 }
     261             :                 ret = 0;
     262             : next_slot:
     263    41042010 :                 leaf = path->nodes[0];
     264    41042010 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
     265    14220667 :                         BUG_ON(del_nr > 0);
     266    14220667 :                         ret = btrfs_next_leaf(root, path);
     267    14220662 :                         if (ret < 0)
     268             :                                 break;
     269    14220662 :                         if (ret > 0) {
     270             :                                 ret = 0;
     271             :                                 break;
     272             :                         }
     273       29107 :                         leaf = path->nodes[0];
     274       29107 :                         recow = 1;
     275             :                 }
     276             : 
     277    26850450 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     278             : 
     279    26850524 :                 if (key.objectid > ino)
     280             :                         break;
     281    24664067 :                 if (WARN_ON_ONCE(key.objectid < ino) ||
     282    24664067 :                     key.type < BTRFS_EXTENT_DATA_KEY) {
     283           0 :                         ASSERT(del_nr == 0);
     284           0 :                         path->slots[0]++;
     285           0 :                         goto next_slot;
     286             :                 }
     287    24664067 :                 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
     288             :                         break;
     289             : 
     290    22692579 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     291             :                                     struct btrfs_file_extent_item);
     292    22692480 :                 extent_type = btrfs_file_extent_type(leaf, fi);
     293             : 
     294    22692398 :                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
     295             :                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
     296    22465204 :                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
     297    22465200 :                         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
     298    22465218 :                         extent_offset = btrfs_file_extent_offset(leaf, fi);
     299    22465244 :                         extent_end = key.offset +
     300             :                                 btrfs_file_extent_num_bytes(leaf, fi);
     301      227194 :                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     302      227194 :                         extent_end = key.offset +
     303             :                                 btrfs_file_extent_ram_bytes(leaf, fi);
     304             :                 } else {
     305             :                         /* can't happen */
     306           0 :                         BUG();
     307             :                 }
     308             : 
     309             :                 /*
     310             :                  * Don't skip extent items representing 0 byte lengths. They
     311             :                  * used to be created (bug) if while punching holes we hit
     312             :                  * -ENOSPC condition. So if we find one here, just ensure we
     313             :                  * delete it, otherwise we would insert a new file extent item
     314             :                  * with the same key (offset) as that 0 bytes length file
     315             :                  * extent item in the call to setup_items_for_insert() later
     316             :                  * in this function.
     317             :                  */
     318    22692465 :                 if (extent_end == key.offset && extent_end >= search_start) {
     319           0 :                         last_end = extent_end;
     320           0 :                         goto delete_extent_item;
     321             :                 }
     322             : 
     323    22692465 :                 if (extent_end <= search_start) {
     324    16524699 :                         path->slots[0]++;
     325    16524699 :                         goto next_slot;
     326             :                 }
     327             : 
     328     6167766 :                 found = 1;
     329     6167766 :                 search_start = max(key.offset, args->start);
     330     6167766 :                 if (recow || !modify_tree) {
     331        9381 :                         modify_tree = -1;
     332        9381 :                         btrfs_release_path(path);
     333        9381 :                         continue;
     334             :                 }
     335             : 
     336             :                 /*
     337             :                  *     | - range to drop - |
     338             :                  *  | -------- extent -------- |
     339             :                  */
     340     6158385 :                 if (args->start > key.offset && args->end < extent_end) {
     341     2391312 :                         BUG_ON(del_nr > 0);
     342     2391312 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     343             :                                 ret = -EOPNOTSUPP;
     344             :                                 break;
     345             :                         }
     346             : 
     347     2391312 :                         memcpy(&new_key, &key, sizeof(new_key));
     348     2391312 :                         new_key.offset = args->start;
     349     2391312 :                         ret = btrfs_duplicate_item(trans, root, path,
     350             :                                                    &new_key);
     351     2391302 :                         if (ret == -EAGAIN) {
     352           7 :                                 btrfs_release_path(path);
     353           7 :                                 continue;
     354             :                         }
     355     2391295 :                         if (ret < 0)
     356             :                                 break;
     357             : 
     358     2391295 :                         leaf = path->nodes[0];
     359     2391295 :                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     360             :                                             struct btrfs_file_extent_item);
     361     2391300 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     362     2391300 :                                                         args->start - key.offset);
     363             : 
     364     2391284 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     365             :                                             struct btrfs_file_extent_item);
     366             : 
     367     2391284 :                         extent_offset += args->start - key.offset;
     368     2391284 :                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
     369     2391284 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     370     2391284 :                                                         extent_end - args->start);
     371     2391287 :                         btrfs_mark_buffer_dirty(leaf);
     372             : 
     373     2391282 :                         if (update_refs && disk_bytenr > 0) {
     374     2346923 :                                 btrfs_init_generic_ref(&ref,
     375             :                                                 BTRFS_ADD_DELAYED_REF,
     376             :                                                 disk_bytenr, num_bytes, 0);
     377     2346923 :                                 btrfs_init_data_ref(&ref,
     378             :                                                 root->root_key.objectid,
     379             :                                                 new_key.objectid,
     380     2346923 :                                                 args->start - extent_offset,
     381             :                                                 0, false);
     382     2346923 :                                 ret = btrfs_inc_extent_ref(trans, &ref);
     383     2346946 :                                 if (ret) {
     384           0 :                                         btrfs_abort_transaction(trans, ret);
     385           0 :                                         break;
     386             :                                 }
     387             :                         }
     388     2391305 :                         key.offset = args->start;
     389             :                 }
     390             :                 /*
     391             :                  * From here on out we will have actually dropped something, so
     392             :                  * last_end can be updated.
     393             :                  */
     394     6158378 :                 last_end = extent_end;
     395             : 
     396             :                 /*
     397             :                  *  | ---- range to drop ----- |
     398             :                  *      | -------- extent -------- |
     399             :                  */
     400     6158378 :                 if (args->start <= key.offset && args->end < extent_end) {
     401     3232304 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     402             :                                 ret = -EOPNOTSUPP;
     403             :                                 break;
     404             :                         }
     405             : 
     406     3232303 :                         memcpy(&new_key, &key, sizeof(new_key));
     407     3232303 :                         new_key.offset = args->end;
     408     3232303 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     409             : 
     410     3232301 :                         extent_offset += args->end - key.offset;
     411     3232301 :                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
     412     3232302 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     413     3232302 :                                                         extent_end - args->end);
     414     3232301 :                         btrfs_mark_buffer_dirty(leaf);
     415     3232305 :                         if (update_refs && disk_bytenr > 0)
     416     3080246 :                                 args->bytes_found += args->end - key.offset;
     417             :                         break;
     418             :                 }
     419             : 
     420     2926074 :                 search_start = extent_end;
     421             :                 /*
     422             :                  *       | ---- range to drop ----- |
     423             :                  *  | -------- extent -------- |
     424             :                  */
     425     2926074 :                 if (args->start > key.offset && args->end >= extent_end) {
     426      579917 :                         BUG_ON(del_nr > 0);
     427      579917 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     428             :                                 ret = -EOPNOTSUPP;
     429             :                                 break;
     430             :                         }
     431             : 
     432      579917 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     433             :                                                         args->start - key.offset);
     434      579915 :                         btrfs_mark_buffer_dirty(leaf);
     435      579930 :                         if (update_refs && disk_bytenr > 0)
     436      503768 :                                 args->bytes_found += extent_end - args->start;
     437      579930 :                         if (args->end == extent_end)
     438             :                                 break;
     439             : 
     440      323469 :                         path->slots[0]++;
     441      323469 :                         goto next_slot;
     442             :                 }
     443             : 
     444             :                 /*
     445             :                  *  | ---- range to drop ----- |
     446             :                  *    | ------ extent ------ |
     447             :                  */
     448     2346157 :                 if (args->start <= key.offset && args->end >= extent_end) {
     449     2346157 : delete_extent_item:
     450     2346157 :                         if (del_nr == 0) {
     451     1597275 :                                 del_slot = path->slots[0];
     452     1597275 :                                 del_nr = 1;
     453             :                         } else {
     454      748882 :                                 BUG_ON(del_slot + del_nr != path->slots[0]);
     455      748882 :                                 del_nr++;
     456             :                         }
     457             : 
     458     2346157 :                         if (update_refs &&
     459     2346157 :                             extent_type == BTRFS_FILE_EXTENT_INLINE) {
     460      227088 :                                 args->bytes_found += extent_end - key.offset;
     461      227088 :                                 extent_end = ALIGN(extent_end,
     462             :                                                    fs_info->sectorsize);
     463     2119069 :                         } else if (update_refs && disk_bytenr > 0) {
     464     1883347 :                                 btrfs_init_generic_ref(&ref,
     465             :                                                 BTRFS_DROP_DELAYED_REF,
     466             :                                                 disk_bytenr, num_bytes, 0);
     467     1883347 :                                 btrfs_init_data_ref(&ref,
     468             :                                                 root->root_key.objectid,
     469             :                                                 key.objectid,
     470     1883347 :                                                 key.offset - extent_offset, 0,
     471             :                                                 false);
     472     1883347 :                                 ret = btrfs_free_extent(trans, &ref);
     473     1883348 :                                 if (ret) {
     474           0 :                                         btrfs_abort_transaction(trans, ret);
     475           0 :                                         break;
     476             :                                 }
     477     1883348 :                                 args->bytes_found += extent_end - key.offset;
     478             :                         }
     479             : 
     480     2346158 :                         if (args->end == extent_end)
     481             :                                 break;
     482             : 
     483     1216720 :                         if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
     484     1199139 :                                 path->slots[0]++;
     485     1199139 :                                 goto next_slot;
     486             :                         }
     487             : 
     488       17581 :                         ret = btrfs_del_items(trans, root, path, del_slot,
     489             :                                               del_nr);
     490       17581 :                         if (ret) {
     491           0 :                                 btrfs_abort_transaction(trans, ret);
     492           0 :                                 break;
     493             :                         }
     494             : 
     495       17581 :                         del_nr = 0;
     496       17581 :                         del_slot = 0;
     497             : 
     498       17581 :                         btrfs_release_path(path);
     499       17581 :                         continue;
     500             :                 }
     501             : 
     502           0 :                 BUG();
     503             :         }
     504             : 
     505    22967714 :         if (!ret && del_nr > 0) {
     506             :                 /*
     507             :                  * Set path->slots[0] to first slot, so that after the delete
     508             :                  * if items are moved off from our leaf to its immediate left or
     509             :                  * right neighbor leaves, we end up with a correct and adjusted
     510             :                  * path->slots[0] for our insertion (if args->replace_extent).
     511             :                  */
     512     1579693 :                 path->slots[0] = del_slot;
     513     1579693 :                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
     514     1579697 :                 if (ret)
     515           0 :                         btrfs_abort_transaction(trans, ret);
     516             :         }
     517             : 
     518    22967718 :         leaf = path->nodes[0];
     519             :         /*
     520             :          * If btrfs_del_items() was called, it might have deleted a leaf, in
     521             :          * which case it unlocked our path, so check path->locks[0] matches a
     522             :          * write lock.
     523             :          */
     524    22967718 :         if (!ret && args->replace_extent &&
     525     4610048 :             path->locks[0] == BTRFS_WRITE_LOCK &&
     526     4192526 :             btrfs_leaf_free_space(leaf) >=
     527     4192548 :             sizeof(struct btrfs_item) + args->extent_item_size) {
     528             : 
     529     3916215 :                 key.objectid = ino;
     530     3916215 :                 key.type = BTRFS_EXTENT_DATA_KEY;
     531     3916215 :                 key.offset = args->start;
     532     3916215 :                 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
     533     2747848 :                         struct btrfs_key slot_key;
     534             : 
     535     2747848 :                         btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
     536     2747854 :                         if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
     537      229392 :                                 path->slots[0]++;
     538             :                 }
     539     3916221 :                 btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
     540     3916234 :                 args->extent_inserted = true;
     541             :         }
     542             : 
     543    22967759 :         if (!args->path)
     544      105195 :                 btrfs_free_path(path);
     545    22862564 :         else if (!args->extent_inserted)
     546    18946227 :                 btrfs_release_path(path);
     547     3916337 : out:
     548    22967715 :         args->drop_end = found ? min(args->end, last_end) : args->end;
     549             : 
     550    22967715 :         return ret;
     551             : }
     552             : 
     553      653151 : static int extent_mergeable(struct extent_buffer *leaf, int slot,
     554             :                             u64 objectid, u64 bytenr, u64 orig_offset,
     555             :                             u64 *start, u64 *end)
     556             : {
     557      653151 :         struct btrfs_file_extent_item *fi;
     558      653151 :         struct btrfs_key key;
     559      653151 :         u64 extent_end;
     560             : 
     561      653151 :         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
     562             :                 return 0;
     563             : 
     564      638562 :         btrfs_item_key_to_cpu(leaf, &key, slot);
     565      638563 :         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
     566             :                 return 0;
     567             : 
     568      597854 :         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
     569      872476 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
     570       68319 :             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
     571      136629 :             btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
     572       68310 :             btrfs_file_extent_compression(leaf, fi) ||
     573       68310 :             btrfs_file_extent_encryption(leaf, fi) ||
     574             :             btrfs_file_extent_other_encoding(leaf, fi))
     575      529545 :                 return 0;
     576             : 
     577       68310 :         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
     578       68310 :         if ((*start && *start != key.offset) || (*end && *end != extent_end))
     579             :                 return 0;
     580             : 
     581       60000 :         *start = key.offset;
     582       60000 :         *end = extent_end;
     583       60000 :         return 1;
     584             : }
     585             : 
     586             : /*
     587             :  * Mark the extent in the file range [start, end) as written.
     588             :  *
     589             :  * This changes the extent type from 'pre-allocated' to 'regular'. If only
     590             :  * part of the extent is marked as written, the extent is split into two
     591             :  * or three extent items.
     592             :  */
     593      291270 : int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
     594             :                               struct btrfs_inode *inode, u64 start, u64 end)
     595             : {
     596      291270 :         struct btrfs_fs_info *fs_info = trans->fs_info;
     597      291270 :         struct btrfs_root *root = inode->root;
     598      291270 :         struct extent_buffer *leaf;
     599      291270 :         struct btrfs_path *path;
     600      291270 :         struct btrfs_file_extent_item *fi;
     601      291270 :         struct btrfs_ref ref = { 0 };
     602      291270 :         struct btrfs_key key;
     603      291270 :         struct btrfs_key new_key;
     604      291270 :         u64 bytenr;
     605      291270 :         u64 num_bytes;
     606      291270 :         u64 extent_end;
     607      291270 :         u64 orig_offset;
     608      291270 :         u64 other_start;
     609      291270 :         u64 other_end;
     610      291270 :         u64 split;
     611      291270 :         int del_nr = 0;
     612      291270 :         int del_slot = 0;
     613      291270 :         int recow;
     614      291270 :         int ret = 0;
     615      291270 :         u64 ino = btrfs_ino(inode);
     616             : 
     617      291270 :         path = btrfs_alloc_path();
     618      291278 :         if (!path)
     619             :                 return -ENOMEM;
     620      291278 : again:
     621      291305 :         recow = 0;
     622      291305 :         split = start;
     623      291305 :         key.objectid = ino;
     624      291305 :         key.type = BTRFS_EXTENT_DATA_KEY;
     625      291305 :         key.offset = split;
     626             : 
     627      291305 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
     628      291312 :         if (ret < 0)
     629           0 :                 goto out;
     630      291312 :         if (ret > 0 && path->slots[0] > 0)
     631      139238 :                 path->slots[0]--;
     632             : 
     633      291312 :         leaf = path->nodes[0];
     634      291312 :         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     635      291312 :         if (key.objectid != ino ||
     636      291312 :             key.type != BTRFS_EXTENT_DATA_KEY) {
     637           0 :                 ret = -EINVAL;
     638           0 :                 btrfs_abort_transaction(trans, ret);
     639           0 :                 goto out;
     640             :         }
     641      291312 :         fi = btrfs_item_ptr(leaf, path->slots[0],
     642             :                             struct btrfs_file_extent_item);
     643      291312 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
     644           0 :                 ret = -EINVAL;
     645           0 :                 btrfs_abort_transaction(trans, ret);
     646           0 :                 goto out;
     647             :         }
     648      291311 :         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
     649      291311 :         if (key.offset > start || extent_end < end) {
     650           0 :                 ret = -EINVAL;
     651           0 :                 btrfs_abort_transaction(trans, ret);
     652           0 :                 goto out;
     653             :         }
     654             : 
     655      291311 :         bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
     656      291312 :         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
     657      291311 :         orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
     658      291312 :         memcpy(&new_key, &key, sizeof(new_key));
     659             : 
     660      291312 :         if (start == key.offset && end < extent_end) {
     661       70064 :                 other_start = 0;
     662       70064 :                 other_end = start;
     663       70064 :                 if (extent_mergeable(leaf, path->slots[0] - 1,
     664             :                                      ino, bytenr, orig_offset,
     665             :                                      &other_start, &other_end)) {
     666       33424 :                         new_key.offset = end;
     667       33424 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     668       33424 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     669             :                                             struct btrfs_file_extent_item);
     670       33424 :                         btrfs_set_file_extent_generation(leaf, fi,
     671             :                                                          trans->transid);
     672       33424 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     673             :                                                         extent_end - end);
     674       33424 :                         btrfs_set_file_extent_offset(leaf, fi,
     675             :                                                      end - orig_offset);
     676       33424 :                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     677             :                                             struct btrfs_file_extent_item);
     678       33424 :                         btrfs_set_file_extent_generation(leaf, fi,
     679             :                                                          trans->transid);
     680       33424 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     681             :                                                         end - other_start);
     682       33424 :                         btrfs_mark_buffer_dirty(leaf);
     683       33424 :                         goto out;
     684             :                 }
     685             :         }
     686             : 
     687      257888 :         if (start > key.offset && end == extent_end) {
     688       84742 :                 other_start = end;
     689       84742 :                 other_end = 0;
     690       84742 :                 if (extent_mergeable(leaf, path->slots[0] + 1,
     691             :                                      ino, bytenr, orig_offset,
     692             :                                      &other_start, &other_end)) {
     693        8692 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     694             :                                             struct btrfs_file_extent_item);
     695        8692 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     696        8692 :                                                         start - key.offset);
     697        8692 :                         btrfs_set_file_extent_generation(leaf, fi,
     698             :                                                          trans->transid);
     699        8692 :                         path->slots[0]++;
     700        8692 :                         new_key.offset = start;
     701        8692 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     702             : 
     703        8692 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     704             :                                             struct btrfs_file_extent_item);
     705        8692 :                         btrfs_set_file_extent_generation(leaf, fi,
     706             :                                                          trans->transid);
     707        8692 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     708             :                                                         other_end - start);
     709        8692 :                         btrfs_set_file_extent_offset(leaf, fi,
     710             :                                                      start - orig_offset);
     711        8692 :                         btrfs_mark_buffer_dirty(leaf);
     712        8692 :                         goto out;
     713             :                 }
     714             :         }
     715             : 
     716      470847 :         while (start > key.offset || end < extent_end) {
     717      221674 :                 if (key.offset == start)
     718       91126 :                         split = end;
     719             : 
     720      221674 :                 new_key.offset = split;
     721      221674 :                 ret = btrfs_duplicate_item(trans, root, path, &new_key);
     722      221671 :                 if (ret == -EAGAIN) {
     723          21 :                         btrfs_release_path(path);
     724          21 :                         goto again;
     725             :                 }
     726      221650 :                 if (ret < 0) {
     727           0 :                         btrfs_abort_transaction(trans, ret);
     728           0 :                         goto out;
     729             :                 }
     730             : 
     731      221650 :                 leaf = path->nodes[0];
     732      221650 :                 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     733             :                                     struct btrfs_file_extent_item);
     734      221651 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     735      221649 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     736      221649 :                                                 split - key.offset);
     737             : 
     738      221649 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     739             :                                     struct btrfs_file_extent_item);
     740             : 
     741      221649 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     742      221649 :                 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
     743      221648 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     744             :                                                 extent_end - split);
     745      221649 :                 btrfs_mark_buffer_dirty(leaf);
     746             : 
     747      221648 :                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
     748             :                                        num_bytes, 0);
     749      221648 :                 btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
     750             :                                     orig_offset, 0, false);
     751      221648 :                 ret = btrfs_inc_extent_ref(trans, &ref);
     752      221651 :                 if (ret) {
     753           0 :                         btrfs_abort_transaction(trans, ret);
     754           0 :                         goto out;
     755             :                 }
     756             : 
     757      221651 :                 if (split == start) {
     758      130535 :                         key.offset = start;
     759             :                 } else {
     760       91116 :                         if (start != key.offset) {
     761           0 :                                 ret = -EINVAL;
     762           0 :                                 btrfs_abort_transaction(trans, ret);
     763           0 :                                 goto out;
     764             :                         }
     765       91116 :                         path->slots[0]--;
     766       91116 :                         extent_end = end;
     767             :                 }
     768             :                 recow = 1;
     769             :         }
     770             : 
     771      249173 :         other_start = end;
     772      249173 :         other_end = 0;
     773      249173 :         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
     774             :                                num_bytes, 0);
     775      249173 :         btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
     776             :                             0, false);
     777      249173 :         if (extent_mergeable(leaf, path->slots[0] + 1,
     778             :                              ino, bytenr, orig_offset,
     779             :                              &other_start, &other_end)) {
     780        9478 :                 if (recow) {
     781           4 :                         btrfs_release_path(path);
     782           4 :                         goto again;
     783             :                 }
     784        9474 :                 extent_end = other_end;
     785        9474 :                 del_slot = path->slots[0] + 1;
     786        9474 :                 del_nr++;
     787        9474 :                 ret = btrfs_free_extent(trans, &ref);
     788        9474 :                 if (ret) {
     789           0 :                         btrfs_abort_transaction(trans, ret);
     790           0 :                         goto out;
     791             :                 }
     792             :         }
     793      249171 :         other_start = 0;
     794      249171 :         other_end = start;
     795      249171 :         if (extent_mergeable(leaf, path->slots[0] - 1,
     796             :                              ino, bytenr, orig_offset,
     797             :                              &other_start, &other_end)) {
     798        8406 :                 if (recow) {
     799           2 :                         btrfs_release_path(path);
     800           2 :                         goto again;
     801             :                 }
     802        8404 :                 key.offset = other_start;
     803        8404 :                 del_slot = path->slots[0];
     804        8404 :                 del_nr++;
     805        8404 :                 ret = btrfs_free_extent(trans, &ref);
     806        8404 :                 if (ret) {
     807           0 :                         btrfs_abort_transaction(trans, ret);
     808           0 :                         goto out;
     809             :                 }
     810             :         }
     811      249169 :         if (del_nr == 0) {
     812      234842 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     813             :                            struct btrfs_file_extent_item);
     814      234842 :                 btrfs_set_file_extent_type(leaf, fi,
     815             :                                            BTRFS_FILE_EXTENT_REG);
     816      234842 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     817      234842 :                 btrfs_mark_buffer_dirty(leaf);
     818             :         } else {
     819       14327 :                 fi = btrfs_item_ptr(leaf, del_slot - 1,
     820             :                            struct btrfs_file_extent_item);
     821       14327 :                 btrfs_set_file_extent_type(leaf, fi,
     822             :                                            BTRFS_FILE_EXTENT_REG);
     823       14327 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     824       14327 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     825       14327 :                                                 extent_end - key.offset);
     826       14327 :                 btrfs_mark_buffer_dirty(leaf);
     827             : 
     828       14327 :                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
     829       14327 :                 if (ret < 0) {
     830           0 :                         btrfs_abort_transaction(trans, ret);
     831           0 :                         goto out;
     832             :                 }
     833             :         }
     834       14327 : out:
     835      291284 :         btrfs_free_path(path);
     836      291284 :         return ret;
     837             : }
     838             : 
     839             : /*
     840             :  * on error we return an unlocked page and the error value
     841             :  * on success we return a locked page and 0
     842             :  */
     843    54725010 : static int prepare_uptodate_page(struct inode *inode,
     844             :                                  struct page *page, u64 pos,
     845             :                                  bool force_uptodate)
     846             : {
     847    54725010 :         struct folio *folio = page_folio(page);
     848    54725407 :         int ret = 0;
     849             : 
     850    74230089 :         if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
     851    19510683 :             !PageUptodate(page)) {
     852     2982326 :                 ret = btrfs_read_folio(NULL, folio);
     853     2990630 :                 if (ret)
     854             :                         return ret;
     855     2990497 :                 lock_page(page);
     856     2989174 :                 if (!PageUptodate(page)) {
     857           0 :                         unlock_page(page);
     858           0 :                         return -EIO;
     859             :                 }
     860             : 
     861             :                 /*
     862             :                  * Since btrfs_read_folio() will unlock the folio before it
     863             :                  * returns, there is a window where btrfs_release_folio() can be
     864             :                  * called to release the page.  Here we check both inode
     865             :                  * mapping and PagePrivate() to make sure the page was not
     866             :                  * released.
     867             :                  *
     868             :                  * The private flag check is essential for subpage as we need
     869             :                  * to store extra bitmap using page->private.
     870             :                  */
     871     5973607 :                 if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
     872           5 :                         unlock_page(page);
     873           5 :                         return -EAGAIN;
     874             :                 }
     875             :         }
     876             :         return 0;
     877             : }
     878             : 
     879             : static fgf_t get_prepare_fgp_flags(bool nowait)
     880             : {
     881    27410775 :         fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
     882             : 
     883    27410775 :         if (nowait)
     884           0 :                 fgp_flags |= FGP_NOWAIT;
     885             : 
     886    27410775 :         return fgp_flags;
     887             : }
     888             : 
     889             : static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
     890             : {
     891    27410775 :         gfp_t gfp;
     892             : 
     893    27410775 :         gfp = btrfs_alloc_write_mask(inode->i_mapping);
     894    27410775 :         if (nowait) {
     895           0 :                 gfp &= ~__GFP_DIRECT_RECLAIM;
     896           0 :                 gfp |= GFP_NOWAIT;
     897             :         }
     898             : 
     899    27410775 :         return gfp;
     900             : }
     901             : 
     902             : /*
     903             :  * this just gets pages into the page cache and locks them down.
     904             :  */
     905    27410775 : static noinline int prepare_pages(struct inode *inode, struct page **pages,
     906             :                                   size_t num_pages, loff_t pos,
     907             :                                   size_t write_bytes, bool force_uptodate,
     908             :                                   bool nowait)
     909             : {
     910    27410775 :         int i;
     911    27410775 :         unsigned long index = pos >> PAGE_SHIFT;
     912    27410775 :         gfp_t mask = get_prepare_gfp_flags(inode, nowait);
     913    27410775 :         fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
     914    27410775 :         int err = 0;
     915    27410775 :         int faili;
     916             : 
     917    97370282 :         for (i = 0; i < num_pages; i++) {
     918    70001370 : again:
     919    70001370 :                 pages[i] = pagecache_get_page(inode->i_mapping, index + i,
     920             :                                               fgp_flags, mask | __GFP_WRITE);
     921    69995803 :                 if (!pages[i]) {
     922           0 :                         faili = i - 1;
     923           0 :                         if (nowait)
     924             :                                 err = -EAGAIN;
     925             :                         else
     926           0 :                                 err = -ENOMEM;
     927           0 :                         goto fail;
     928             :                 }
     929             : 
     930    69995803 :                 err = set_page_extent_mapped(pages[i]);
     931    69994878 :                 if (err < 0) {
     932           0 :                         faili = i;
     933           0 :                         goto fail;
     934             :                 }
     935             : 
     936    69994878 :                 if (i == 0)
     937    27382676 :                         err = prepare_uptodate_page(inode, pages[i], pos,
     938             :                                                     force_uptodate);
     939    69979971 :                 if (!err && i == num_pages - 1)
     940    27383916 :                         err = prepare_uptodate_page(inode, pages[i],
     941             :                                                     pos + write_bytes, false);
     942    69987112 :                 if (err) {
     943           5 :                         put_page(pages[i]);
     944           5 :                         if (!nowait && err == -EAGAIN) {
     945           5 :                                 err = 0;
     946           5 :                                 goto again;
     947             :                         }
     948           0 :                         faili = i - 1;
     949           0 :                         goto fail;
     950             :                 }
     951    69987107 :                 wait_on_page_writeback(pages[i]);
     952             :         }
     953             : 
     954             :         return 0;
     955           0 : fail:
     956           0 :         while (faili >= 0) {
     957           0 :                 unlock_page(pages[faili]);
     958           0 :                 put_page(pages[faili]);
     959           0 :                 faili--;
     960             :         }
     961             :         return err;
     962             : 
     963             : }
     964             : 
     965             : /*
     966             :  * This function locks the extent and properly waits for data=ordered extents
     967             :  * to finish before allowing the pages to be modified, if needed.
     968             :  *
     969             :  * The return value:
     970             :  * 1 - the extent is locked
     971             :  * 0 - the extent is not locked, and everything is OK
     972             :  * -EAGAIN - the pages were unlocked and released; re-prepare them and retry
     973             :  * any other negative value - an error occurred
     974             :  */
     975             : static noinline int
     976    27375495 : lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
     977             :                                 size_t num_pages, loff_t pos,
     978             :                                 size_t write_bytes,
     979             :                                 u64 *lockstart, u64 *lockend, bool nowait,
     980             :                                 struct extent_state **cached_state)
     981             : {
     982    27375495 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     983    27375495 :         u64 start_pos;
     984    27375495 :         u64 last_pos;
     985    27375495 :         int i;
     986    27375495 :         int ret = 0;
     987             : 
     988    27375495 :         start_pos = round_down(pos, fs_info->sectorsize);
     989    27375495 :         last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
     990             : 
     991    27375495 :         if (start_pos < inode->vfs_inode.i_size) {
     992    14526993 :                 struct btrfs_ordered_extent *ordered;
     993             : 
     994    14526993 :                 if (nowait) {
     995           0 :                         if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
     996             :                                              cached_state)) {
     997           0 :                                 for (i = 0; i < num_pages; i++) {
     998           0 :                                         unlock_page(pages[i]);
     999           0 :                                         put_page(pages[i]);
    1000           0 :                                         pages[i] = NULL;
    1001             :                                 }
    1002             : 
    1003             :                                 return -EAGAIN;
    1004             :                         }
    1005             :                 } else {
    1006    14526993 :                         lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
    1007             :                 }
    1008             : 
    1009    14494895 :                 ordered = btrfs_lookup_ordered_range(inode, start_pos,
    1010    14494895 :                                                      last_pos - start_pos + 1);
    1011    14495194 :                 if (ordered &&
    1012        4642 :                     ordered->file_offset + ordered->num_bytes > start_pos &&
    1013             :                     ordered->file_offset <= last_pos) {
    1014        4642 :                         unlock_extent(&inode->io_tree, start_pos, last_pos,
    1015             :                                       cached_state);
    1016      424379 :                         for (i = 0; i < num_pages; i++) {
    1017      415095 :                                 unlock_page(pages[i]);
    1018      415095 :                                 put_page(pages[i]);
    1019             :                         }
    1020        4642 :                         btrfs_start_ordered_extent(ordered);
    1021        4642 :                         btrfs_put_ordered_extent(ordered);
    1022        4642 :                         return -EAGAIN;
    1023             :                 }
    1024    14490552 :                 if (ordered)
    1025           0 :                         btrfs_put_ordered_extent(ordered);
    1026             : 
    1027    14485583 :                 *lockstart = start_pos;
    1028    14485583 :                 *lockend = last_pos;
    1029    14485583 :                 ret = 1;
    1030             :         }
    1031             : 
    1032             :         /*
    1033             :          * We should be called after prepare_pages() which should have locked
    1034             :          * all pages in the range.
    1035             :          */
    1036    96847478 :         for (i = 0; i < num_pages; i++)
    1037   139023042 :                 WARN_ON(!PageLocked(pages[i]));
    1038             : 
    1039             :         return ret;
    1040             : }
    1041             : 
    1042             : /*
    1043             :  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
    1044             :  *
                        * @inode:       Inode being written to.
    1045             :  * @pos:         File offset.
    1046             :  * @write_bytes: The length to write, will be updated to the nocow writeable
    1047             :  *               range.
                        * @nowait:      If true, the range is locked with
                        *               btrfs_try_lock_ordered_range() instead of
                        *               btrfs_lock_and_flush_ordered_range(), and -EAGAIN is
                        *               returned if that attempt fails.
    1048             :  *
    1049             :  * This function will flush ordered extents in the range to ensure proper
    1050             :  * nocow checks.
    1051             :  *
    1052             :  * Return:
    1053             :  * > 0          If we can nocow, and updates @write_bytes.
    1054             :  *  0           If we can't do a nocow write.
    1055             :  * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
    1056             :  *              root is in progress, or because @nowait is set and the range
                        *              could not be locked by btrfs_try_lock_ordered_range().
    1057             :  * < 0          If an error happened.
    1058             :  *
    1059             :  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
    1060             :  */
    1061       68077 : int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
    1062             :                            size_t *write_bytes, bool nowait)
    1063             : {
    1064       68077 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1065       68077 :         struct btrfs_root *root = inode->root;
    1066       68077 :         struct extent_state *cached_state = NULL;
    1067       68077 :         u64 lockstart, lockend;
    1068       68077 :         u64 num_bytes;
    1069       68077 :         int ret;
    1070             : 
                               /* Only inodes flagged NODATACOW or PREALLOC are considered at all. */
    1071       68077 :         if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
    1072             :                 return 0;
    1073             : 
                               /*
                                * Taken with a try-lock; on success it stays held until one of the
                                * btrfs_drew_write_unlock() calls below, or, when we return > 0,
                                * until the caller invokes btrfs_check_nocow_unlock().
                                */
    1074        7429 :         if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
    1075             :                 return -EAGAIN;
    1076             : 
                               /* Widen the byte range to sector boundaries; lockend is inclusive. */
    1077        7429 :         lockstart = round_down(pos, fs_info->sectorsize);
    1078        7429 :         lockend = round_up(pos + *write_bytes,
    1079             :                            fs_info->sectorsize) - 1;
    1080        7429 :         num_bytes = lockend - lockstart + 1;
    1081             : 
    1082        7429 :         if (nowait) {
    1083           0 :                 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
    1084             :                                                   &cached_state)) {
    1085           0 :                         btrfs_drew_write_unlock(&root->snapshot_lock);
    1086           0 :                         return -EAGAIN;
    1087             :                 }
    1088             :         } else {
    1089        7429 :                 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
    1090             :                                                    &cached_state);
    1091             :         }
    1092        7427 :         ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
    1093             :                         NULL, NULL, NULL, nowait, false);
                               /*
                                * On failure or "cannot nocow" drop the snapshot lock here. Otherwise
                                * clamp *write_bytes: num_bytes is counted from lockstart, so the
                                * (pos - lockstart) bytes in front of @pos are subtracted.
                                * NOTE(review): this presumes can_nocow_extent() writes the usable
                                * length back through &num_bytes - confirm against its definition.
                                */
    1094        7429 :         if (ret <= 0)
    1095        2072 :                 btrfs_drew_write_unlock(&root->snapshot_lock);
    1096             :         else
    1097        5357 :                 *write_bytes = min_t(size_t, *write_bytes ,
    1098             :                                      num_bytes - pos + lockstart);
                               /* The extent range lock is released on every path that reaches here. */
    1099        7428 :         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    1100             : 
    1101        7428 :         return ret;
    1102             : }
    1103             : 
                       /*
                        * Release the write side of the snapshot_lock of @inode's root.
                        *
                        * Per the note on btrfs_check_nocow_lock(), callers invoke this after that
                        * function returned > 0.
                        */
    1104          48 : void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
    1105             : {
    1106          48 :         btrfs_drew_write_unlock(&inode->root->snapshot_lock);
    1107        5309 : }
    1108             : 
                       /*
                        * Refresh the timestamps and version of @inode ahead of a write.
                        *
                        * Does nothing when IS_NOCMTIME(inode) is set. Otherwise i_mtime and i_ctime
                        * are each set to current_time(inode) if they differ from it, and the inode
                        * version is incremented when IS_I_VERSION(inode) is set. This function
                        * itself only assigns these fields on the struct inode.
                        */
    1109    27115548 : static void update_time_for_write(struct inode *inode)
    1110             : {
    1111    27115548 :         struct timespec64 now;
    1112             : 
    1113    27115548 :         if (IS_NOCMTIME(inode))
    1114             :                 return;
    1115             : 
                               /* Sample the time once so mtime and ctime get the same value. */
    1116    27115733 :         now = current_time(inode);
    1117    27148066 :         if (!timespec64_equal(&inode->i_mtime, &now))
    1118      816205 :                 inode->i_mtime = now;
    1119             : 
    1120    27148066 :         if (!timespec64_equal(&inode->i_ctime, &now))
    1121      804419 :                 inode->i_ctime = now;
    1122             : 
    1123    27148066 :         if (IS_I_VERSION(inode))
    1124    27161817 :                 inode_inc_iversion(inode);
    1125             : }
    1126             : 
                       /*
                        * Checks and preparation shared by the buffered and direct write paths.
                        *
                        * @iocb:  The write request; ki_filp, ki_pos and ki_flags are read.
                        * @from:  The data source iterator (not referenced in this function body).
                        * @count: Number of bytes the caller intends to write at iocb->ki_pos.
                        *
                        * In order: rejects IOCB_NOWAIT writes on inodes without the NODATACOW or
                        * PREALLOC flag, calls file_remove_privs(), updates the inode times, and, if
                        * the sector-aligned start of the write lies beyond the current i_size, calls
                        * btrfs_cont_expand() from the old size up to the sector-aligned end of the
                        * write.
                        *
                        * Return: 0 on success, -EAGAIN for the NOWAIT rejection, or the non-zero
                        * value returned by file_remove_privs() or btrfs_cont_expand().
                        */
    1127    27146276 : static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
    1128             :                              size_t count)
    1129             : {
    1130    27146276 :         struct file *file = iocb->ki_filp;
    1131    27146276 :         struct inode *inode = file_inode(file);
    1132    27146276 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1133    27146276 :         loff_t pos = iocb->ki_pos;
    1134    27146276 :         int ret;
    1135    27146276 :         loff_t oldsize;
    1136    27146276 :         loff_t start_pos;
    1137             : 
    1138             :         /*
    1139             :          * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
    1140             :          * prealloc flags, as without those flags we always have to COW. We will
    1141             :          * later check if we can really COW into the target range (using
    1142             :          * can_nocow_extent() at btrfs_get_blocks_direct_write()).
    1143             :          */
    1144    27146276 :         if ((iocb->ki_flags & IOCB_NOWAIT) &&
    1145           7 :             !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
    1146             :                 return -EAGAIN;
    1147             : 
    1148    27146274 :         ret = file_remove_privs(file);
    1149    27128291 :         if (ret)
    1150             :                 return ret;
    1151             : 
    1152             :         /*
    1153             :          * We reserve space for updating the inode when we reserve space for the
    1154             :          * extent we are going to write, so we will enospc out there.  We don't
    1155             :          * need to start yet another transaction to update the inode as we will
    1156             :          * update the inode when we finish writing whatever data we write.
    1157             :          */
    1158    27128592 :         update_time_for_write(inode);
    1159             : 
                               /*
                                * Compare the sector-aligned start with i_size: a write starting in
                                * the same sector as EOF does not take the expand path below.
                                */
    1160    27160634 :         start_pos = round_down(pos, fs_info->sectorsize);
    1161    27160634 :         oldsize = i_size_read(inode);
    1162    27160634 :         if (start_pos > oldsize) {
    1163             :                 /* Expand hole size to cover write data, preventing empty gap */
    1164     1388760 :                 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
    1165             : 
    1166     1388760 :                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
    1167     1394587 :                 if (ret)
    1168        1412 :                         return ret;
    1169             :         }
    1170             : 
    1171             :         return 0;
    1172             : }
    1173             : 
                       /*
                        * Buffered write: copy the data described by @i into the page cache of the
                        * file behind @iocb, starting at iocb->ki_pos.
                        *
                        * The inode lock is taken for the whole call (as a try-lock when IOCB_NOWAIT
                        * is set). After generic_write_checks() and btrfs_write_check(), the data is
                        * processed in batches of at most nrptrs pages. For each batch the loop
                        * reserves data space (or, if that fails, falls back to a metadata-only
                        * reservation when btrfs_check_nocow_lock() allows it), reserves delalloc
                        * metadata, prepares and locks the pages, optionally locks the extent range,
                        * copies from the iterator, releases whatever part of the reservation was
                        * not dirtied, and hands the copied range to btrfs_dirty_pages().
                        *
                        * On exit iocb->ki_pos is advanced by the number of bytes written.
                        *
                        * Return: the number of bytes written if that is non-zero, otherwise the
                        * last status stored in ret (an error, or the <= 0 result of
                        * generic_write_checks()).
                        */
    1174    26122198 : static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
    1175             :                                                struct iov_iter *i)
    1176             : {
    1177    26122198 :         struct file *file = iocb->ki_filp;
    1178    26122198 :         loff_t pos;
    1179    26122198 :         struct inode *inode = file_inode(file);
    1180    26122198 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1181    26122198 :         struct page **pages = NULL;
    1182    26122198 :         struct extent_changeset *data_reserved = NULL;
    1183    26122198 :         u64 release_bytes = 0;
    1184    26122198 :         u64 lockstart;
    1185    26122198 :         u64 lockend;
    1186    26122198 :         size_t num_written = 0;
    1187    26122198 :         int nrptrs;
    1188    26122198 :         ssize_t ret;
    1189    26122198 :         bool only_release_metadata = false;
    1190    26122198 :         bool force_page_uptodate = false;
    1191    26122198 :         loff_t old_isize = i_size_read(inode);
    1192    26122198 :         unsigned int ilock_flags = 0;
    1193    26122198 :         const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
    1194    26122198 :         unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
    1195             : 
    1196    26122198 :         if (nowait)
    1197           0 :                 ilock_flags |= BTRFS_ILOCK_TRY;
    1198             : 
    1199    26122198 :         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
    1200    26077265 :         if (ret < 0)
    1201             :                 return ret;
    1202             : 
    1203    26077265 :         ret = generic_write_checks(iocb, i);
    1204    26101646 :         if (ret <= 0)
    1205          15 :                 goto out;
    1206             : 
    1207    26101631 :         ret = btrfs_write_check(iocb, i, ret);
    1208    26121402 :         if (ret < 0)
    1209        1040 :                 goto out;
    1210             : 
    1211    26120362 :         pos = iocb->ki_pos;
                               /*
                                * Size the page pointer array: enough pages for the whole request,
                                * but no more pointers than fit in one page and no more than
                                * current->nr_dirtied_pause - current->nr_dirtied, with a floor of 8.
                                */
    1212    26120362 :         nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
    1213             :                         PAGE_SIZE / (sizeof(struct page *)));
    1214    26120362 :         nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
    1215    26120362 :         nrptrs = max(nrptrs, 8);
    1216    26120362 :         pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
    1217    26054407 :         if (!pages) {
    1218           0 :                 ret = -ENOMEM;
    1219           0 :                 goto out;
    1220             :         }
    1221             : 
    1222    53490456 :         while (iov_iter_count(i) > 0) {
    1223    27395928 :                 struct extent_state *cached_state = NULL;
    1224    27395928 :                 size_t offset = offset_in_page(pos);
    1225    27395928 :                 size_t sector_offset;
    1226    27395928 :                 size_t write_bytes = min(iov_iter_count(i),
    1227             :                                          nrptrs * (size_t)PAGE_SIZE -
    1228             :                                          offset);
    1229    27395928 :                 size_t num_pages;
    1230    27395928 :                 size_t reserve_bytes;
    1231    27395928 :                 size_t dirty_pages;
    1232    27395928 :                 size_t copied;
    1233    27395928 :                 size_t dirty_sectors;
    1234    27395928 :                 size_t num_sectors;
    1235    27395928 :                 int extents_locked;
    1236             : 
    1237             :                 /*
    1238             :                  * Fault pages before locking them in prepare_pages
    1239             :                  * to avoid recursive lock
    1240             :                  */
    1241    27395928 :                 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
    1242             :                         ret = -EFAULT;
    1243       61255 :                         break;
    1244             :                 }
    1245             : 
    1246    27415401 :                 only_release_metadata = false;
    1247    27415401 :                 sector_offset = pos & (fs_info->sectorsize - 1);
    1248             : 
    1249    27415401 :                 extent_changeset_release(data_reserved);
    1250    27415401 :                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
    1251             :                                                   &data_reserved, pos,
    1252             :                                                   write_bytes, nowait);
    1253    27501623 :                 if (ret < 0) {
    1254       66363 :                         int can_nocow;
    1255             : 
    1256       66363 :                         if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
    1257             :                                 ret = -EAGAIN;
    1258             :                                 break;
    1259             :                         }
    1260             : 
    1261             :                         /*
    1262             :                          * If we don't have to COW at the offset, reserve
    1263             :                          * metadata only. write_bytes may get smaller than
    1264             :                          * requested here.
    1265             :                          */
    1266       66363 :                         can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
    1267             :                                                            &write_bytes, nowait);
                                               /*
                                                * can_nocow < 0 replaces the reservation error, > 0
                                                * clears it, and == 0 leaves the original error in
                                                * ret so the loop stops with it.
                                                */
    1268       66384 :                         if (can_nocow < 0)
    1269           3 :                                 ret = can_nocow;
    1270       66384 :                         if (can_nocow > 0)
    1271             :                                 ret = 0;
    1272             :                         if (ret)
    1273             :                                 break;
    1274             :                         only_release_metadata = true;
    1275             :                 }
    1276             : 
    1277    27440572 :                 num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
    1278    27440572 :                 WARN_ON(num_pages > nrptrs);
    1279    27440572 :                 reserve_bytes = round_up(write_bytes + sector_offset,
    1280             :                                          fs_info->sectorsize);
    1281    27440572 :                 WARN_ON(reserve_bytes == 0);
    1282    27440572 :                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
    1283             :                                                       reserve_bytes,
    1284             :                                                       reserve_bytes, nowait);
    1285    27417651 :                 if (ret) {
    1286         183 :                         if (!only_release_metadata)
    1287         156 :                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
    1288             :                                                 data_reserved, pos,
    1289             :                                                 write_bytes);
    1290             :                         else
    1291          27 :                                 btrfs_check_nocow_unlock(BTRFS_I(inode));
    1292             : 
    1293         183 :                         if (nowait && ret == -ENOSPC)
    1294           0 :                                 ret = -EAGAIN;
    1295             :                         break;
    1296             :                 }
    1297             : 
    1298             :                 release_bytes = reserve_bytes;
                       /*
                        * Retry point: reached again when lock_and_cleanup_extent_if_need() returns
                        * -EAGAIN and this is not a nowait write.
                        */
    1299    27422110 : again:
    1300    27422110 :                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
    1301    27408159 :                 if (ret) {
    1302           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
    1303           0 :                         break;
    1304             :                 }
    1305             : 
    1306             :                 /*
    1307             :                  * This is going to setup the pages array with the number of
    1308             :                  * pages we want, so we don't really need to worry about the
    1309             :                  * contents of pages from loop to loop
    1310             :                  */
    1311    27408159 :                 ret = prepare_pages(inode, pages, num_pages,
    1312             :                                     pos, write_bytes, force_page_uptodate, false);
    1313    27374373 :                 if (ret) {
    1314           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode),
    1315             :                                                        reserve_bytes);
    1316           0 :                         break;
    1317             :                 }
    1318             : 
    1319    27374373 :                 extents_locked = lock_and_cleanup_extent_if_need(
    1320             :                                 BTRFS_I(inode), pages,
    1321             :                                 num_pages, pos, write_bytes, &lockstart,
    1322             :                                 &lockend, nowait, &cached_state);
    1323    27342648 :                 if (extents_locked < 0) {
    1324        4642 :                         if (!nowait && extents_locked == -EAGAIN)
    1325        4642 :                                 goto again;
    1326             : 
    1327           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode),
    1328             :                                                        reserve_bytes);
    1329           0 :                         ret = extents_locked;
    1330           0 :                         break;
    1331             :                 }
    1332             : 
    1333    27338006 :                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
    1334             : 
                                       /* Sector counts for what was reserved vs. what was copied. */
    1335    27375521 :                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
    1336    27375521 :                 dirty_sectors = round_up(copied + sector_offset,
    1337             :                                         fs_info->sectorsize);
    1338    27375521 :                 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
    1339             : 
    1340             :                 /*
    1341             :                  * if we have trouble faulting in the pages, fall
    1342             :                  * back to one page at a time
    1343             :                  */
    1344    27375521 :                 if (copied < write_bytes)
    1345           0 :                         nrptrs = 1;
    1346             : 
    1347    27375521 :                 if (copied == 0) {
    1348             :                         force_page_uptodate = true;
    1349             :                         dirty_sectors = 0;
    1350             :                         dirty_pages = 0;
    1351             :                 } else {
    1352    27372750 :                         force_page_uptodate = false;
    1353    27372750 :                         dirty_pages = DIV_ROUND_UP(copied + offset,
    1354             :                                                    PAGE_SIZE);
    1355             :                 }
    1356             : 
    1357    27375521 :                 if (num_sectors > dirty_sectors) {
    1358             :                         /* release everything except the sectors we dirtied */
    1359           0 :                         release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
    1360           0 :                         if (only_release_metadata) {
    1361           0 :                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
    1362             :                                                         release_bytes, true);
    1363             :                         } else {
    1364           0 :                                 u64 __pos;
    1365             : 
    1366           0 :                                 __pos = round_down(pos,
    1367             :                                                    fs_info->sectorsize) +
    1368           0 :                                         (dirty_pages << PAGE_SHIFT);
    1369           0 :                                 btrfs_delalloc_release_space(BTRFS_I(inode),
    1370             :                                                 data_reserved, __pos,
    1371             :                                                 release_bytes, true);
    1372             :                         }
    1373             :                 }
    1374             : 
                                       /*
                                        * From here on release_bytes covers only the sectors that
                                        * were copied into; it is consumed by the post-loop cleanup
                                        * if btrfs_dirty_pages() fails.
                                        */
    1375    27375521 :                 release_bytes = round_up(copied + sector_offset,
    1376             :                                         fs_info->sectorsize);
    1377             : 
    1378    27375521 :                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
    1379             :                                         dirty_pages, pos, copied,
    1380             :                                         &cached_state, only_release_metadata);
    1381             : 
    1382             :                 /*
    1383             :                  * If we have not locked the extent range, because the range's
    1384             :                  * start offset is >= i_size, we might still have a non-NULL
    1385             :                  * cached extent state, acquired while marking the extent range
    1386             :                  * as delalloc through btrfs_dirty_pages(). Therefore free any
    1387             :                  * possible cached extent state to avoid a memory leak.
    1388             :                  */
    1389    27387411 :                 if (extents_locked)
    1390    14526645 :                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
    1391             :                                       lockend, &cached_state);
    1392             :                 else
    1393    12860766 :                         free_extent_state(cached_state);
    1394             : 
    1395    27412956 :                 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
    1396    27438583 :                 if (ret) {
    1397           0 :                         btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
    1398           0 :                         break;
    1399             :                 }
    1400             : 
    1401    27438583 :                 release_bytes = 0;
    1402    27438583 :                 if (only_release_metadata)
    1403        5282 :                         btrfs_check_nocow_unlock(BTRFS_I(inode));
    1404             : 
    1405    27438583 :                 btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
    1406             : 
    1407    27435042 :                 cond_resched();
    1408             : 
    1409    27436049 :                 pos += copied;
    1410    27436049 :                 num_written += copied;
    1411             :         }
    1412             : 
    1413    26155783 :         kfree(pages);
    1414             : 
                               /*
                                * A non-zero release_bytes here means the loop was left with a
                                * reservation still outstanding; return it, and drop the snapshot
                                * lock if this was a metadata-only (nocow) reservation.
                                */
    1415    26154753 :         if (release_bytes) {
    1416           0 :                 if (only_release_metadata) {
    1417           0 :                         btrfs_check_nocow_unlock(BTRFS_I(inode));
    1418           0 :                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
    1419             :                                         release_bytes, true);
    1420             :                 } else {
    1421           0 :                         btrfs_delalloc_release_space(BTRFS_I(inode),
    1422             :                                         data_reserved,
    1423           0 :                                         round_down(pos, fs_info->sectorsize),
    1424             :                                         release_bytes, true);
    1425             :                 }
    1426             :         }
    1427             : 
    1428    26154753 :         extent_changeset_free(data_reserved);
    1429    26137007 :         if (num_written > 0) {
    1430    26075952 :                 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
    1431    26063498 :                 iocb->ki_pos += num_written;
    1432             :         }
    1433       61055 : out:
    1434    26125608 :         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                               /* Partial progress wins over a later error. */
    1435    26150424 :         return num_written ? num_written : ret;
    1436             : }
    1437             : 
                       /*
                        * Check whether a direct IO request is aligned to the filesystem sector size.
                        *
                        * @fs_info: Filesystem info; only sectorsize is read.
                        * @iter:    The IO iterator whose alignment is tested.
                        * @offset:  File offset of the request.
                        *
                        * Return: -EINVAL if @offset or iov_iter_alignment(@iter) has any bit set
                        * below the sector size, 0 otherwise.
                        */
    1438     5018007 : static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
    1439             :                                const struct iov_iter *iter, loff_t offset)
    1440             : {
    1441     5018007 :         const u32 blocksize_mask = fs_info->sectorsize - 1;
    1442             : 
    1443     5018007 :         if (offset & blocksize_mask)
    1444             :                 return -EINVAL;
    1445             : 
    1446     4735992 :         if (iov_iter_alignment(iter) & blocksize_mask)
    1447     2705550 :                 return -EINVAL;
    1448             : 
    1449             :         return 0;
    1450             : }
    1451             : 
    1452     1051329 : static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
    1453             : {
    1454     1051329 :         struct file *file = iocb->ki_filp;
    1455     1051329 :         struct inode *inode = file_inode(file);
    1456     1051329 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1457     1051329 :         loff_t pos;
    1458     1051329 :         ssize_t written = 0;
    1459     1051329 :         ssize_t written_buffered;
    1460     1051329 :         size_t prev_left = 0;
    1461     1051329 :         loff_t endbyte;
    1462     1051329 :         ssize_t err;
    1463     1051329 :         unsigned int ilock_flags = 0;
    1464     1051329 :         struct iomap_dio *dio;
    1465             : 
    1466     1051329 :         if (iocb->ki_flags & IOCB_NOWAIT)
    1467           7 :                 ilock_flags |= BTRFS_ILOCK_TRY;
    1468             : 
    1469             :         /* If the write DIO is within EOF, use a shared lock */
    1470     1051329 :         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
    1471      761259 :                 ilock_flags |= BTRFS_ILOCK_SHARED;
    1472             : 
    1473      290070 : relock:
    1474     1051352 :         err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
    1475     1051551 :         if (err < 0)
    1476           0 :                 return err;
    1477             : 
    1478     1051551 :         err = generic_write_checks(iocb, from);
    1479     1051692 :         if (err <= 0) {
    1480           1 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1481           1 :                 return err;
    1482             :         }
    1483             : 
    1484     1051691 :         err = btrfs_write_check(iocb, from, err);
    1485     1051719 :         if (err < 0) {
    1486         375 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1487         375 :                 goto out;
    1488             :         }
    1489             : 
    1490     1051344 :         pos = iocb->ki_pos;
    1491             :         /*
    1492             :          * Re-check since file size may have changed just before taking the
    1493             :          * lock or pos may have changed because of O_APPEND in generic_write_checks()
    1494             :          */
    1495     1051344 :         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
    1496      761607 :             pos + iov_iter_count(from) > i_size_read(inode)) {
    1497          12 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1498          12 :                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
    1499          12 :                 goto relock;
    1500             :         }
    1501             : 
    1502     1051332 :         if (check_direct_IO(fs_info, from, pos)) {
    1503      345562 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1504      345684 :                 goto buffered;
    1505             :         }
    1506             : 
    1507             :         /*
    1508             :          * The iov_iter can be mapped to the same file range we are writing to.
    1509             :          * If that's the case, then we will deadlock in the iomap code, because
    1510             :          * it first calls our callback btrfs_dio_iomap_begin(), which will create
    1511             :          * an ordered extent, and after that it will fault in the pages that the
    1512             :          * iov_iter refers to. During the fault in we end up in the readahead
    1513             :          * pages code (starting at btrfs_readahead()), which will lock the range,
    1514             :          * find that ordered extent and then wait for it to complete (at
    1515             :          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
    1516             :          * obviously the ordered extent can never complete as we didn't submit
    1517             :          * yet the respective bio(s). This always happens when the buffer is
    1518             :          * memory mapped to the same file range, since the iomap DIO code always
    1519             :          * invalidates pages in the target file range (after starting and waiting
    1520             :          * for any writeback).
    1521             :          *
    1522             :          * So here we disable page faults in the iov_iter and then retry if we
    1523             :          * got -EFAULT, faulting in the pages before the retry.
    1524             :          */
    1525      705573 :         from->nofault = true;
    1526      705573 :         dio = btrfs_dio_write(iocb, from, written);
    1527      705757 :         from->nofault = false;
    1528             : 
    1529             :         /*
    1530             :          * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
    1531             :          * iocb, and that needs to lock the inode. So unlock it before calling
    1532             :          * iomap_dio_complete() to avoid a deadlock.
    1533             :          */
    1534      705757 :         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1535             : 
    1536     1411370 :         if (IS_ERR_OR_NULL(dio))
    1537      309926 :                 err = PTR_ERR_OR_ZERO(dio);
    1538             :         else
    1539      550724 :                 err = iomap_dio_complete(dio);
    1540             : 
    1541             :         /* No increment (+=) because iomap returns a cumulative value. */
    1542      705670 :         if (err > 0)
    1543      527713 :                 written = err;
    1544             : 
    1545      705670 :         if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
    1546          74 :                 const size_t left = iov_iter_count(from);
    1547             :                 /*
    1548             :                  * We have more data left to write. Try to fault in as many as
    1549             :                  * possible of the remainder pages and retry. We do this without
    1550             :                  * releasing and locking again the inode, to prevent races with
    1551             :                  * truncate.
    1552             :                  *
    1553             :                  * Also, in case the iov refers to pages in the file range of the
    1554             :                  * file we want to write to (due to a mmap), we could enter an
    1555             :                  * infinite loop if we retry after faulting the pages in, since
    1556             :                  * iomap will invalidate any pages in the range early on, before
    1557             :                  * it tries to fault in the pages of the iov. So we keep track of
    1558             :                  * how much was left of iov in the previous EFAULT and fallback
    1559             :                  * to buffered IO in case we haven't made any progress.
    1560             :                  */
    1561          74 :                 if (left == prev_left) {
    1562             :                         err = -ENOTBLK;
    1563             :                 } else {
    1564          73 :                         fault_in_iov_iter_readable(from, left);
    1565          11 :                         prev_left = left;
    1566          11 :                         goto relock;
    1567             :                 }
    1568             :         }
    1569             : 
    1570             :         /*
    1571             :          * If 'err' is -ENOTBLK or we have not written all data, then it means
    1572             :          * we must fallback to buffered IO.
    1573             :          */
    1574      705597 :         if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
    1575      705030 :                 goto out;
    1576             : 
    1577         567 : buffered:
    1578             :         /*
    1579             :          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
    1580             :          * it must retry the operation in a context where blocking is acceptable,
    1581             :          * because even if we end up not blocking during the buffered IO attempt
    1582             :          * below, we will block when flushing and waiting for the IO.
    1583             :          */
    1584      346251 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    1585           0 :                 err = -EAGAIN;
    1586           0 :                 goto out;
    1587             :         }
    1588             : 
    1589      346251 :         pos = iocb->ki_pos;
    1590      346251 :         written_buffered = btrfs_buffered_write(iocb, from);
    1591      346484 :         if (written_buffered < 0) {
    1592           0 :                 err = written_buffered;
    1593           0 :                 goto out;
    1594             :         }
    1595             :         /*
    1596             :          * Ensure all data is persisted. We want the next direct IO read to be
    1597             :          * able to read what was just written.
    1598             :          */
    1599      346484 :         endbyte = pos + written_buffered - 1;
    1600      346484 :         err = btrfs_fdatawrite_range(inode, pos, endbyte);
    1601      346523 :         if (err)
    1602           0 :                 goto out;
    1603      346523 :         err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
    1604      346452 :         if (err)
    1605          10 :                 goto out;
    1606      346442 :         written += written_buffered;
    1607      346442 :         iocb->ki_pos = pos + written_buffered;
    1608      346442 :         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
    1609      346442 :                                  endbyte >> PAGE_SHIFT);
    1610     1051844 : out:
    1611     1051844 :         return err < 0 ? err : written;
    1612             : }
    1613             : 
    1614           0 : static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
    1615             :                         const struct btrfs_ioctl_encoded_io_args *encoded)
    1616             : {
    1617           0 :         struct file *file = iocb->ki_filp;
    1618           0 :         struct inode *inode = file_inode(file);
    1619           0 :         loff_t count;
    1620           0 :         ssize_t ret;
    1621             : 
    1622           0 :         btrfs_inode_lock(BTRFS_I(inode), 0);
    1623           0 :         count = encoded->len;
    1624           0 :         ret = generic_write_checks_count(iocb, &count);
    1625           0 :         if (ret == 0 && count != encoded->len) {
    1626             :                 /*
    1627             :                  * The write got truncated by generic_write_checks_count(). We
    1628             :                  * can't do a partial encoded write.
    1629             :                  */
    1630             :                 ret = -EFBIG;
    1631             :         }
    1632           0 :         if (ret || encoded->len == 0)
    1633           0 :                 goto out;
    1634             : 
    1635           0 :         ret = btrfs_write_check(iocb, from, encoded->len);
    1636           0 :         if (ret < 0)
    1637           0 :                 goto out;
    1638             : 
    1639           0 :         ret = btrfs_do_encoded_write(iocb, from, encoded);
    1640           0 : out:
    1641           0 :         btrfs_inode_unlock(BTRFS_I(inode), 0);
    1642           0 :         return ret;
    1643             : }
    1644             : 
    1645    26776909 : ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
    1646             :                             const struct btrfs_ioctl_encoded_io_args *encoded)
    1647             : {
    1648    26776909 :         struct file *file = iocb->ki_filp;
    1649    26776909 :         struct btrfs_inode *inode = BTRFS_I(file_inode(file));
    1650    26776909 :         ssize_t num_written, num_sync;
    1651             : 
    1652             :         /*
    1653             :          * If the fs flips readonly due to some impossible error, although we
    1654             :          * have opened a file as writable, we have to stop this write operation
    1655             :          * to ensure consistency.
    1656             :          */
    1657    26776909 :         if (BTRFS_FS_ERROR(inode->root->fs_info))
    1658             :                 return -EROFS;
    1659             : 
    1660    26776909 :         if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
    1661             :                 return -EOPNOTSUPP;
    1662             : 
    1663    26776909 :         if (encoded) {
    1664           0 :                 num_written = btrfs_encoded_write(iocb, from, encoded);
    1665           0 :                 num_sync = encoded->len;
    1666    26776909 :         } else if (iocb->ki_flags & IOCB_DIRECT) {
    1667     1051313 :                 num_written = btrfs_direct_write(iocb, from);
    1668     1051313 :                 num_sync = num_written;
    1669             :         } else {
    1670    25725596 :                 num_written = btrfs_buffered_write(iocb, from);
    1671    25725596 :                 num_sync = num_written;
    1672             :         }
    1673             : 
    1674    26842734 :         btrfs_set_inode_last_sub_trans(inode);
    1675             : 
    1676    26843257 :         if (num_sync > 0) {
    1677    26602381 :                 num_sync = generic_write_sync(iocb, num_sync);
    1678    26590907 :                 if (num_sync < 0)
    1679           0 :                         num_written = num_sync;
    1680             :         }
    1681             : 
    1682             :         return num_written;
    1683             : }
    1684             : 
    1685    26820758 : static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    1686             : {
    1687    26820758 :         return btrfs_do_write_iter(iocb, from, NULL);
    1688             : }
    1689             : 
    1690     7597099 : int btrfs_release_file(struct inode *inode, struct file *filp)
    1691             : {
    1692     7597099 :         struct btrfs_file_private *private = filp->private_data;
    1693             : 
    1694     7597099 :         if (private) {
    1695      148958 :                 kfree(private->filldir_buf);
    1696      148947 :                 free_extent_state(private->llseek_cached_state);
    1697      148955 :                 kfree(private);
    1698      148957 :                 filp->private_data = NULL;
    1699             :         }
    1700             : 
    1701             :         /*
    1702             :          * Set by setattr when we are about to truncate a file from a non-zero
    1703             :          * size to a zero size.  This tries to flush down new bytes that may
    1704             :          * have been written if the application were using truncate to replace
    1705             :          * a file in place.
    1706             :          */
    1707     7601135 :         if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
    1708     7597098 :                                &BTRFS_I(inode)->runtime_flags))
    1709       64698 :                         filemap_flush(inode->i_mapping);
    1710     7601131 :         return 0;
    1711             : }
    1712             : 
    1713      544728 : static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
    1714             : {
    1715      544728 :         int ret;
    1716      544728 :         struct blk_plug plug;
    1717             : 
    1718             :         /*
    1719             :          * This is only called in fsync, which would do synchronous writes, so
    1720             :          * a plug can merge adjacent IOs as much as possible.  Esp. in case of
    1721             :          * multiple disks using raid profile, a large IO can be split to
    1722             :          * several segments of stripe length (currently 64K).
    1723             :          */
    1724      544728 :         blk_start_plug(&plug);
    1725      544682 :         ret = btrfs_fdatawrite_range(inode, start, end);
    1726      544753 :         blk_finish_plug(&plug);
    1727             : 
    1728      544743 :         return ret;
    1729             : }
    1730             : 
    1731      272400 : static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
    1732             : {
    1733      272400 :         struct btrfs_inode *inode = BTRFS_I(ctx->inode);
    1734      272400 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1735             : 
    1736      272400 :         if (btrfs_inode_in_log(inode, fs_info->generation) &&
    1737        1853 :             list_empty(&ctx->ordered_extents))
    1738             :                 return true;
    1739             : 
    1740             :         /*
    1741             :          * If we are doing a fast fsync we can not bail out if the inode's
    1742             :          * last_trans is <= the last committed transaction, because we only
    1743             :          * update the last_trans of the inode during ordered extent completion,
    1744             :          * and for a fast fsync we don't wait for that, we only wait for the
    1745             :          * writeback to complete.
    1746             :          */
    1747      270645 :         if (inode->last_trans <= fs_info->last_trans_committed &&
    1748       18526 :             (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
    1749        3396 :              list_empty(&ctx->ordered_extents)))
    1750       18194 :                 return true;
    1751             : 
    1752             :         return false;
    1753             : }
    1754             : 
    1755             : /*
    1756             :  * fsync call for both files and directories.  This logs the inode into
    1757             :  * the tree log instead of forcing full commits whenever possible.
    1758             :  *
    1759             :  * It needs to call filemap_fdatawait so that all ordered extent updates in
    1760             :  * the metadata btree are up to date for copying to the log.
    1761             :  *
    1762             :  * It drops the inode mutex before doing the tree log commit.  This is an
    1763             :  * important optimization for directories because holding the mutex prevents
    1764             :  * new operations on the dir while we write to disk.
    1765             :  */
    1766      272342 : int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
    1767             : {
    1768      272342 :         struct dentry *dentry = file_dentry(file);
    1769      272279 :         struct inode *inode = d_inode(dentry);
    1770      272279 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1771      272279 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    1772      272279 :         struct btrfs_trans_handle *trans;
    1773      272279 :         struct btrfs_log_ctx ctx;
    1774      272279 :         int ret = 0, err;
    1775      272279 :         u64 len;
    1776      272279 :         bool full_sync;
    1777             : 
    1778      272279 :         trace_btrfs_sync_file(file, datasync);
    1779             : 
    1780      272316 :         btrfs_init_log_ctx(&ctx, inode);
    1781             : 
    1782             :         /*
    1783             :          * Always set the range to a full range, otherwise we can get into
    1784             :          * several problems, from missing file extent items to represent holes
    1785             :          * when not using the NO_HOLES feature, to log tree corruption due to
    1786             :          * races between hole detection during logging and completion of ordered
    1787             :          * extents outside the range, to missing checksums due to ordered extents
    1788             :          * for which we flushed only a subset of their pages.
    1789             :          */
    1790      272316 :         start = 0;
    1791      272316 :         end = LLONG_MAX;
    1792      272316 :         len = (u64)LLONG_MAX + 1;
    1793             : 
    1794             :         /*
    1795             :          * We write the dirty pages in the range and wait until they complete
    1796             :          * outside of the ->i_mutex. That way multiple tasks can flush the dirty
    1797             :          * pages concurrently, which improves performance.  See
    1798             :          * btrfs_wait_ordered_range for an explanation of the ASYNC check.
    1799             :          */
    1800      272316 :         ret = start_ordered_ops(inode, start, end);
    1801      272372 :         if (ret)
    1802           0 :                 goto out;
    1803             : 
    1804      272372 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1805             : 
    1806      272370 :         atomic_inc(&root->log_batch);
    1807             : 
    1808             :         /*
    1809             :          * Before we acquired the inode's lock and the mmap lock, someone may
    1810             :          * have dirtied more pages in the target range. We need to make sure
    1811             :          * that writeback for any such pages does not start while we are logging
    1812             :          * the inode, because if it does, any of the following might happen when
    1813             :          * we are not doing a full inode sync:
    1814             :          *
    1815             :          * 1) We log an extent after its writeback finishes but before its
    1816             :          *    checksums are added to the csum tree, leading to -EIO errors
    1817             :          *    when attempting to read the extent after a log replay.
    1818             :          *
    1819             :          * 2) We can end up logging an extent before its writeback finishes.
    1820             :          *    Therefore after the log replay we will have a file extent item
    1821             :          *    pointing to an unwritten extent (and no data checksums as well).
    1822             :          *
    1823             :          * So trigger writeback for any eventual new dirty pages and then we
    1824             :          * wait for all ordered extents to complete below.
    1825             :          */
    1826      272403 :         ret = start_ordered_ops(inode, start, end);
    1827      272393 :         if (ret) {
    1828           0 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1829           0 :                 goto out;
    1830             :         }
    1831             : 
    1832             :         /*
    1833             :          * Always check for the full sync flag while holding the inode's lock,
    1834             :          * to avoid races with other tasks. The flag must be either set all the
    1835             :          * time during logging or always off all the time while logging.
    1836             :          * We check the flag here after starting delalloc above, because when
    1837             :          * running delalloc the full sync flag may be set if we need to drop
    1838             :          * extra extent map ranges due to temporary memory allocation failures.
    1839             :          */
    1840      272393 :         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    1841             :                              &BTRFS_I(inode)->runtime_flags);
    1842             : 
    1843             :         /*
    1844             :          * We have to do this here to avoid the priority inversion of waiting on
    1845             :          * IO of a lower priority task while holding a transaction open.
    1846             :          *
    1847             :          * For a full fsync we wait for the ordered extents to complete while
    1848             :          * for a fast fsync we wait just for writeback to complete, and then
    1849             :          * attach the ordered extents to the transaction so that a transaction
    1850             :          * commit waits for their completion, to avoid data loss if we fsync,
    1851             :          * the current transaction commits before the ordered extents complete
    1852             :          * and a power failure happens right after that.
    1853             :          *
    1854             :          * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
    1855             :          * logical address recorded in the ordered extent may change. We need
    1856             :          * to wait for the IO to stabilize the logical address.
    1857             :          */
    1858      272393 :         if (full_sync || btrfs_is_zoned(fs_info)) {
    1859      132023 :                 ret = btrfs_wait_ordered_range(inode, start, len);
    1860             :         } else {
    1861             :                 /*
    1862             :                  * Get our ordered extents as soon as possible to avoid doing
    1863             :                  * checksum lookups in the csum tree, and use instead the
    1864             :                  * checksums attached to the ordered extents.
    1865             :                  */
    1866      140370 :                 btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
    1867             :                                                       &ctx.ordered_extents);
    1868      140388 :                 ret = filemap_fdatawait_range(inode->i_mapping, start, end);
    1869             :         }
    1870             : 
    1871      272402 :         if (ret)
    1872          10 :                 goto out_release_extents;
    1873             : 
    1874      272392 :         atomic_inc(&root->log_batch);
    1875             : 
    1876      272399 :         smp_mb();
    1877      272400 :         if (skip_inode_logging(&ctx)) {
    1878             :                 /*
    1879             :                  * We've had everything committed since the last time we were
    1880             :                  * modified so clear this flag in case it was set for whatever
    1881             :                  * reason, it's no longer relevant.
    1882             :                  */
    1883       19944 :                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    1884       19944 :                           &BTRFS_I(inode)->runtime_flags);
    1885             :                 /*
    1886             :                  * An ordered extent might have started before and completed
    1887             :                  * already with io errors, in which case the inode was not
    1888             :                  * updated and we end up here. So check the inode's mapping
    1889             :                  * for any errors that might have happened since we last
    1890             :                  * called fsync.
    1891             :                  */
    1892       19944 :                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
    1893       19944 :                 goto out_release_extents;
    1894             :         }
    1895             : 
    1896             :         /*
    1897             :          * We use start here because we will need to wait on the IO to complete
    1898             :          * in btrfs_sync_log, which could require joining a transaction (for
    1899             :          * example checking cross references in the nocow path).  If we use join
    1900             :          * here we could get into a situation where we're waiting on IO to
    1901             :          * happen that is blocked on a transaction trying to commit.  With start
    1902             :          * we inc the extwriter counter, so we wait for all extwriters to exit
    1903             :          * before we start blocking joiners.  This comment is to keep somebody
    1904             :          * from thinking they are super smart and changing this to
    1905             :          * btrfs_join_transaction *cough*Josef*cough*.
    1906             :          */
    1907      252451 :         trans = btrfs_start_transaction(root, 0);
    1908      252456 :         if (IS_ERR(trans)) {
    1909           0 :                 ret = PTR_ERR(trans);
    1910           0 :                 goto out_release_extents;
    1911             :         }
    1912      252456 :         trans->in_fsync = true;
    1913             : 
    1914      252456 :         ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
    1915      252436 :         btrfs_release_log_ctx_extents(&ctx);
    1916      252417 :         if (ret < 0) {
    1917             :                 /* Fallthrough and commit/free transaction. */
    1918         694 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    1919             :         }
    1920             : 
    1921             :         /* we've logged all the items and now have a consistent
    1922             :          * version of the file in the log.  It is possible that
    1923             :          * someone will come in and modify the file, but that's
    1924             :          * fine because the log is consistent on disk, and we
    1925             :          * have references to all of the file's extents
    1926             :          *
    1927             :          * It is possible that someone will come in and log the
    1928             :          * file again, but that will end up using the synchronization
    1929             :          * inside btrfs_sync_log to keep things safe.
    1930             :          */
    1931      252417 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1932             : 
    1933      252430 :         if (ret == BTRFS_NO_LOG_SYNC) {
    1934           1 :                 ret = btrfs_end_transaction(trans);
    1935           1 :                 goto out;
    1936             :         }
    1937             : 
    1938             :         /* We successfully logged the inode, attempt to sync the log. */
    1939      252429 :         if (!ret) {
    1940      251735 :                 ret = btrfs_sync_log(trans, root, &ctx);
    1941      251762 :                 if (!ret) {
    1942      251557 :                         ret = btrfs_end_transaction(trans);
    1943      251541 :                         goto out;
    1944             :                 }
    1945             :         }
    1946             : 
    1947             :         /*
    1948             :          * At this point we need to commit the transaction because we had
    1949             :          * btrfs_need_log_full_commit() or some other error.
    1950             :          *
    1951             :          * If we didn't do a full sync we have to stop the trans handle, wait on
    1952             :          * the ordered extents, start it again and commit the transaction.  If
    1953             :          * we attempt to wait on the ordered extents here we could deadlock with
    1954             :          * something like fallocate() that is holding the extent lock trying to
    1955             :          * start a transaction while some other thread is trying to commit the
    1956             :          * transaction while we (fsync) are currently holding the transaction
    1957             :          * open.
    1958             :          */
    1959         899 :         if (!full_sync) {
    1960         143 :                 ret = btrfs_end_transaction(trans);
    1961         143 :                 if (ret)
    1962           0 :                         goto out;
    1963         143 :                 ret = btrfs_wait_ordered_range(inode, start, len);
    1964         143 :                 if (ret)
    1965           0 :                         goto out;
    1966             : 
    1967             :                 /*
    1968             :                  * This is safe to use here because we're only interested in
    1969             :                  * making sure the transaction that had the ordered extents is
    1970             :                  * committed.  We aren't waiting on anything past this point,
    1971             :                  * we're purely getting the transaction and committing it.
    1972             :                  */
    1973         143 :                 trans = btrfs_attach_transaction_barrier(root);
    1974         143 :                 if (IS_ERR(trans)) {
    1975          19 :                         ret = PTR_ERR(trans);
    1976             : 
    1977             :                         /*
    1978             :                          * We committed the transaction and there's no currently
    1979             :                          * running transaction, this means everything we care
    1980             :                          * about made it to disk and we are done.
    1981             :                          */
    1982          19 :                         if (ret == -ENOENT)
    1983          19 :                                 ret = 0;
    1984          19 :                         goto out;
    1985             :                 }
    1986             :         }
    1987             : 
    1988         880 :         ret = btrfs_commit_transaction(trans);
    1989      272394 : out:
    1990      272394 :         ASSERT(list_empty(&ctx.list));
    1991      272394 :         ASSERT(list_empty(&ctx.conflict_inodes));
    1992      272394 :         err = file_check_and_advance_wb_err(file);
    1993      272399 :         if (!ret)
    1994      272388 :                 ret = err;
    1995      272399 :         return ret > 0 ? -EIO : ret;
    1996             : 
    1997       19954 : out_release_extents:
    1998       19954 :         btrfs_release_log_ctx_extents(&ctx);
    1999       19954 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2000       19954 :         goto out;
    2001             : }
    2002             : 
    2003             : static const struct vm_operations_struct btrfs_file_vm_ops = {
    2004             :         .fault          = filemap_fault,
    2005             :         .map_pages      = filemap_map_pages,
    2006             :         .page_mkwrite   = btrfs_page_mkwrite,
    2007             : };
    2008             : 
    2009      437697 : static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
    2010             : {
    2011      437697 :         struct address_space *mapping = filp->f_mapping;
    2012             : 
    2013      437697 :         if (!mapping->a_ops->read_folio)
    2014             :                 return -ENOEXEC;
    2015             : 
    2016      437697 :         file_accessed(filp);
    2017      437692 :         vma->vm_ops = &btrfs_file_vm_ops;
    2018             : 
    2019      437692 :         return 0;
    2020             : }
    2021             : 
                       /*
                        * Check whether the item at @slot of @leaf is a hole file extent item of
                        * @inode that touches the range [@start, @end), so that the range could
                        * be merged into it.
                        *
                        * Returns 1 when the item is an EXTENT_DATA item of @inode describing a
                        * regular (BTRFS_FILE_EXTENT_REG) extent with disk_bytenr 0 that either
                        * begins at @end or ends at @start. Returns 0 otherwise, including when
                        * @slot lies outside the items of @leaf.
                        */
    2022           0 : static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
    2023             :                           int slot, u64 start, u64 end)
    2024             : {
    2025           0 :         struct btrfs_file_extent_item *fi;
    2026           0 :         struct btrfs_key key;
    2027             : 
                               /* Callers pass slot - 1, so a negative slot must be tolerated. */
    2028           0 :         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
    2029             :                 return 0;
    2030             : 
    2031           0 :         btrfs_item_key_to_cpu(leaf, &key, slot);
    2032           0 :         if (key.objectid != btrfs_ino(inode) ||
    2033           0 :             key.type != BTRFS_EXTENT_DATA_KEY)
    2034             :                 return 0;
    2035             : 
    2036           0 :         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    2037             : 
    2038           0 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
    2039             :                 return 0;
    2040             : 
                               /* A non-zero disk_bytenr means the item is not a hole. */
    2041           0 :         if (btrfs_file_extent_disk_bytenr(leaf, fi))
    2042             :                 return 0;
    2043             : 
                               /* The item starts exactly where the range ends. */
    2044           0 :         if (key.offset == end)
    2045             :                 return 1;
                               /* The item ends exactly where the range starts. */
    2046           0 :         if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
    2047           0 :                 return 1;
    2048             :         return 0;
    2049             : }
    2050             : 
                       /*
                        * Record the file range [@offset, @end) of @inode as a hole.
                        *
                        * When the NO_HOLES feature is enabled no file extent item is written and
                        * only the in-memory extent map is updated. Otherwise the range is merged
                        * into an adjacent hole item accepted by hole_mergeable() (the item before
                        * the slot returned by the search is tried first, then the item at that
                        * slot) or, failing that, a new hole item is inserted with
                        * btrfs_insert_hole_extent().
                        *
                        * Returns 0 on success, -EINVAL if an item already exists at @offset, or
                        * the non-zero result of btrfs_search_slot() or
                        * btrfs_insert_hole_extent(). Failures while updating the extent map are
                        * not returned; they only mark the inode as needing a full sync.
                        */
    2051     3363191 : static int fill_holes(struct btrfs_trans_handle *trans,
    2052             :                 struct btrfs_inode *inode,
    2053             :                 struct btrfs_path *path, u64 offset, u64 end)
    2054             : {
    2055     3363191 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2056     3363191 :         struct btrfs_root *root = inode->root;
    2057     3363191 :         struct extent_buffer *leaf;
    2058     3363191 :         struct btrfs_file_extent_item *fi;
    2059     3363191 :         struct extent_map *hole_em;
    2060     3363191 :         struct btrfs_key key;
    2061     3363191 :         int ret;
    2062             : 
                               /* With NO_HOLES there is no item to write, only the extent map. */
    2063     3363191 :         if (btrfs_fs_incompat(fs_info, NO_HOLES))
    2064     3363191 :                 goto out;
    2065             : 
    2066           0 :         key.objectid = btrfs_ino(inode);
    2067           0 :         key.type = BTRFS_EXTENT_DATA_KEY;
    2068           0 :         key.offset = offset;
    2069             : 
    2070           0 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2071           0 :         if (ret <= 0) {
    2072             :                 /*
    2073             :                  * We should have dropped this offset, so if we find it then
    2074             :                  * something has gone horribly wrong.
    2075             :                  */
    2076           0 :                 if (ret == 0)
    2077           0 :                         ret = -EINVAL;
    2078           0 :                 return ret;
    2079             :         }
    2080             : 
    2081           0 :         leaf = path->nodes[0];
                               /*
                                * Try the item just before the search slot: if it is a mergeable
                                * hole, grow it by the length of our range.
                                */
    2082           0 :         if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
    2083           0 :                 u64 num_bytes;
    2084             : 
    2085           0 :                 path->slots[0]--;
    2086           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
    2087             :                                     struct btrfs_file_extent_item);
    2088           0 :                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
    2089             :                         end - offset;
    2090           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
    2091           0 :                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
    2092           0 :                 btrfs_set_file_extent_offset(leaf, fi, 0);
    2093           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
    2094           0 :                 btrfs_mark_buffer_dirty(leaf);
    2095           0 :                 goto out;
    2096             :         }
    2097             : 
                               /*
                                * Try the item at the search slot: if it is a mergeable hole,
                                * re-key it to start at @offset and grow it by our length.
                                */
    2098           0 :         if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
    2099           0 :                 u64 num_bytes;
    2100             : 
    2101           0 :                 key.offset = offset;
    2102           0 :                 btrfs_set_item_key_safe(fs_info, path, &key);
    2103           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
    2104             :                                     struct btrfs_file_extent_item);
    2105           0 :                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
    2106             :                         offset;
    2107           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
    2108           0 :                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
    2109           0 :                 btrfs_set_file_extent_offset(leaf, fi, 0);
    2110           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
    2111           0 :                 btrfs_mark_buffer_dirty(leaf);
    2112           0 :                 goto out;
    2113             :         }
    2114           0 :         btrfs_release_path(path);
    2115             : 
                               /* Nothing to merge with: insert a new hole item for the range. */
    2116           0 :         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
    2117             :                                        end - offset);
    2118           0 :         if (ret)
    2119             :                 return ret;
    2120             : 
    2121           0 : out:
    2122     3363191 :         btrfs_release_path(path);
    2123             : 
                               /* Replace the extent maps of the range with one hole map. */
    2124     3363216 :         hole_em = alloc_extent_map();
    2125     3363269 :         if (!hole_em) {
                                       /*
                                        * No memory for the hole map: drop the extent maps of the
                                        * range instead and mark the inode for a full sync.
                                        */
    2126           0 :                 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
    2127           0 :                 btrfs_set_inode_full_sync(inode);
    2128             :         } else {
    2129     3363269 :                 hole_em->start = offset;
    2130     3363269 :                 hole_em->len = end - offset;
    2131     3363269 :                 hole_em->ram_bytes = hole_em->len;
    2132     3363269 :                 hole_em->orig_start = offset;
    2133             : 
    2134     3363269 :                 hole_em->block_start = EXTENT_MAP_HOLE;
    2135     3363269 :                 hole_em->block_len = 0;
    2136     3363269 :                 hole_em->orig_block_len = 0;
    2137     3363269 :                 hole_em->compress_type = BTRFS_COMPRESS_NONE;
    2138     3363269 :                 hole_em->generation = trans->transid;
    2139             : 
    2140     3363269 :                 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
    2141     3363274 :                 free_extent_map(hole_em);
                                       /* The failure is not propagated, only a full sync is forced. */
    2142     3363299 :                 if (ret)
    2143           0 :                         btrfs_set_inode_full_sync(inode);
    2144             :         }
    2145             : 
    2146             :         return 0;
    2147             : }
    2148             : 
    2149             : /*
    2150             :  * Look up the extent map covering *start. If it is a hole (or a vacuum
    2151             :  * extent, which only exists in no-holes mode), advance *start to the end of
    2152             :  * that extent map, shrink *len to what remains past it and return 1.
    2153             :  * Otherwise return 0 with *start and *len untouched, or a lookup error.
    2154             :  */
    2155     4809060 : static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
    2156             : {
    2157     4809060 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2158     4809060 :         struct extent_map *em;
    2159     4809060 :         int ret = 0;
    2160             : 
                               /* The lookup uses a sector-aligned start and length. */
    2161     4809060 :         em = btrfs_get_extent(inode, NULL, 0,
    2162     4809060 :                               round_down(*start, fs_info->sectorsize),
    2163     4809060 :                               round_up(*len, fs_info->sectorsize));
    2164     4809193 :         if (IS_ERR(em))
    2165           0 :                 return PTR_ERR(em);
    2166             : 
    2167             :         /* Hole or vacuum extent(only exists in no-hole mode) */
    2168     4809193 :         if (em->block_start == EXTENT_MAP_HOLE) {
    2169      155611 :                 ret = 1;
                                       /*
                                        * Keep only the part of the range that lies past the hole;
                                        * the length becomes 0 when the hole reaches or passes the
                                        * end of the range.
                                        */
    2170      311222 :                 *len = em->start + em->len > *start + *len ?
    2171      155611 :                        0 : *start + *len - em->start - em->len;
    2172      155611 :                 *start = em->start + em->len;
    2173             :         }
    2174     4809193 :         free_extent_map(em);
    2175     4809193 :         return ret;
    2176             : }
    2177             : 
                       /*
                        * Truncate the page cache of [@lockstart, @lockend] and lock that range
                        * in the inode's io_tree, repeating both steps until no page is left in
                        * the page-aligned interior of the range.
                        *
                        * On return the range is locked (lock state in @cached_state) and
                        * btrfs_assert_inode_range_clean() has been called on it.
                        */
    2178     2498121 : static void btrfs_punch_hole_lock_range(struct inode *inode,
    2179             :                                         const u64 lockstart,
    2180             :                                         const u64 lockend,
    2181             :                                         struct extent_state **cached_state)
    2182             : {
    2183             :         /*
    2184             :          * For subpage case, if the range is not at page boundary, we could
    2185             :          * have pages at the leading/trailing part of the range.
    2186             :          * This could lead to an endless loop since filemap_range_has_page()
    2187             :          * will always return true.
    2188             :          * So here we need to do extra page alignment for
    2189             :          * filemap_range_has_page().
    2190             :          */
    2191     2498121 :         const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
    2192     2498121 :         const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
    2193             : 
    2194     2498121 :         while (1) {
    2195     2498121 :                 truncate_pagecache_range(inode, lockstart, lockend);
    2196             : 
    2197     2498153 :                 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2198             :                             cached_state);
    2199             :                 /*
    2200             :                  * We can't have ordered extents in the range, nor dirty/writeback
    2201             :                  * pages, because we have locked the inode's VFS lock in exclusive
    2202             :                  * mode, we have locked the inode's i_mmap_lock in exclusive mode,
    2203             :                  * we have flushed all delalloc in the range and we have waited
    2204             :                  * for any ordered extents in the range to complete.
    2205             :                  * We can race with anyone reading pages from this range, so after
    2206             :                  * locking the range check if we have pages in the range, and if
    2207             :                  * we do, unlock the range and retry.
    2208             :                  */
    2209     2498206 :                 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
    2210             :                                             page_lockend))
    2211             :                         break;
    2212             : 
    2213           0 :                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2214             :                               cached_state);
    2215             :         }
    2216             : 
    2217     2498164 :         btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
    2218     2498112 : }
    2219             : 
                       /*
                        * Insert one file extent item described by @extent_info and do the
                        * related accounting.
                        *
                        * The item is keyed at extent_info->file_offset, is initialised from
                        * extent_info->extent_buf, and then gets its offset set to
                        * extent_info->data_offset and its num_bytes set to @replace_len.
                        *
                        * @replace_len:   length stored in the new item; 0 makes this a no-op.
                        * @bytes_to_drop: passed as the last argument of
                        *                 btrfs_update_inode_bytes(); presumably the number of
                        *                 bytes to subtract from the inode's byte count (TODO
                        *                 confirm against that helper).
                        *
                        * extent_info->insertions is incremented whenever the non-hole path is
                        * reached, even if adding the extent reference failed.
                        *
                        * Returns 0 on success or the non-zero result of the failing helper.
                        */
    2220    14846523 : static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
    2221             :                                      struct btrfs_inode *inode,
    2222             :                                      struct btrfs_path *path,
    2223             :                                      struct btrfs_replace_extent_info *extent_info,
    2224             :                                      const u64 replace_len,
    2225             :                                      const u64 bytes_to_drop)
    2226             : {
    2227    14846523 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2228    14846523 :         struct btrfs_root *root = inode->root;
    2229    14846523 :         struct btrfs_file_extent_item *extent;
    2230    14846523 :         struct extent_buffer *leaf;
    2231    14846523 :         struct btrfs_key key;
    2232    14846523 :         int slot;
    2233    14846523 :         struct btrfs_ref ref = { 0 };
    2234    14846523 :         int ret;
    2235             : 
    2236    14846523 :         if (replace_len == 0)
    2237             :                 return 0;
    2238             : 
                               /*
                                * A hole (disk_offset == 0) needs no item when NO_HOLES is
                                * enabled, so only the inode's byte accounting is updated.
                                */
    2239    14846523 :         if (extent_info->disk_offset == 0 &&
    2240          11 :             btrfs_fs_incompat(fs_info, NO_HOLES)) {
    2241           0 :                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
    2242           0 :                 return 0;
    2243             :         }
    2244             : 
    2245    14846523 :         key.objectid = btrfs_ino(inode);
    2246    14846523 :         key.type = BTRFS_EXTENT_DATA_KEY;
    2247    14846523 :         key.offset = extent_info->file_offset;
    2248    14846523 :         ret = btrfs_insert_empty_item(trans, root, path, &key,
    2249             :                                       sizeof(struct btrfs_file_extent_item));
    2250    14846524 :         if (ret)
    2251             :                 return ret;
    2252    14846523 :         leaf = path->nodes[0];
    2253    14846523 :         slot = path->slots[0];
                               /* Copy the template item, then patch the per-insertion fields. */
    2254    14846523 :         write_extent_buffer(leaf, extent_info->extent_buf,
    2255    14846523 :                             btrfs_item_ptr_offset(leaf, slot),
    2256             :                             sizeof(struct btrfs_file_extent_item));
    2257    14846526 :         extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    2258    14846526 :         ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
    2259    14846525 :         btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
    2260    14846523 :         btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
    2261    14846522 :         if (extent_info->is_new_extent)
    2262      439555 :                 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
    2263    14846522 :         btrfs_mark_buffer_dirty(leaf);
    2264    14846527 :         btrfs_release_path(path);
    2265             : 
    2266    14846527 :         ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
    2267             :                                                 replace_len);
    2268    14846526 :         if (ret)
    2269             :                 return ret;
    2270             : 
    2271             :         /* If it's a hole, nothing more needs to be done. */
    2272    14846526 :         if (extent_info->disk_offset == 0) {
    2273          11 :                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
    2274          11 :                 return 0;
    2275             :         }
    2276             : 
    2277    14846515 :         btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
    2278             : 
                               /*
                                * The first insertion of a new extent goes through
                                * btrfs_alloc_reserved_file_extent(); every other case adds a
                                * delayed reference to the extent at disk_offset.
                                */
    2279    14846512 :         if (extent_info->is_new_extent && extent_info->insertions == 0) {
    2280      439552 :                 key.objectid = extent_info->disk_offset;
    2281      439552 :                 key.type = BTRFS_EXTENT_ITEM_KEY;
    2282      439552 :                 key.offset = extent_info->disk_len;
    2283      439552 :                 ret = btrfs_alloc_reserved_file_extent(trans, root,
    2284             :                                                        btrfs_ino(inode),
    2285             :                                                        extent_info->file_offset,
    2286      439552 :                                                        extent_info->qgroup_reserved,
    2287             :                                                        &key);
    2288             :         } else {
    2289    14406960 :                 u64 ref_offset;
    2290             : 
    2291    14406960 :                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
    2292             :                                        extent_info->disk_offset,
    2293             :                                        extent_info->disk_len, 0);
    2294    14406960 :                 ref_offset = extent_info->file_offset - extent_info->data_offset;
    2295    14406960 :                 btrfs_init_data_ref(&ref, root->root_key.objectid,
    2296             :                                     btrfs_ino(inode), ref_offset, 0, false);
    2297    14406960 :                 ret = btrfs_inc_extent_ref(trans, &ref);
    2298             :         }
    2299             : 
                               /* Counted even when ret is non-zero. */
    2300    14846516 :         extent_info->insertions++;
    2301             : 
    2302    14846516 :         return ret;
    2303             : }
    2304             : 
    2305             : /*
    2306             :  * The respective range must have been previously locked, as well as the inode.
    2307             :  * The end offset is inclusive (last byte of the range).
    2308             :  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
    2309             :  * the file range with an extent.
    2310             :  * When not punching a hole, we don't want to end up in a state where we dropped
    2311             :  * extents without inserting a new one, so we must abort the transaction to avoid
    2312             :  * a corruption.
    2313             :  */
    2314    18252202 : int btrfs_replace_file_extents(struct btrfs_inode *inode,
    2315             :                                struct btrfs_path *path, const u64 start,
    2316             :                                const u64 end,
    2317             :                                struct btrfs_replace_extent_info *extent_info,
    2318             :                                struct btrfs_trans_handle **trans_out)
    2319             : {
    2320    18252202 :         struct btrfs_drop_extents_args drop_args = { 0 };
    2321    18252202 :         struct btrfs_root *root = inode->root;
    2322    18252202 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2323    18252202 :         u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
    2324    18252202 :         u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
    2325    18252202 :         struct btrfs_trans_handle *trans = NULL;
    2326    18252202 :         struct btrfs_block_rsv *rsv;
    2327    18252202 :         unsigned int rsv_count;
    2328    18252202 :         u64 cur_offset;
    2329    18252202 :         u64 len = end - start;
    2330    18252202 :         int ret = 0;
    2331             : 
    2332    18252202 :         if (end <= start)
    2333             :                 return -EINVAL;
    2334             : 
    2335    18252202 :         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
    2336    18252047 :         if (!rsv) {
    2337           0 :                 ret = -ENOMEM;
    2338           0 :                 goto out;
    2339             :         }
    2340    18252047 :         rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
    2341    18252047 :         rsv->failfast = true;
    2342             : 
    2343             :         /*
    2344             :          * 1 - update the inode
    2345             :          * 1 - removing the extents in the range
    2346             :          * 1 - adding the hole extent if no_holes isn't set or if we are
    2347             :          *     replacing the range with a new extent
    2348             :          */
    2349    18252047 :         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
    2350             :                 rsv_count = 3;
    2351             :         else
    2352     3405376 :                 rsv_count = 2;
    2353             : 
    2354    18252047 :         trans = btrfs_start_transaction(root, rsv_count);
    2355    18252253 :         if (IS_ERR(trans)) {
    2356        2041 :                 ret = PTR_ERR(trans);
    2357        2041 :                 trans = NULL;
    2358        2041 :                 goto out_free;
    2359             :         }
    2360             : 
    2361    18250212 :         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
    2362             :                                       min_size, false);
    2363    18250161 :         if (WARN_ON(ret))
    2364           0 :                 goto out_trans;
    2365    18250161 :         trans->block_rsv = rsv;
    2366             : 
    2367    18250161 :         cur_offset = start;
    2368    18250161 :         drop_args.path = path;
    2369    18250161 :         drop_args.end = end + 1;
    2370    18250161 :         drop_args.drop_cache = true;
    2371    18250170 :         while (cur_offset < end) {
    2372    18250170 :                 drop_args.start = cur_offset;
    2373    18250170 :                 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
    2374             :                 /* If we are punching a hole decrement the inode's byte count */
    2375    18250145 :                 if (!extent_info)
    2376     3403696 :                         btrfs_update_inode_bytes(inode, 0,
    2377             :                                                  drop_args.bytes_found);
    2378    18250114 :                 if (ret != -ENOSPC) {
    2379             :                         /*
    2380             :                          * The only time we don't want to abort is if we are
    2381             :                          * attempting to clone a partial inline extent, in which
    2382             :                          * case we'll get EOPNOTSUPP.  However if we aren't
    2383             :                          * cloning we need to abort no matter what, because if we
    2384             :                          * got EOPNOTSUPP via prealloc then we messed up and
    2385             :                          * need to abort.
    2386             :                          */
    2387    18250105 :                         if (ret &&
    2388           0 :                             (ret != -EOPNOTSUPP ||
    2389           0 :                              (extent_info && extent_info->is_new_extent)))
    2390           0 :                                 btrfs_abort_transaction(trans, ret);
    2391             :                         break;
    2392             :                 }
    2393             : 
    2394           9 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    2395             : 
    2396           9 :                 if (!extent_info && cur_offset < drop_args.drop_end &&
    2397             :                     cur_offset < ino_size) {
    2398           0 :                         ret = fill_holes(trans, inode, path, cur_offset,
    2399             :                                          drop_args.drop_end);
    2400           0 :                         if (ret) {
    2401             :                                 /*
    2402             :                                  * If we failed then we didn't insert our hole
    2403             :                                  * entries for the area we dropped, so now the
    2404             :                                  * fs is corrupted, so we must abort the
    2405             :                                  * transaction.
    2406             :                                  */
    2407           0 :                                 btrfs_abort_transaction(trans, ret);
    2408           0 :                                 break;
    2409             :                         }
    2410           9 :                 } else if (!extent_info && cur_offset < drop_args.drop_end) {
    2411             :                         /*
    2412             :                          * We are past the i_size here, but since we didn't
    2413             :                          * insert holes we need to clear the mapped area so we
    2414             :                          * know to not set disk_i_size in this area until a new
    2415             :                          * file extent is inserted here.
    2416             :                          */
    2417           0 :                         ret = btrfs_inode_clear_file_extent_range(inode,
    2418             :                                         cur_offset,
    2419             :                                         drop_args.drop_end - cur_offset);
    2420           0 :                         if (ret) {
    2421             :                                 /*
    2422             :                                  * We couldn't clear our area, so we could
    2423             :                                  * presumably adjust up and corrupt the fs, so
    2424             :                                  * we need to abort.
    2425             :                                  */
    2426           0 :                                 btrfs_abort_transaction(trans, ret);
    2427           0 :                                 break;
    2428             :                         }
    2429             :                 }
    2430             : 
    2431           9 :                 if (extent_info &&
    2432           9 :                     drop_args.drop_end > extent_info->file_offset) {
    2433           9 :                         u64 replace_len = drop_args.drop_end -
    2434             :                                           extent_info->file_offset;
    2435             : 
    2436           9 :                         ret = btrfs_insert_replace_extent(trans, inode, path,
    2437             :                                         extent_info, replace_len,
    2438             :                                         drop_args.bytes_found);
    2439           9 :                         if (ret) {
    2440           0 :                                 btrfs_abort_transaction(trans, ret);
    2441           0 :                                 break;
    2442             :                         }
    2443           9 :                         extent_info->data_len -= replace_len;
    2444           9 :                         extent_info->data_offset += replace_len;
    2445           9 :                         extent_info->file_offset += replace_len;
    2446             :                 }
    2447             : 
    2448             :                 /*
    2449             :                  * We are releasing our handle on the transaction, balance the
    2450             :                  * dirty pages of the btree inode and flush delayed items, and
    2451             :                  * then get a new transaction handle, which may now point to a
    2452             :                  * new transaction in case someone else may have committed the
    2453             :                  * transaction we used to replace/drop file extent items. So
    2454             :                  * bump the inode's iversion and update mtime and ctime except
    2455             :                  * if we are called from a dedupe context. This is because a
    2456             :                  * power failure/crash may happen after the transaction is
    2457             :                  * committed and before we finish replacing/dropping all the
    2458             :                  * file extent items we need.
    2459             :                  */
    2460           9 :                 inode_inc_iversion(&inode->vfs_inode);
    2461             : 
    2462           9 :                 if (!extent_info || extent_info->update_times) {
    2463           9 :                         inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
    2464           9 :                         inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
    2465             :                 }
    2466             : 
    2467           9 :                 ret = btrfs_update_inode(trans, root, inode);
    2468           9 :                 if (ret)
    2469             :                         break;
    2470             : 
    2471           9 :                 btrfs_end_transaction(trans);
    2472           9 :                 btrfs_btree_balance_dirty(fs_info);
    2473             : 
    2474           9 :                 trans = btrfs_start_transaction(root, rsv_count);
    2475           9 :                 if (IS_ERR(trans)) {
    2476           0 :                         ret = PTR_ERR(trans);
    2477           0 :                         trans = NULL;
    2478           0 :                         break;
    2479             :                 }
    2480             : 
    2481           9 :                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
    2482             :                                               rsv, min_size, false);
    2483           0 :                 if (WARN_ON(ret))
    2484             :                         break;
    2485           9 :                 trans->block_rsv = rsv;
    2486             : 
    2487           9 :                 cur_offset = drop_args.drop_end;
    2488           9 :                 len = end - cur_offset;
    2489           9 :                 if (!extent_info && len) {
    2490           0 :                         ret = find_first_non_hole(inode, &cur_offset, &len);
    2491           0 :                         if (unlikely(ret < 0))
    2492             :                                 break;
    2493           0 :                         if (ret && !len) {
    2494             :                                 ret = 0;
    2495             :                                 break;
    2496             :                         }
    2497             :                 }
    2498             :         }
    2499             : 
    2500             :         /*
    2501             :          * If we were cloning, force the next fsync to be a full one since we
    2502             :          * replaced (or just dropped in the case of cloning holes when
    2503             :          * NO_HOLES is enabled) file extent items and did not setup new extent
    2504             :          * maps for the replacement extents (or holes).
    2505             :          */
    2506    18250061 :         if (extent_info && !extent_info->is_new_extent)
    2507    14406964 :                 btrfs_set_inode_full_sync(inode);
    2508             : 
    2509    18250062 :         if (ret)
    2510           0 :                 goto out_trans;
    2511             : 
    2512    18250062 :         trans->block_rsv = &fs_info->trans_block_rsv;
    2513             :         /*
    2514             :          * If we are using the NO_HOLES feature we might have had already a
    2515             :          * hole that overlaps a part of the region [lockstart, lockend] and
    2516             :          * ends at (or beyond) lockend. Since we have no file extent items to
    2517             :          * represent holes, drop_end can be less than lockend and so we must
    2518             :          * make sure we have an extent map representing the existing hole (the
    2519             :          * call to btrfs_drop_extents() might have dropped the existing extent
    2520             :          * map representing the existing hole), otherwise the fast fsync path
    2521             :          * will not record the existence of the hole region
    2522             :          * [existing_hole_start, lockend].
    2523             :          */
    2524    18250062 :         if (drop_args.drop_end <= end)
    2525      228917 :                 drop_args.drop_end = end + 1;
    2526             :         /*
    2527             :          * Don't insert file hole extent item if it's for a range beyond eof
    2528             :          * (because it's useless) or if it represents a 0 bytes range (when
    2529             :          * cur_offset == drop_end).
    2530             :          */
    2531    18250062 :         if (!extent_info && cur_offset < ino_size &&
    2532     3363222 :             cur_offset < drop_args.drop_end) {
    2533     3363150 :                 ret = fill_holes(trans, inode, path, cur_offset,
    2534             :                                  drop_args.drop_end);
    2535     3363291 :                 if (ret) {
    2536             :                         /* Same comment as above. */
    2537           0 :                         btrfs_abort_transaction(trans, ret);
    2538           0 :                         goto out_trans;
    2539             :                 }
    2540    14886912 :         } else if (!extent_info && cur_offset < drop_args.drop_end) {
    2541             :                 /* See the comment in the loop above for the reasoning here. */
    2542       40399 :                 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
    2543             :                                         drop_args.drop_end - cur_offset);
    2544       40400 :                 if (ret) {
    2545           0 :                         btrfs_abort_transaction(trans, ret);
    2546           0 :                         goto out_trans;
    2547             :                 }
    2548             : 
    2549             :         }
    2550    18250204 :         if (extent_info) {
    2551    14846516 :                 ret = btrfs_insert_replace_extent(trans, inode, path,
    2552             :                                 extent_info, extent_info->data_len,
    2553             :                                 drop_args.bytes_found);
    2554    14846518 :                 if (ret) {
    2555           0 :                         btrfs_abort_transaction(trans, ret);
    2556           0 :                         goto out_trans;
    2557             :                 }
    2558             :         }
    2559             : 
    2560    18250206 : out_trans:
    2561    18250206 :         if (!trans)
    2562           0 :                 goto out_free;
    2563             : 
    2564    18250206 :         trans->block_rsv = &fs_info->trans_block_rsv;
    2565    18250206 :         if (ret)
    2566           0 :                 btrfs_end_transaction(trans);
    2567             :         else
    2568    18250206 :                 *trans_out = trans;
    2569    18252247 : out_free:
    2570    18252247 :         btrfs_free_block_rsv(fs_info, rsv);
    2571             : out:
    2572             :         return ret;
    2573             : }
    2574             : /* Punch a hole over [offset, offset + len) of @file; returns 0 on success, otherwise the error of the failing step. */
    2575     2389650 : static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
    2576             : {
    2577     2389650 :         struct inode *inode = file_inode(file);
    2578     2389650 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    2579     2389650 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    2580     2389650 :         struct extent_state *cached_state = NULL;
    2581     2389650 :         struct btrfs_path *path;
    2582     2389650 :         struct btrfs_trans_handle *trans = NULL;
    2583     2389650 :         u64 lockstart;
    2584     2389650 :         u64 lockend;
    2585     2389650 :         u64 tail_start;
    2586     2389650 :         u64 tail_len;
    2587     2389650 :         u64 orig_start = offset;
    2588     2389650 :         int ret = 0;
    2589     2389650 :         bool same_block;
    2590     2389650 :         u64 ino_size;
    2591     2389650 :         bool truncated_block = false;
    2592     2389650 :         bool updated_inode = false;
    2593             : 
    2594     2389650 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2595             : 
    2596     2389673 :         ret = btrfs_wait_ordered_range(inode, offset, len);
    2597     2389628 :         if (ret)
    2598           0 :                 goto out_only_mutex;
    2599             : 
    2600     2389628 :         ino_size = round_up(inode->i_size, fs_info->sectorsize);
    2601     2389628 :         ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
    2602     2389665 :         if (ret < 0)
    2603           0 :                 goto out_only_mutex;
    2604     2389665 :         if (ret && !len) {
    2605             :                 /* Already in a large hole */
    2606       51967 :                 ret = 0;
    2607       51967 :                 goto out_only_mutex;
    2608             :         }
    2609             : 
    2610     2337698 :         ret = file_modified(file);
    2611     2337621 :         if (ret)
    2612          61 :                 goto out_only_mutex;
    2613             : 
    2614     2337560 :         lockstart = round_up(offset, fs_info->sectorsize);
    2615     2337560 :         lockend = round_down(offset + len, fs_info->sectorsize) - 1;
    2616     2337560 :         same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
    2617     2337560 :                 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
    2618             :         /*
    2619             :          * The range lies within a single block and does not cover all of it:
    2620             :          * zero just that part of the block and skip dropping any extents.
    2621             :          */
    2622             :         /*
    2623             :          * A block beyond the (sector-rounded) end of the file needs no
    2624             :          * truncation because we are sure there is no data there.
    2625             :          */
    2626     2337560 :         if (same_block && len < fs_info->sectorsize) {
    2627        9981 :                 if (offset < ino_size) {
    2628        7052 :                         truncated_block = true;
    2629        7052 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
    2630             :                                                    0);
    2631             :                 } else {
    2632             :                         ret = 0;
    2633             :                 }
    2634        9981 :                 goto out_only_mutex;
    2635             :         }
    2636             : 
    2637             :         /* Zero the back part of the first block, unless it is beyond the rounded-up i_size. */
    2638     2327579 :         if (offset < ino_size) {
    2639     2326937 :                 truncated_block = true;
    2640     2326937 :                 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
    2641     2326908 :                 if (ret) { /* same effect as goto out_only_mutex: ret != 0 skips the inode update */
    2642          16 :                         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2643          16 :                         return ret;
    2644             :                 }
    2645             :         }
    2646             : 
    2647             :         /* Check the aligned pages after the first unaligned page.
    2648             :          * If offset != orig_start, the first unaligned page and
    2649             :          * several following pages are already in holes, so the
    2650             :          * extra check can be skipped. */
    2651     2327534 :         if (offset == orig_start) {
    2652             :                 /* After truncating the block, check for holes again. */
    2653     2280254 :                 len = offset + len - lockstart;
    2654     2280254 :                 offset = lockstart;
    2655     2280254 :                 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
    2656     2280367 :                 if (ret < 0)
    2657           0 :                         goto out_only_mutex;
    2658     2280367 :                 if (ret && !len) {
    2659        2345 :                         ret = 0;
    2660        2345 :                         goto out_only_mutex;
    2661             :                 }
    2662     2278022 :                 lockstart = offset;
    2663             :         }
    2664             : 
    2665             :         /* Check whether the unaligned tail part is in a hole. */
    2666     2325302 :         tail_start = lockend + 1;
    2667     2325302 :         tail_len = offset + len - tail_start;
    2668     2325302 :         if (tail_len) {
    2669      139181 :                 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
    2670      139181 :                 if (unlikely(ret < 0))
    2671           0 :                         goto out_only_mutex;
    2672      139181 :                 if (!ret) {
    2673             :                         /* Zero the front end of the last block. */
    2674       95674 :                         if (tail_start + tail_len < ino_size) {
    2675       95473 :                                 truncated_block = true;
    2676       95473 :                                 ret = btrfs_truncate_block(BTRFS_I(inode),
    2677             :                                                         tail_start + tail_len,
    2678             :                                                         0, 1);
    2679       95473 :                                 if (ret)
    2680          17 :                                         goto out_only_mutex;
    2681             :                         }
    2682             :                 }
    2683             :         }
    2684             : 
    2685     2325285 :         if (lockend < lockstart) { /* no sector-aligned range is left to drop */
    2686        5157 :                 ret = 0;
    2687        5157 :                 goto out_only_mutex;
    2688             :         }
    2689             : 
    2690     2320128 :         btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
    2691             : 
    2692     2320049 :         path = btrfs_alloc_path();
    2693     2320116 :         if (!path) {
    2694           0 :                 ret = -ENOMEM;
    2695           0 :                 goto out;
    2696             :         }
    2697             : 
    2698     2320116 :         ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
    2699             :                                          lockend, NULL, &trans);
    2700     2320151 :         btrfs_free_path(path);
    2701     2320090 :         if (ret)
    2702          49 :                 goto out;
    2703             : 
    2704     2320041 :         ASSERT(trans != NULL);
    2705     2320041 :         inode_inc_iversion(inode);
    2706     2320092 :         inode->i_mtime = current_time(inode);
    2707     2320084 :         inode->i_ctime = inode->i_mtime;
    2708     2320084 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    2709     2320101 :         updated_inode = true;
    2710     2320101 :         btrfs_end_transaction(trans);
    2711     2320102 :         btrfs_btree_balance_dirty(fs_info);
    2712     2320100 : out:
    2713     2320100 :         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2714             :                       &cached_state);
    2715     2389664 : out_only_mutex:
    2716     2389664 :         if (!updated_inode && truncated_block && !ret) {
    2717             :                 /*
    2718             :                  * If we only end up zeroing part of a page, we still need to
    2719             :                  * update the inode item, so that all the time fields are
    2720             :                  * updated as well as the necessary btrfs inode in memory fields
    2721             :                  * for detecting, at fsync time, if the inode isn't yet in the
    2722             :                  * log tree or it's there but not up to date.
    2723             :                  */
    2724       14550 :                 struct timespec64 now = current_time(inode);
    2725             : 
    2726       14550 :                 inode_inc_iversion(inode);
    2727       14550 :                 inode->i_mtime = now;
    2728       14550 :                 inode->i_ctime = now;
    2729       14550 :                 trans = btrfs_start_transaction(root, 1);
    2730       14550 :                 if (IS_ERR(trans)) {
    2731           0 :                         ret = PTR_ERR(trans);
    2732             :                 } else {
    2733       14550 :                         int ret2;
    2734             : 
    2735       14550 :                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    2736       14550 :                         ret2 = btrfs_end_transaction(trans);
    2737       14550 :                         if (!ret)
    2738       14550 :                                 ret = ret2;
    2739             :                 }
    2740             :         }
    2741     2389664 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2742     2389664 :         return ret;
    2743             : }
    2744             : 
    2745             : /* One entry in a list of ranges that have already been reserved. */
    2746             : struct falloc_range {
    2747             :         struct list_head list; /* linkage into the list the range is queued on */
    2748             :         u64 start; /* start offset of the range */
    2749             :         u64 len; /* length of the range; start + len is its exclusive end */
    2750             : };
    2751             : 
    2752             : /*
    2753             :  * Record [start, start + len) in the list at @head. If it begins exactly
    2754             :  * where the last entry ends, that entry is extended instead of allocating.
    2755             :  * Returns 0, or -ENOMEM if a new entry cannot be allocated. The caller
    2756             :  * should have locked the larger extent range containing the new range.
    2757             :  */
    2758      315833 : static int add_falloc_range(struct list_head *head, u64 start, u64 len)
    2759             : {
    2760      315833 :         struct falloc_range *range = NULL;
    2761             : 
    2762      315833 :         if (!list_empty(head)) {
    2763             :                 /*
    2764             :                  * As fallocate iterates by bytenr order, we only need to check
    2765             :                  * the last range.
    2766             :                  */
    2767      111008 :                 range = list_last_entry(head, struct falloc_range, list);
    2768      111008 :                 if (range->start + range->len == start) {
    2769       63597 :                         range->len += len;
    2770       63597 :                         return 0;
    2771             :                 }
    2772             :         }
    2773             : 
    2774      252236 :         range = kmalloc(sizeof(*range), GFP_KERNEL);
    2775      252236 :         if (!range)
    2776             :                 return -ENOMEM;
    2777      252236 :         range->start = start;
    2778      252236 :         range->len = len;
    2779      252236 :         list_add_tail(&range->list, head);
    2780      252236 :         return 0;
    2781             : }
    2782             : /* Raise i_size to @end in its own transaction; returns 0 without doing anything if FALLOC_FL_KEEP_SIZE is set or @end <= i_size. */
    2783      696506 : static int btrfs_fallocate_update_isize(struct inode *inode,
    2784             :                                         const u64 end,
    2785             :                                         const int mode)
    2786             : {
    2787      696506 :         struct btrfs_trans_handle *trans;
    2788      696506 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    2789      696506 :         int ret;
    2790      696506 :         int ret2;
    2791             : 
    2792      696506 :         if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
    2793             :                 return 0;
    2794             : 
    2795      266292 :         trans = btrfs_start_transaction(root, 1);
    2796      266292 :         if (IS_ERR(trans))
    2797           0 :                 return PTR_ERR(trans);
    2798             : 
    2799      266292 :         inode->i_ctime = current_time(inode);
    2800      266292 :         i_size_write(inode, end);
    2801      266292 :         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
    2802      266292 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    2803      266292 :         ret2 = btrfs_end_transaction(trans); /* always end the transaction, even if the update failed */
    2804             : 
    2805      266292 :         return ret ? ret : ret2; /* the inode update error takes precedence */
    2806             : }
    2807             : /* What backs the sector at a zero-range boundary, as reported by btrfs_zero_range_check_range_boundary(). */
    2808             : enum {
    2809             :         RANGE_BOUNDARY_WRITTEN_EXTENT, /* neither a hole nor flagged as prealloc */
    2810             :         RANGE_BOUNDARY_PREALLOC_EXTENT, /* extent map has EXTENT_FLAG_PREALLOC set */
    2811             :         RANGE_BOUNDARY_HOLE, /* extent map's block_start is EXTENT_MAP_HOLE */
    2812             : };
    2813             : /* Classify the sector containing @offset; returns a RANGE_BOUNDARY_* value, or the extent map lookup error. */
    2814      330082 : static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
    2815             :                                                  u64 offset)
    2816             : {
    2817      330082 :         const u64 sectorsize = inode->root->fs_info->sectorsize;
    2818      330082 :         struct extent_map *em;
    2819      330082 :         int ret;
    2820             : 
    2821      330082 :         offset = round_down(offset, sectorsize);
    2822      330082 :         em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
    2823      330082 :         if (IS_ERR(em))
    2824           0 :                 return PTR_ERR(em);
    2825             : 
    2826      330082 :         if (em->block_start == EXTENT_MAP_HOLE) /* the hole check comes first, before the prealloc flag */
    2827             :                 ret = RANGE_BOUNDARY_HOLE;
    2828      250670 :         else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
    2829             :                 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
    2830             :         else
    2831      103918 :                 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
    2832             : 
    2833      330082 :         free_extent_map(em);
    2834      330082 :         return ret;
    2835             : }
    2836             : /* Zero [offset, offset + len): partially zero edge blocks backed by written extents, preallocate the rest, then update i_size per @mode. */
    2837      190089 : static int btrfs_zero_range(struct inode *inode,
    2838             :                             loff_t offset,
    2839             :                             loff_t len,
    2840             :                             const int mode)
    2841             : {
    2842      190089 :         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
    2843      190089 :         struct extent_map *em;
    2844      190089 :         struct extent_changeset *data_reserved = NULL;
    2845      190089 :         int ret;
    2846      190089 :         u64 alloc_hint = 0;
    2847      190089 :         const u64 sectorsize = fs_info->sectorsize;
    2848      190089 :         u64 alloc_start = round_down(offset, sectorsize);
    2849      190089 :         u64 alloc_end = round_up(offset + len, sectorsize);
    2850      190089 :         u64 bytes_to_reserve = 0;
    2851      190089 :         bool space_reserved = false;
    2852             : 
    2853      190089 :         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
    2854             :                               alloc_end - alloc_start);
    2855      190089 :         if (IS_ERR(em)) {
    2856           0 :                 ret = PTR_ERR(em);
    2857           0 :                 goto out;
    2858             :         }
    2859             : 
    2860             :         /*
    2861             :          * Avoid hole punching and extent allocation for some cases. More cases
    2862             :          * could be considered, but these are unlikely to be common and we keep
    2863             :          * things as simple as possible for now. Also, intentionally, if the
    2864             :          * target range contains one or more prealloc extents together with
    2865             :          * regular extents and holes, we drop all the existing extents and
    2866             :          * allocate a new prealloc extent, so that we get a larger contiguous
    2867             :          * disk extent.
    2868      380178 :          */ if (em->start <= alloc_start &&
    2869      190089 :             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
    2870       30049 :                 const u64 em_end = em->start + em->len;
    2871             : 
    2872       30049 :                 if (em_end >= offset + len) {
    2873             :                         /*
    2874             :                          * The whole range is already a prealloc extent,
    2875             :                          * do nothing except updating the inode's i_size if
    2876             :                          * needed.
    2877             :                          */
    2878        7336 :                         free_extent_map(em);
    2879        7336 :                         ret = btrfs_fallocate_update_isize(inode, offset + len,
    2880             :                                                            mode);
    2881        7336 :                         goto out;
    2882             :                 }
    2883             :                 /*
    2884             :                  * Part of the range is already a prealloc extent, so operate
    2885             :                  * only on the remaining part of the range.
    2886             :                  */
    2887       22713 :                 alloc_start = em_end;
    2888       22713 :                 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
    2889       22713 :                 len = offset + len - alloc_start;
    2890       22713 :                 offset = alloc_start;
    2891       22713 :                 alloc_hint = em->block_start + em->len;
    2892             :         }
    2893      182753 :         free_extent_map(em);
    2894             : 
    2895      182753 :         if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
    2896      182753 :             BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { /* the range lies within a single block */
    2897        5370 :                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
    2898             :                                       sectorsize);
    2899        5370 :                 if (IS_ERR(em)) {
    2900           0 :                         ret = PTR_ERR(em);
    2901           0 :                         goto out;
    2902             :                 }
    2903             : 
    2904       10740 :                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
    2905         285 :                         free_extent_map(em);
    2906         285 :                         ret = btrfs_fallocate_update_isize(inode, offset + len,
    2907             :                                                            mode);
    2908         285 :                         goto out;
    2909             :                 }
    2910        5085 :                 if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
    2911        1791 :                         free_extent_map(em);
    2912        1791 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
    2913             :                                                    0);
    2914        1791 :                         if (!ret)
    2915        1791 :                                 ret = btrfs_fallocate_update_isize(inode,
    2916             :                                                                    offset + len,
    2917             :                                                                    mode);
    2918        1791 :                         return ret; /* nothing has been reserved at this point, so the out label is bypassed */
    2919             :                 }
    2920        3294 :                 free_extent_map(em);
    2921        3294 :                 alloc_start = round_down(offset, sectorsize);
    2922        3294 :                 alloc_end = alloc_start + sectorsize;
    2923        3294 :                 goto reserve_space;
    2924             :         }
    2925             : 
    2926      177383 :         alloc_start = round_up(offset, sectorsize);
    2927      177383 :         alloc_end = round_down(offset + len, sectorsize);
    2928             : 
    2929             :         /*
    2930             :          * For unaligned ranges, check the pages at the boundaries, they might
    2931             :          * map to an extent, in which case we need to partially zero them, or
    2932             :          * they might map to a hole, in which case we need our allocation range
    2933             :          * to cover them.
    2934             :          */
    2935      177383 :         if (!IS_ALIGNED(offset, sectorsize)) {
    2936      155759 :                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
    2937             :                                                             offset);
    2938      155759 :                 if (ret < 0)
    2939           0 :                         goto out;
    2940      155759 :                 if (ret == RANGE_BOUNDARY_HOLE) {
    2941       99721 :                         alloc_start = round_down(offset, sectorsize);
    2942       99721 :                         ret = 0;
    2943       56038 :                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
    2944       56038 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
    2945       56038 :                         if (ret)
    2946          10 :                                 goto out;
    2947             :                 } else {
    2948             :                         ret = 0;
    2949             :                 }
    2950             :         }
    2951             : 
    2952      177373 :         if (!IS_ALIGNED(offset + len, sectorsize)) {
    2953      174323 :                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
    2954             :                                                             offset + len);
    2955      174323 :                 if (ret < 0)
    2956           0 :                         goto out;
    2957      174323 :                 if (ret == RANGE_BOUNDARY_HOLE) {
    2958      105026 :                         alloc_end = round_up(offset + len, sectorsize);
    2959      105026 :                         ret = 0;
    2960       69297 :                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
    2961       47880 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
    2962             :                                                    0, 1);
    2963       47880 :                         if (ret)
    2964           9 :                                 goto out;
    2965             :                 } else {
    2966             :                         ret = 0;
    2967             :                 }
    2968             :         }
    2969             : 
    2970        3050 : reserve_space:
    2971      180658 :         if (alloc_start < alloc_end) {
    2972      178449 :                 struct extent_state *cached_state = NULL;
    2973      178449 :                 const u64 lockstart = alloc_start;
    2974      178449 :                 const u64 lockend = alloc_end - 1;
    2975             : 
    2976      178449 :                 bytes_to_reserve = alloc_end - alloc_start;
    2977      178449 :                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
    2978             :                                                       bytes_to_reserve);
    2979      178449 :                 if (ret < 0)
    2980         451 :                         goto out;
    2981      178073 :                 space_reserved = true;
    2982      178073 :                 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
    2983             :                                             &cached_state);
    2984      178073 :                 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
    2985             :                                                 alloc_start, bytes_to_reserve);
    2986      178073 :                 if (ret) {
    2987          71 :                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
    2988             :                                       lockend, &cached_state);
    2989          71 :                         goto out;
    2990             :                 }
    2991      178002 :                 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
    2992             :                                                 alloc_end - alloc_start,
    2993             :                                                 i_blocksize(inode),
    2994             :                                                 offset + len, &alloc_hint);
    2995      178002 :                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2996             :                               &cached_state);
    2997             :                 /* btrfs_prealloc_file_range releases reserved space on error */
    2998      178002 :                 if (ret) {
    2999           4 :                         space_reserved = false;
    3000           4 :                         goto out;
    3001             :                 }
    3002             :         }
    3003      180207 :         ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
    3004      188298 :  out:
    3005      188298 :         if (ret && space_reserved)
    3006          71 :                 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
    3007             :                                                alloc_start, bytes_to_reserve);
    3008      188298 :         extent_changeset_free(data_reserved);
    3009             : 
    3010      188298 :         return ret;
    3011             : }
    3012             : 
    3013     3103139 : static long btrfs_fallocate(struct file *file, int mode,
    3014             :                             loff_t offset, loff_t len)
    3015             : {
    3016     3103139 :         struct inode *inode = file_inode(file);
    3017     3103139 :         struct extent_state *cached_state = NULL;
    3018     3103139 :         struct extent_changeset *data_reserved = NULL;
    3019     3103139 :         struct falloc_range *range;
    3020     3103139 :         struct falloc_range *tmp;
    3021     3103139 :         struct list_head reserve_list;
    3022     3103139 :         u64 cur_offset;
    3023     3103139 :         u64 last_byte;
    3024     3103139 :         u64 alloc_start;
    3025     3103139 :         u64 alloc_end;
    3026     3103139 :         u64 alloc_hint = 0;
    3027     3103139 :         u64 locked_end;
    3028     3103139 :         u64 actual_end = 0;
    3029     3103139 :         u64 data_space_needed = 0;
    3030     3103139 :         u64 data_space_reserved = 0;
    3031     3103139 :         u64 qgroup_reserved = 0;
    3032     3103139 :         struct extent_map *em;
    3033     3103139 :         int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
    3034     3103139 :         int ret;
    3035             : 
    3036             :         /* Do not allow fallocate in ZONED mode */
    3037     3103139 :         if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
    3038             :                 return -EOPNOTSUPP;
    3039             : 
    3040     3103139 :         alloc_start = round_down(offset, blocksize);
    3041     3103139 :         alloc_end = round_up(offset + len, blocksize);
    3042     3103139 :         cur_offset = alloc_start;
    3043             : 
    3044             :         /* Make sure we aren't being give some crap mode */
    3045     3103139 :         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
    3046             :                      FALLOC_FL_ZERO_RANGE))
    3047             :                 return -EOPNOTSUPP;
    3048             : 
    3049     3092454 :         if (mode & FALLOC_FL_PUNCH_HOLE)
    3050     2389663 :                 return btrfs_punch_hole(file, offset, len);
    3051             : 
    3052      702791 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3053             : 
    3054      702790 :         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
    3055      360534 :                 ret = inode_newsize_ok(inode, offset + len);
    3056      360534 :                 if (ret)
    3057           1 :                         goto out;
    3058             :         }
    3059             : 
    3060      702789 :         ret = file_modified(file);
    3061      702791 :         if (ret)
    3062         170 :                 goto out;
    3063             : 
    3064             :         /*
    3065             :          * TODO: Move these two operations after we have checked
    3066             :          * accurate reserved space, or fallocate can still fail but
    3067             :          * with page truncated or size expanded.
    3068             :          *
    3069             :          * But that's a minor problem and won't do much harm BTW.
    3070             :          */
    3071      702621 :         if (alloc_start > inode->i_size) {
    3072       94823 :                 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
    3073             :                                         alloc_start);
    3074       94823 :                 if (ret)
    3075         171 :                         goto out;
    3076      607798 :         } else if (offset + len > inode->i_size) {
    3077             :                 /*
    3078             :                  * If we are fallocating from the end of the file onward we
    3079             :                  * need to zero out the end of the block if i_size lands in the
    3080             :                  * middle of a block.
    3081             :                  */
    3082      327459 :                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
    3083      327459 :                 if (ret)
    3084          81 :                         goto out;
    3085             :         }
    3086             : 
    3087             :         /*
    3088             :          * We have locked the inode at the VFS level (in exclusive mode) and we
    3089             :          * have locked the i_mmap_lock lock (in exclusive mode). Now before
    3090             :          * locking the file range, flush all delalloc in the range and wait for
    3091             :          * all ordered extents in the range to complete. After this we can lock
    3092             :          * the file range and, due to the previous locking we did, we know there
    3093             :          * can't be more delalloc or ordered extents in the range.
    3094             :          */
    3095      702369 :         ret = btrfs_wait_ordered_range(inode, alloc_start,
    3096             :                                        alloc_end - alloc_start);
    3097      702369 :         if (ret)
    3098           0 :                 goto out;
    3099             : 
    3100      702369 :         if (mode & FALLOC_FL_ZERO_RANGE) {
    3101      190089 :                 ret = btrfs_zero_range(inode, offset, len, mode);
    3102      190089 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3103      190088 :                 return ret;
    3104             :         }
    3105             : 
    3106      512280 :         locked_end = alloc_end - 1;
    3107      512280 :         lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
    3108             :                     &cached_state);
    3109             : 
    3110      512280 :         btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
    3111             : 
    3112             :         /* First, check if we exceed the qgroup limit */
    3113      512279 :         INIT_LIST_HEAD(&reserve_list);
    3114     1646422 :         while (cur_offset < alloc_end) {
    3115     1134299 :                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
    3116             :                                       alloc_end - cur_offset);
    3117     1134300 :                 if (IS_ERR(em)) {
    3118           0 :                         ret = PTR_ERR(em);
    3119           0 :                         break;
    3120             :                 }
    3121     1134300 :                 last_byte = min(extent_map_end(em), alloc_end);
    3122     1134300 :                 actual_end = min_t(u64, extent_map_end(em), offset + len);
    3123     1134300 :                 last_byte = ALIGN(last_byte, blocksize);
    3124     1134300 :                 if (em->block_start == EXTENT_MAP_HOLE ||
    3125      823645 :                     (cur_offset >= inode->i_size &&
    3126        5178 :                      !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
    3127      315833 :                         const u64 range_len = last_byte - cur_offset;
    3128             : 
    3129      315833 :                         ret = add_falloc_range(&reserve_list, cur_offset, range_len);
    3130      315833 :                         if (ret < 0) {
    3131           0 :                                 free_extent_map(em);
    3132           0 :                                 break;
    3133             :                         }
    3134      315833 :                         ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
    3135             :                                         &data_reserved, cur_offset, range_len);
    3136      315832 :                         if (ret < 0) {
    3137         157 :                                 free_extent_map(em);
    3138         157 :                                 break;
    3139             :                         }
    3140      315675 :                         qgroup_reserved += range_len;
    3141      315675 :                         data_space_needed += range_len;
    3142             :                 }
    3143     1134142 :                 free_extent_map(em);
    3144     1134142 :                 cur_offset = last_byte;
    3145             :         }
    3146             : 
    3147      512280 :         if (!ret && data_space_needed > 0) {
    3148             :                 /*
    3149             :                  * We are safe to reserve space here as we can't have delalloc
    3150             :                  * in the range, see above.
    3151             :                  */
    3152      204668 :                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
    3153             :                                                       data_space_needed);
    3154      204668 :                 if (!ret)
    3155      199439 :                         data_space_reserved = data_space_needed;
    3156             :         }
    3157             : 
    3158             :         /*
    3159             :          * If ret is still 0, means we're OK to fallocate.
    3160             :          * Or just cleanup the list and exit.
    3161             :          */
    3162      764514 :         list_for_each_entry_safe(range, tmp, &reserve_list, list) {
    3163      252236 :                 if (!ret) {
    3164      235984 :                         ret = btrfs_prealloc_file_range(inode, mode,
    3165             :                                         range->start,
    3166             :                                         range->len, i_blocksize(inode),
    3167             :                                         offset + len, &alloc_hint);
    3168             :                         /*
    3169             :                          * btrfs_prealloc_file_range() releases space even
    3170             :                          * if it returns an error.
    3171             :                          */
    3172      235984 :                         data_space_reserved -= range->len;
    3173      235984 :                         qgroup_reserved -= range->len;
    3174       16252 :                 } else if (data_space_reserved > 0) {
    3175           2 :                         btrfs_free_reserved_data_space(BTRFS_I(inode),
    3176             :                                                data_reserved, range->start,
    3177             :                                                range->len);
    3178           2 :                         data_space_reserved -= range->len;
    3179           2 :                         qgroup_reserved -= range->len;
    3180       16250 :                 } else if (qgroup_reserved > 0) {
    3181       16105 :                         btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
    3182             :                                                range->start, range->len);
    3183       16105 :                         qgroup_reserved -= range->len;
    3184             :                 }
    3185      252236 :                 list_del(&range->list);
    3186      252235 :                 kfree(range);
    3187             :         }
    3188      512278 :         if (ret < 0)
    3189        5391 :                 goto out_unlock;
    3190             : 
    3191             :         /*
    3192             :          * We didn't need to allocate any more space, but we still extended the
    3193             :          * size of the file so we need to update i_size and the inode item.
    3194             :          */
    3195      506887 :         ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
    3196      512278 : out_unlock:
    3197      512278 :         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
    3198             :                       &cached_state);
    3199      512703 : out:
    3200      512703 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3201      512702 :         extent_changeset_free(data_reserved);
    3202      512702 :         return ret;
    3203             : }
    3204             : 
    3205             : /*
    3206             :  * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
    3207             :  * that has unflushed and/or flushing delalloc. There might be other adjacent
    3208             :  * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
    3209             :  * looping while it gets adjacent subranges, and merging them together.
    3210             :  */
    3211      432280 : static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
    3212             :                                    struct extent_state **cached_state,
    3213             :                                    bool *search_io_tree,
    3214             :                                    u64 *delalloc_start_ret, u64 *delalloc_end_ret)
    3215             : {
    3216      432280 :         u64 len = end + 1 - start;
    3217      432280 :         u64 delalloc_len = 0;
    3218      432280 :         struct btrfs_ordered_extent *oe;
    3219      432280 :         u64 oe_start;
    3220      432280 :         u64 oe_end;
    3221             : 
    3222             :         /*
    3223             :          * Search the io tree first for EXTENT_DELALLOC. If we find any, it
    3224             :          * means we have delalloc (dirty pages) for which writeback has not
    3225             :          * started yet.
    3226             :          */
    3227      432280 :         if (*search_io_tree) {
    3228      432081 :                 spin_lock(&inode->lock);
    3229      432291 :                 if (inode->delalloc_bytes > 0) {
    3230       45033 :                         spin_unlock(&inode->lock);
    3231       45033 :                         *delalloc_start_ret = start;
    3232       45033 :                         delalloc_len = count_range_bits(&inode->io_tree,
    3233             :                                                         delalloc_start_ret, end,
    3234             :                                                         len, EXTENT_DELALLOC, 1,
    3235             :                                                         cached_state);
    3236             :                 } else {
    3237      387258 :                         spin_unlock(&inode->lock);
    3238             :                 }
    3239             :         }
    3240             : 
    3241      432291 :         if (delalloc_len > 0) {
    3242             :                 /*
    3243             :                  * If delalloc was found then *delalloc_start_ret has a sector size
    3244             :                  * aligned value (rounded down).
    3245             :                  */
    3246       20458 :                 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
    3247             : 
    3248       20458 :                 if (*delalloc_start_ret == start) {
    3249             :                         /* Delalloc for the whole range, nothing more to do. */
    3250       20059 :                         if (*delalloc_end_ret == end)
    3251             :                                 return true;
    3252             :                         /* Else trim our search range for ordered extents. */
    3253        1179 :                         start = *delalloc_end_ret + 1;
    3254        1179 :                         len = end + 1 - start;
    3255             :                 }
    3256             :         } else {
    3257             :                 /* No delalloc, future calls don't need to search again. */
    3258      411999 :                 *search_io_tree = false;
    3259             :         }
    3260             : 
    3261             :         /*
    3262             :          * Now also check if there's any ordered extent in the range.
    3263             :          * We do this because:
    3264             :          *
    3265             :          * 1) When delalloc is flushed, the file range is locked, we clear the
    3266             :          *    EXTENT_DELALLOC bit from the io tree and create an extent map and
    3267             :          *    an ordered extent for the write. So we might just have been called
    3268             :          *    after delalloc is flushed and before the ordered extent completes
    3269             :          *    and inserts the new file extent item in the subvolume's btree;
    3270             :          *
    3271             :          * 2) We may have an ordered extent created by flushing delalloc for a
    3272             :          *    subrange that starts before the subrange we found marked with
    3273             :          *    EXTENT_DELALLOC in the io tree.
    3274             :          *
    3275             :          * We could also use the extent map tree to find such delalloc that is
    3276             :          * being flushed, but using the ordered extents tree is more efficient
    3277             :          * because it's usually much smaller as ordered extents are removed from
    3278             :          * the tree once they complete. With the extent maps, we may have them
    3279             :          * in the extent map tree for a very long time, and they were either
    3280             :          * created by previous writes or loaded by read operations.
    3281             :          */
    3282      413577 :         oe = btrfs_lookup_first_ordered_range(inode, start, len);
    3283      413576 :         if (!oe)
    3284      413029 :                 return (delalloc_len > 0);
    3285             : 
    3286             :         /* The ordered extent may span beyond our search range. */
    3287         547 :         oe_start = max(oe->file_offset, start);
    3288         547 :         oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
    3289             : 
    3290         547 :         btrfs_put_ordered_extent(oe);
    3291             : 
    3292             :         /* Don't have unflushed delalloc, return the ordered extent range. */
    3293         547 :         if (delalloc_len == 0) {
    3294         437 :                 *delalloc_start_ret = oe_start;
    3295         437 :                 *delalloc_end_ret = oe_end;
    3296         437 :                 return true;
    3297             :         }
    3298             : 
    3299             :         /*
    3300             :          * We have both unflushed delalloc (io_tree) and an ordered extent.
    3301             :          * If the ranges are adjacent return a combined range, otherwise
    3302             :          * return the leftmost range.
    3303             :          */
    3304         110 :         if (oe_start < *delalloc_start_ret) {
    3305         106 :                 if (oe_end < *delalloc_start_ret)
    3306         106 :                         *delalloc_end_ret = oe_end;
    3307         106 :                 *delalloc_start_ret = oe_start;
    3308           4 :         } else if (*delalloc_end_ret + 1 == oe_start) {
    3309           0 :                 *delalloc_end_ret = oe_end;
    3310             :         }
    3311             : 
    3312             :         return true;
    3313             : }
    3314             : 
    3315             : /*
    3316             :  * Check if there's delalloc in a given range.
    3317             :  *
    3318             :  * @inode:               The inode.
    3319             :  * @start:               The start offset of the range. It does not need to be
    3320             :  *                       sector size aligned.
    3321             :  * @end:                 The end offset (inclusive value) of the search range.
    3322             :  *                       It does not need to be sector size aligned.
    3323             :  * @cached_state:        Extent state record used for speeding up delalloc
    3324             :  *                       searches in the inode's io_tree. Can be NULL.
    3325             :  * @delalloc_start_ret:  Output argument, set to the start offset of the
    3326             :  *                       subrange found with delalloc (may not be sector size
    3327             :  *                       aligned).
    3328             :  * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
    3329             :  *                       of the subrange found with delalloc.
    3330             :  *
    3331             :  * Returns true if a subrange with delalloc is found within the given range, and
    3332             :  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
    3333             :  * end offsets of the subrange.
    3334             :  */
    3335      430755 : bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
    3336             :                                   struct extent_state **cached_state,
    3337             :                                   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
    3338             : {
    3339      430755 :         u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
    3340      430755 :         u64 prev_delalloc_end = 0;
    3341      430755 :         bool search_io_tree = true;
    3342      430755 :         bool ret = false;
    3343             : 
    3344      451561 :         while (cur_offset <= end) {
    3345      432399 :                 u64 delalloc_start;
    3346      432399 :                 u64 delalloc_end;
    3347      432399 :                 bool delalloc;
    3348             : 
    3349      432399 :                 delalloc = find_delalloc_subrange(inode, cur_offset, end,
    3350             :                                                   cached_state, &search_io_tree,
    3351             :                                                   &delalloc_start,
    3352             :                                                   &delalloc_end);
    3353      432456 :                 if (!delalloc)
    3354             :                         break;
    3355             : 
    3356       20895 :                 if (prev_delalloc_end == 0) {
    3357             :                         /* First subrange found, clamp its start to the (possibly unaligned) @start. */
    3358       20615 :                         *delalloc_start_ret = max(delalloc_start, start);
    3359       20615 :                         *delalloc_end_ret = delalloc_end;
    3360       20615 :                         ret = true;
    3361         280 :                 } else if (delalloc_start == prev_delalloc_end + 1) {
    3362             :                         /* Subrange adjacent to the previous one, merge them. */
    3363         191 :                         *delalloc_end_ret = delalloc_end;
    3364             :                 } else {
    3365             :                         /* Subrange not adjacent to the previous one, exit. */
    3366             :                         break;
    3367             :                 }
    3368             : 
    3369       20806 :                 prev_delalloc_end = delalloc_end;
    3370       20806 :                 cur_offset = delalloc_end + 1;
    3371       20806 :                 cond_resched();
    3372             :         }
    3373             : 
    3374      430812 :         return ret;
    3375             : }
    3376             : 
    3377             : /*
    3378             :  * Check if there's a hole or delalloc range in a range representing a hole (or
    3379             :  * prealloc extent) found in the inode's subvolume btree.
    3380             :  *
    3381             :  * @inode:      The inode.
    3382             :  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
    3383             :  * @start:      Start offset of the hole region. It does not need to be sector
    3384             :  *              size aligned.
    3385             :  * @end:        End offset (inclusive value) of the hole region. It does not
    3386             :  *              need to be sector size aligned.
    3387             :  * @start_ret:  Return parameter, used to set the start of the subrange in the
    3388             :  *              hole that matches the search criteria (seek mode), if such
    3389             :  *              subrange is found (return value of the function is true).
    3390             :  *              The value returned here may not be sector size aligned.
    3391             :  *
    3392             :  * Returns true if a subrange matching the given seek mode is found, and if one
    3393             :  * is found, it updates @start_ret with the start of the subrange.
    3394             :  */
    3395         500 : static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
    3396             :                                         struct extent_state **cached_state,
    3397             :                                         u64 start, u64 end, u64 *start_ret)
    3398             : {
    3399         500 :         u64 delalloc_start;
    3400         500 :         u64 delalloc_end;
    3401         500 :         bool delalloc;
    3402             : 
    3403         500 :         delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
    3404             :                                                 &delalloc_start, &delalloc_end);
    3405         500 :         if (delalloc && whence == SEEK_DATA) {
    3406         315 :                 *start_ret = delalloc_start;
    3407         315 :                 return true;
    3408             :         }
    3409             : 
    3410         185 :         if (delalloc && whence == SEEK_HOLE) {
    3411             :                 /*
    3412             :                  * We found delalloc but it starts after our start offset. So we
    3413             :                  * have a hole between our start offset and the delalloc start.
    3414             :                  */
    3415         106 :                 if (start < delalloc_start) {
    3416          15 :                         *start_ret = start;
    3417          15 :                         return true;
    3418             :                 }
    3419             :                 /*
    3420             :                  * Delalloc range starts at our start offset.
    3421             :                  * If the delalloc range's length is smaller than our range,
    3422             :                  * then it means we have a hole that starts where the delalloc
    3423             :                  * subrange ends.
    3424             :                  */
    3425          91 :                 if (delalloc_end < end) {
    3426          72 :                         *start_ret = delalloc_end + 1;
    3427          72 :                         return true;
    3428             :                 }
    3429             : 
    3430             :                 /* There's delalloc for the whole range. */
    3431             :                 return false;
    3432             :         }
    3433             : 
    3434          79 :         if (!delalloc && whence == SEEK_HOLE) {
    3435           6 :                 *start_ret = start;
    3436           6 :                 return true;
    3437             :         }
    3438             : 
    3439             :         /*
    3440             :          * No delalloc in the range and we are seeking for data. The caller has
    3441             :          * to iterate to the next extent item in the subvolume btree.
    3442             :          */
    3443             :         return false;
    3444             : }
    3445             : 
    3446         474 : static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
    3447             : {
    3448         474 :         struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
    3449         474 :         struct btrfs_file_private *private = file->private_data;
    3450         474 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    3451         474 :         struct extent_state *cached_state = NULL;
    3452         474 :         struct extent_state **delalloc_cached_state;
    3453         474 :         const loff_t i_size = i_size_read(&inode->vfs_inode);
    3454         474 :         const u64 ino = btrfs_ino(inode);
    3455         474 :         struct btrfs_root *root = inode->root;
    3456         474 :         struct btrfs_path *path;
    3457         474 :         struct btrfs_key key;
    3458         474 :         u64 last_extent_end;
    3459         474 :         u64 lockstart;
    3460         474 :         u64 lockend;
    3461         474 :         u64 start;
    3462         474 :         int ret;
    3463         474 :         bool found = false;
    3464             : 
    3465         474 :         if (i_size == 0 || offset >= i_size)
    3466             :                 return -ENXIO;
    3467             : 
    3468             :         /*
    3469             :          * Quick path. If the inode has no prealloc extents and its number of
    3470             :          * bytes used matches its i_size, then it can not have holes.
    3471             :          */
    3472         447 :         if (whence == SEEK_HOLE &&
    3473         185 :             !(inode->flags & BTRFS_INODE_PREALLOC) &&
    3474          76 :             inode_get_bytes(&inode->vfs_inode) == i_size)
    3475             :                 return i_size;
    3476             : 
    3477         447 :         if (!private) {
    3478          39 :                 private = kzalloc(sizeof(*private), GFP_KERNEL);
    3479             :                 /*
    3480             :                  * No worries if memory allocation failed.
    3481             :                  * The private structure is used only for speeding up multiple
    3482             :                  * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
    3483             :                  * so everything will still be correct.
    3484             :                  */
    3485          39 :                 file->private_data = private;
    3486             :         }
    3487             : 
    3488         447 :         if (private)
    3489         447 :                 delalloc_cached_state = &private->llseek_cached_state;
    3490             :         else
    3491             :                 delalloc_cached_state = NULL;
    3492             : 
    3493             :         /*
    3494             :          * offset can be negative, in this case we start finding DATA/HOLE from
    3495             :          * the very start of the file.
    3496             :          */
    3497         447 :         start = max_t(loff_t, 0, offset);
    3498             : 
    3499         447 :         lockstart = round_down(start, fs_info->sectorsize);
    3500         447 :         lockend = round_up(i_size, fs_info->sectorsize);
    3501         447 :         if (lockend <= lockstart)
    3502           0 :                 lockend = lockstart + fs_info->sectorsize;
    3503         447 :         lockend--;
    3504             : 
    3505         447 :         path = btrfs_alloc_path();
    3506         447 :         if (!path)
    3507             :                 return -ENOMEM;
    3508         447 :         path->reada = READA_FORWARD;
    3509             : 
    3510         447 :         key.objectid = ino;
    3511         447 :         key.type = BTRFS_EXTENT_DATA_KEY;
    3512         447 :         key.offset = start;
    3513             : 
    3514         447 :         last_extent_end = lockstart;
    3515             : 
    3516         447 :         lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    3517             : 
    3518         447 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    3519         447 :         if (ret < 0) {
    3520           0 :                 goto out;
    3521         447 :         } else if (ret > 0 && path->slots[0] > 0) {
    3522         397 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
    3523         397 :                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
    3524          33 :                         path->slots[0]--;
    3525             :         }
    3526             : 
    3527         501 :         while (start < i_size) {
    3528         484 :                 struct extent_buffer *leaf = path->nodes[0];
    3529         484 :                 struct btrfs_file_extent_item *extent;
    3530         484 :                 u64 extent_end;
    3531         484 :                 u8 type;
    3532             : 
    3533         484 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    3534         317 :                         ret = btrfs_next_leaf(root, path);
    3535         317 :                         if (ret < 0)
    3536           0 :                                 goto out;
    3537         317 :                         else if (ret > 0)
    3538             :                                 break;
    3539             : 
    3540           0 :                         leaf = path->nodes[0];
    3541             :                 }
    3542             : 
    3543         167 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    3544         167 :                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
    3545             :                         break;
    3546             : 
    3547         122 :                 extent_end = btrfs_file_extent_end(path);
    3548             : 
    3549             :                 /*
    3550             :                  * In the first iteration we may have a slot that points to an
    3551             :                  * extent that ends before our start offset, so skip it.
    3552             :                  */
    3553         122 :                 if (extent_end <= start) {
    3554           1 :                         path->slots[0]++;
    3555           1 :                         continue;
    3556             :                 }
    3557             : 
    3558             :                 /* We have an implicit hole, NO_HOLES feature is likely set. */
    3559         121 :                 if (last_extent_end < key.offset) {
    3560          21 :                         u64 search_start = last_extent_end;
    3561          21 :                         u64 found_start;
    3562             : 
    3563             :                         /*
    3564             :                          * First iteration, @start matches @offset and it's
    3565             :                          * within the hole.
    3566             :                          */
    3567          21 :                         if (start == offset)
    3568           6 :                                 search_start = offset;
    3569             : 
    3570          21 :                         found = find_desired_extent_in_hole(inode, whence,
    3571             :                                                             delalloc_cached_state,
    3572             :                                                             search_start,
    3573             :                                                             key.offset - 1,
    3574             :                                                             &found_start);
    3575          21 :                         if (found) {
    3576           2 :                                 start = found_start;
    3577           2 :                                 break;
    3578             :                         }
    3579             :                         /*
    3580             :                          * Didn't find data or a hole (due to delalloc) in the
    3581             :                          * implicit hole range, so need to analyze the extent.
    3582             :                          */
    3583             :                 }
    3584             : 
    3585         119 :                 extent = btrfs_item_ptr(leaf, path->slots[0],
    3586             :                                         struct btrfs_file_extent_item);
    3587         119 :                 type = btrfs_file_extent_type(leaf, extent);
    3588             : 
    3589             :                 /*
    3590             :                  * Can't access the extent's disk_bytenr field if this is an
    3591             :                  * inline extent, since at that offset, it's where the extent
    3592             :                  * data starts.
    3593             :                  */
    3594         119 :                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
    3595           1 :                     (type == BTRFS_FILE_EXTENT_REG &&
    3596          53 :                      btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
    3597             :                         /*
    3598             :                          * Explicit hole or prealloc extent, search for delalloc.
    3599             :                          * A prealloc extent is treated like a hole.
    3600             :                          */
    3601         117 :                         u64 search_start = key.offset;
    3602         117 :                         u64 found_start;
    3603             : 
    3604             :                         /*
    3605             :                          * First iteration, @start matches @offset and it's
    3606             :                          * within the hole.
    3607             :                          */
    3608         117 :                         if (start == offset)
    3609          84 :                                 search_start = offset;
    3610             : 
    3611         117 :                         found = find_desired_extent_in_hole(inode, whence,
    3612             :                                                             delalloc_cached_state,
    3613             :                                                             search_start,
    3614             :                                                             extent_end - 1,
    3615             :                                                             &found_start);
    3616         117 :                         if (found) {
    3617          64 :                                 start = found_start;
    3618          64 :                                 break;
    3619             :                         }
    3620             :                         /*
    3621             :                          * Didn't find data or a hole (due to delalloc) in the
    3622             :                          * explicit hole or prealloc extent range, so need to
    3623             :                          * analyze the next extent item.
    3624             :                          */
    3625             :                 } else {
    3626             :                         /*
    3627             :                          * Found a regular or inline extent.
    3628             :                          * If we are seeking for data, adjust the start offset
    3629             :                          * and stop, we're done.
    3630             :                          */
    3631           2 :                         if (whence == SEEK_DATA) {
    3632           2 :                                 start = max_t(u64, key.offset, offset);
    3633           2 :                                 found = true;
    3634           2 :                                 break;
    3635             :                         }
    3636             :                         /*
    3637             :                          * Else, we are seeking for a hole, check the next file
    3638             :                          * extent item.
    3639             :                          */
    3640             :                 }
    3641             : 
    3642          53 :                 start = extent_end;
    3643          53 :                 last_extent_end = extent_end;
    3644          53 :                 path->slots[0]++;
    3645          53 :                 if (fatal_signal_pending(current)) {
    3646           0 :                         ret = -EINTR;
    3647           0 :                         goto out;
    3648             :                 }
    3649          53 :                 cond_resched();
    3650             :         }
    3651             : 
    3652             :         /* We have an implicit hole from the last extent found up to i_size. */
    3653         447 :         if (!found && start < i_size) {
    3654         362 :                 found = find_desired_extent_in_hole(inode, whence,
    3655             :                                                     delalloc_cached_state, start,
    3656             :                                                     i_size - 1, &start);
    3657         362 :                 if (!found)
    3658          20 :                         start = i_size;
    3659             :         }
    3660             : 
    3661         427 : out:
    3662         447 :         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    3663         447 :         btrfs_free_path(path);
    3664             : 
    3665         447 :         if (ret < 0)
    3666           0 :                 return ret;
    3667             : 
    3668         447 :         if (whence == SEEK_DATA && start >= i_size)
    3669             :                 return -ENXIO;
    3670             : 
    3671         426 :         return min_t(loff_t, start, i_size);
    3672             : }
    3673             : 
    3674     4710924 : static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
    3675             : {
    3676     4710924 :         struct inode *inode = file->f_mapping->host;
    3677             : 
    3678     4710924 :         switch (whence) {
    3679     4710450 :         default:
    3680     4710450 :                 return generic_file_llseek(file, offset, whence);
    3681             :         case SEEK_DATA:
    3682             :         case SEEK_HOLE:
    3683         474 :                 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3684         474 :                 offset = find_desired_extent(file, offset, whence);
    3685         474 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3686         474 :                 break;
    3687             :         }
    3688             : 
    3689         474 :         if (offset < 0)
    3690             :                 return offset;
    3691             : 
    3692         426 :         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    3693             : }
    3694             : 
    3695     7452354 : static int btrfs_file_open(struct inode *inode, struct file *filp)
    3696             : {
    3697     7452354 :         int ret;
    3698             : 
    3699     7452354 :         filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
    3700             :                         FMODE_CAN_ODIRECT;
    3701             : 
    3702     7452354 :         ret = fsverity_file_open(inode, filp);
    3703     7452354 :         if (ret)
    3704             :                 return ret;
    3705     7452354 :         return generic_file_open(inode, filp);
    3706             : }
    3707             : 
    3708     3966091 : static int check_direct_read(struct btrfs_fs_info *fs_info,
    3709             :                              const struct iov_iter *iter, loff_t offset)
    3710             : {
    3711     3966091 :         int ret;
    3712     3966091 :         int i, seg;
    3713             : 
    3714     3966091 :         ret = check_direct_IO(fs_info, iter, offset);
    3715     3966588 :         if (ret < 0)
    3716             :                 return ret;
    3717             : 
    3718     1324722 :         if (!iter_is_iovec(iter))
    3719             :                 return 0;
    3720             : 
    3721           0 :         for (seg = 0; seg < iter->nr_segs; seg++) {
    3722           0 :                 for (i = seg + 1; i < iter->nr_segs; i++) {
    3723           0 :                         const struct iovec *iov1 = iter_iov(iter) + seg;
    3724           0 :                         const struct iovec *iov2 = iter_iov(iter) + i;
    3725             : 
    3726           0 :                         if (iov1->iov_base == iov2->iov_base)
    3727             :                                 return -EINVAL;
    3728             :                 }
    3729             :         }
    3730             :         return 0;
    3731             : }
    3732             : 
    3733     3966387 : static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
    3734             : {
    3735     3966387 :         struct inode *inode = file_inode(iocb->ki_filp);
    3736     3966387 :         size_t prev_left = 0;
    3737     3966387 :         ssize_t read = 0;
    3738     3966387 :         ssize_t ret;
    3739             : 
    3740     3966387 :         if (fsverity_active(inode))
    3741             :                 return 0;
    3742             : 
    3743     3966387 :         if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
    3744             :                 return 0;
    3745             : 
    3746     1324731 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3747     1327093 : again:
    3748             :         /*
    3749             :          * This is similar to what we do for direct IO writes, see the comment
    3750             :          * at btrfs_direct_write(), but we also disable page faults in addition
    3751             :          * to disabling them only at the iov_iter level. This is because when
    3752             :          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
    3753             :          * which can still trigger page fault ins despite having set ->nofault
    3754             :          * to true on our 'to' iov_iter.
    3755             :          *
    3756             :          * The difference to direct IO writes is that we deadlock when trying
    3757             :          * to lock the extent range in the inode's tree during the page reads
    3758             :          * triggered by the fault in (while for writes it is due to waiting for
    3759             :          * our own ordered extent). This is because for direct IO reads,
    3760             :          * btrfs_dio_iomap_begin() returns with the extent range locked, which
    3761             :          * is only unlocked in the endio callback (end_bio_extent_readpage()).
    3762             :          */
    3763     1327093 :         pagefault_disable();
    3764     1327051 :         to->nofault = true;
    3765     1327051 :         ret = btrfs_dio_read(iocb, to, read);
    3766     1327740 :         to->nofault = false;
    3767     1327740 :         pagefault_enable();
    3768             : 
    3769             :         /* No increment (+=) because iomap returns a cumulative value. */
    3770     1327714 :         if (ret > 0)
    3771      184274 :                 read = ret;
    3772             : 
    3773     1327714 :         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
    3774        2377 :                 const size_t left = iov_iter_count(to);
    3775             : 
    3776        2377 :                 if (left == prev_left) {
    3777             :                         /*
    3778             :                          * We didn't make any progress since the last attempt,
    3779             :                          * fallback to a buffered read for the remainder of the
    3780             :                          * range. This is just to avoid any possibility of looping
    3781             :                          * for too long.
    3782             :                          */
    3783             :                         ret = read;
    3784             :                 } else {
    3785             :                         /*
    3786             :                          * We made some progress since the last retry or this is
    3787             :                          * the first time we are retrying. Fault in as many pages
    3788             :                          * as possible and retry.
    3789             :                          */
    3790        2377 :                         fault_in_iov_iter_writeable(to, left);
    3791        2368 :                         prev_left = left;
    3792        2368 :                         goto again;
    3793             :                 }
    3794             :         }
    3795     1325337 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3796     1325367 :         return ret < 0 ? ret : read;
    3797             : }
    3798             : 
    3799    26186621 : static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
    3800             : {
    3801    26186621 :         ssize_t ret = 0;
    3802             : 
    3803    26186621 :         if (iocb->ki_flags & IOCB_DIRECT) {
    3804     3966839 :                 ret = btrfs_direct_read(iocb, to);
    3805     3967313 :                 if (ret < 0 || !iov_iter_count(to) ||
    3806     3650036 :                     iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
    3807             :                         return ret;
    3808             :         }
    3809             : 
    3810    22925340 :         return filemap_read(iocb, to, ret);
    3811             : }
    3812             : 
    3813             : const struct file_operations btrfs_file_operations = {
    3814             :         .llseek         = btrfs_file_llseek,
    3815             :         .read_iter      = btrfs_file_read_iter,
    3816             :         .splice_read    = filemap_splice_read,
    3817             :         .write_iter     = btrfs_file_write_iter,
    3818             :         .splice_write   = iter_file_splice_write,
    3819             :         .mmap           = btrfs_file_mmap,
    3820             :         .open           = btrfs_file_open,
    3821             :         .release        = btrfs_release_file,
    3822             :         .get_unmapped_area = thp_get_unmapped_area,
    3823             :         .fsync          = btrfs_sync_file,
    3824             :         .fallocate      = btrfs_fallocate,
    3825             :         .unlocked_ioctl = btrfs_ioctl,
    3826             : #ifdef CONFIG_COMPAT
    3827             :         .compat_ioctl   = btrfs_compat_ioctl,
    3828             : #endif
    3829             :         .remap_file_range = btrfs_remap_file_range,
    3830             : };
    3831             : 
    3832     9490531 : int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
    3833             : {
    3834     9490531 :         int ret;
    3835             : 
    3836             :         /*
    3837             :          * So with compression we will find and lock a dirty page and clear the
    3838             :          * first one as dirty, setup an async extent, and immediately return
    3839             :          * with the entire range locked but with nobody actually marked with
    3840             :          * writeback.  So we can't just filemap_write_and_wait_range() and
    3841             :          * expect it to work since it will just kick off a thread to do the
    3842             :          * actual work.  So we need to call filemap_fdatawrite_range _again_
    3843             :          * since it will wait on the page lock, which won't be unlocked until
    3844             :          * after the pages have been marked as writeback and so we're good to go
    3845             :          * from there.  We have to do this otherwise we'll miss the ordered
    3846             :          * extents and that results in badness.  Please Josef, do not think you
    3847             :          * know better and pull this out at some point in the future, it is
    3848             :          * right and you are wrong.
    3849             :          */
    3850     9490531 :         ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
    3851     9491132 :         if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
    3852             :                              &BTRFS_I(inode)->runtime_flags))
    3853       24912 :                 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
    3854             : 
    3855     9491132 :         return ret;
    3856             : }

Generated by: LCOV version 1.14