LCOV - code coverage report
Current view: top level - fs/btrfs - file.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwa @ Mon Jul 31 20:08:17 PDT 2023 Lines: 0 1834 0.0 %
Date: 2023-07-31 20:08:17 Functions: 0 46 0.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2007 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/fs.h>
       7             : #include <linux/pagemap.h>
       8             : #include <linux/time.h>
       9             : #include <linux/init.h>
      10             : #include <linux/string.h>
      11             : #include <linux/backing-dev.h>
      12             : #include <linux/falloc.h>
      13             : #include <linux/writeback.h>
      14             : #include <linux/compat.h>
      15             : #include <linux/slab.h>
      16             : #include <linux/btrfs.h>
      17             : #include <linux/uio.h>
      18             : #include <linux/iversion.h>
      19             : #include <linux/fsverity.h>
      20             : #include "ctree.h"
      21             : #include "disk-io.h"
      22             : #include "transaction.h"
      23             : #include "btrfs_inode.h"
      24             : #include "print-tree.h"
      25             : #include "tree-log.h"
      26             : #include "locking.h"
      27             : #include "volumes.h"
      28             : #include "qgroup.h"
      29             : #include "compression.h"
      30             : #include "delalloc-space.h"
      31             : #include "reflink.h"
      32             : #include "subpage.h"
      33             : #include "fs.h"
      34             : #include "accessors.h"
      35             : #include "extent-tree.h"
      36             : #include "file-item.h"
      37             : #include "ioctl.h"
      38             : #include "file.h"
      39             : #include "super.h"
      40             : 
      41             : /* Copy up to write_bytes bytes from iov_iter i into prepared_pages and return
      42             :  * the total copied.  This should be replaced with calls into generic code.
      43             :  */
      44           0 : static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
      45             :                                          struct page **prepared_pages,
      46             :                                          struct iov_iter *i)
      47             : {
      48           0 :         size_t copied = 0;
      49           0 :         size_t total_copied = 0;
      50           0 :         int pg = 0;
      51           0 :         int offset = offset_in_page(pos);
      52             : 
      53           0 :         while (write_bytes > 0) {
      54           0 :                 size_t count = min_t(size_t,
      55             :                                      PAGE_SIZE - offset, write_bytes);
      56           0 :                 struct page *page = prepared_pages[pg];
      57             :                 /*
      58             :                  * Copy data from userspace into the current page (may be short)
      59             :                  */
      60           0 :                 copied = copy_page_from_iter_atomic(page, offset, count, i);
      61             : 
      62             :                 /* Flush processor's dcache for this page */
      63           0 :                 flush_dcache_page(page);
      64             : 
      65             :                 /*
      66             :                  * If we get a partial write, we can end up with
      67             :                  * partially up to date pages.  These add
      68             :                  * a lot of complexity, so make sure they don't
      69             :                  * happen: for a page that is not uptodate, undo the
      70             :                  * short copy (revert the iterator) and stop here.
      71             :                  * NOTE(review): the btrfs_file_write code presumably falls
      72             :                  * back to page at a time copies on a short return; confirm.
      73             :                  */
      74           0 :                 if (unlikely(copied < count)) {
      75           0 :                         if (!PageUptodate(page)) {
      76           0 :                                 iov_iter_revert(i, copied);
      77           0 :                                 copied = 0;
      78             :                         }
      79           0 :                         if (!copied)
      80             :                                 break;
      81             :                 }
      82             : 
      83           0 :                 write_bytes -= copied;
      84           0 :                 total_copied += copied;
      85           0 :                 offset += copied;
      86           0 :                 if (offset == PAGE_SIZE) {
      87           0 :                         pg++;
      88           0 :                         offset = 0;
      89             :                 }
      90             :         }
      91           0 :         return total_copied;
      92             : }
      93             : 
      94             : /*
      95             :  * Clear checked on the sector-rounded [pos, pos + copied) range, then unlock and put each page
      96             :  */
      97           0 : static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
      98             :                              struct page **pages, size_t num_pages,
      99             :                              u64 pos, u64 copied)
     100             : {
     101           0 :         size_t i;
     102           0 :         u64 block_start = round_down(pos, fs_info->sectorsize);
     103           0 :         u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
     104             : 
     105           0 :         ASSERT(block_len <= U32_MAX);
     106           0 :         for (i = 0; i < num_pages; i++) {
     107             :                 /* page checked is some magic around finding pages that
     108             :                  * have been modified without going through btrfs_set_page_dirty;
     109             :                  * clear it here. There should be no need to mark the pages
     110             :                  * accessed, as prepare_pages should already have marked them
     111             :                  * accessed via find_or_create_page()
     112             :                  */
     113           0 :                 btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
     114             :                                                block_len);
     115           0 :                 unlock_page(pages[i]);
     116           0 :                 put_page(pages[i]);
     117             :         }
     118           0 : }
     119             : 
     120             : /*
     121             :  * After btrfs_copy_from_user(), update the following things for delalloc:
     122             :  * - Mark the sector-aligned range as DELALLOC in the io tree (plus
     123             :  *   EXTENT_NORESERVE if noreserve); advises which range is to be written back.
     124             :  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
     125             :  * - Update i_size for past EOF write.  Returns 0 or btrfs_set_extent_delalloc()'s error.
     126             :  */
     127           0 : int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
     128             :                       size_t num_pages, loff_t pos, size_t write_bytes,
     129             :                       struct extent_state **cached, bool noreserve)
     130             : {
     131           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     132           0 :         int err = 0;
     133           0 :         int i;
     134           0 :         u64 num_bytes;
     135           0 :         u64 start_pos;
     136           0 :         u64 end_of_last_block;
     137           0 :         u64 end_pos = pos + write_bytes;
     138           0 :         loff_t isize = i_size_read(&inode->vfs_inode);
     139           0 :         unsigned int extra_bits = 0;
     140             : 
     141           0 :         if (write_bytes == 0)
     142             :                 return 0;
     143             : 
     144           0 :         if (noreserve)
     145           0 :                 extra_bits |= EXTENT_NORESERVE;
     146             : 
     147           0 :         start_pos = round_down(pos, fs_info->sectorsize);
     148           0 :         num_bytes = round_up(write_bytes + pos - start_pos,
     149             :                              fs_info->sectorsize);
     150           0 :         ASSERT(num_bytes <= U32_MAX);
     151             : 
     152           0 :         end_of_last_block = start_pos + num_bytes - 1;
     153             : 
     154             :         /*
     155             :          * The pages may have already been dirty; clear out the old accounting
     156             :          * bits on the range so we can set things up properly below.
     157             :          */
     158           0 :         clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
     159             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
     160             :                          cached);
     161             : 
     162           0 :         err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
     163             :                                         extra_bits, cached);
     164           0 :         if (err)
     165             :                 return err;
     166             : 
     167           0 :         for (i = 0; i < num_pages; i++) {
     168           0 :                 struct page *p = pages[i];
     169             : 
     170           0 :                 btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
     171           0 :                 btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
     172           0 :                 btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
     173             :         }
     174             : 
     175             :         /*
     176             :          * We've only changed i_size in RAM, and we haven't updated
     177             :          * the disk i_size.  There is no need to log the inode
     178             :          * at this time.
     179             :          */
     180           0 :         if (end_pos > isize)
     181           0 :                 i_size_write(&inode->vfs_inode, end_pos);
     182             :         return 0;
     183             : }
     184             : 
     185             : /*
     186             :  * This is very complex, but the basic idea is to drop all extents
     187             :  * in [args->start, args->end).  hint_block is filled in with a block number
     188             :  * that would be a good hint to the block allocator for this file.
     189             :  *
     190             :  * If an extent intersects the range but is not entirely inside the range
     191             :  * it is either truncated or split.  Anything entirely inside the range
     192             :  * is deleted from the tree.
     193             :  *
     194             :  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
     195             :  * to deal with that. We set the field 'bytes_found' of the arguments structure
     196             :  * with the number of allocated bytes found in the target range, so that the
     197             :  * caller can update the inode's number of bytes in an atomic way when
     198             :  * replacing extents in a range to avoid races with stat(2).
     199             :  */
     200           0 : int btrfs_drop_extents(struct btrfs_trans_handle *trans,
     201             :                        struct btrfs_root *root, struct btrfs_inode *inode,
     202             :                        struct btrfs_drop_extents_args *args)
     203             : {
     204           0 :         struct btrfs_fs_info *fs_info = root->fs_info;
     205           0 :         struct extent_buffer *leaf;
     206           0 :         struct btrfs_file_extent_item *fi;
     207           0 :         struct btrfs_ref ref = { 0 };
     208           0 :         struct btrfs_key key;
     209           0 :         struct btrfs_key new_key;
     210           0 :         u64 ino = btrfs_ino(inode);
     211           0 :         u64 search_start = args->start;
     212           0 :         u64 disk_bytenr = 0;
     213           0 :         u64 num_bytes = 0;
     214           0 :         u64 extent_offset = 0;
     215           0 :         u64 extent_end = 0;
     216           0 :         u64 last_end = args->start;
     217           0 :         int del_nr = 0;
     218           0 :         int del_slot = 0;
     219           0 :         int extent_type;
     220           0 :         int recow;
     221           0 :         int ret;
     222           0 :         int modify_tree = -1;
     223           0 :         int update_refs;
     224           0 :         int found = 0;
     225           0 :         struct btrfs_path *path = args->path;
     226             : 
     227           0 :         args->bytes_found = 0;
     228           0 :         args->extent_inserted = false;
     229             : 
     230             :         /* Must always have a path if ->replace_extent is true */
     231           0 :         ASSERT(!(args->replace_extent && !args->path));
     232             : 
     233           0 :         if (!path) {
     234           0 :                 path = btrfs_alloc_path();
     235           0 :                 if (!path) {
     236           0 :                         ret = -ENOMEM;
     237           0 :                         goto out;
     238             :                 }
     239             :         }
     240             : 
     241           0 :         if (args->drop_cache)
     242           0 :                 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
     243             : 
     244           0 :         if (args->start >= inode->disk_i_size && !args->replace_extent)
     245           0 :                 modify_tree = 0;
     246             : 
     247           0 :         update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
     248           0 :         while (1) {
     249           0 :                 recow = 0;
     250           0 :                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
     251             :                                                search_start, modify_tree);
     252           0 :                 if (ret < 0)
     253             :                         break;
     254           0 :                 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
     255           0 :                         leaf = path->nodes[0];
     256           0 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
     257           0 :                         if (key.objectid == ino &&
     258           0 :                             key.type == BTRFS_EXTENT_DATA_KEY)
     259           0 :                                 path->slots[0]--;
     260             :                 }
     261             :                 ret = 0;
     262             : next_slot:
     263           0 :                 leaf = path->nodes[0];
     264           0 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
     265           0 :                         BUG_ON(del_nr > 0);
     266           0 :                         ret = btrfs_next_leaf(root, path);
     267           0 :                         if (ret < 0)
     268             :                                 break;
     269           0 :                         if (ret > 0) {
     270             :                                 ret = 0;
     271             :                                 break;
     272             :                         }
     273           0 :                         leaf = path->nodes[0];
     274           0 :                         recow = 1;
     275             :                 }
     276             : 
     277           0 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     278             : 
     279           0 :                 if (key.objectid > ino)
     280             :                         break;
     281           0 :                 if (WARN_ON_ONCE(key.objectid < ino) ||
     282           0 :                     key.type < BTRFS_EXTENT_DATA_KEY) {
     283           0 :                         ASSERT(del_nr == 0);
     284           0 :                         path->slots[0]++;
     285           0 :                         goto next_slot;
     286             :                 }
     287           0 :                 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
     288             :                         break;
     289             : 
     290           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     291             :                                     struct btrfs_file_extent_item);
     292           0 :                 extent_type = btrfs_file_extent_type(leaf, fi);
     293             : 
     294           0 :                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
     295             :                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
     296           0 :                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
     297           0 :                         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
     298           0 :                         extent_offset = btrfs_file_extent_offset(leaf, fi);
     299           0 :                         extent_end = key.offset +
     300             :                                 btrfs_file_extent_num_bytes(leaf, fi);
     301           0 :                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     302           0 :                         extent_end = key.offset +
     303             :                                 btrfs_file_extent_ram_bytes(leaf, fi);
     304             :                 } else {
     305             :                         /* can't happen */
     306           0 :                         BUG();
     307             :                 }
     308             : 
     309             :                 /*
     310             :                  * Don't skip extent items representing 0 byte lengths. They
     311             :                  * used to be created (bug) if while punching holes we hit
     312             :                  * -ENOSPC condition. So if we find one here, just ensure we
     313             :                  * delete it, otherwise we would insert a new file extent item
     314             :                  * with the same key (offset) as that 0 bytes length file
     315             :                  * extent item in the call to setup_items_for_insert() later
     316             :                  * in this function.
     317             :                  */
     318           0 :                 if (extent_end == key.offset && extent_end >= search_start) {
     319           0 :                         last_end = extent_end;
     320           0 :                         goto delete_extent_item;
     321             :                 }
     322             : 
     323           0 :                 if (extent_end <= search_start) {
     324           0 :                         path->slots[0]++;
     325           0 :                         goto next_slot;
     326             :                 }
     327             : 
     328           0 :                 found = 1;
     329           0 :                 search_start = max(key.offset, args->start);
     330           0 :                 if (recow || !modify_tree) {
     331           0 :                         modify_tree = -1;
     332           0 :                         btrfs_release_path(path);
     333           0 :                         continue;
     334             :                 }
     335             : 
     336             :                 /*
     337             :                  *     | - range to drop - |
     338             :                  *  | -------- extent -------- |
     339             :                  */
     340           0 :                 if (args->start > key.offset && args->end < extent_end) {
     341           0 :                         BUG_ON(del_nr > 0);
     342           0 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     343             :                                 ret = -EOPNOTSUPP;
     344             :                                 break;
     345             :                         }
     346             : 
     347           0 :                         memcpy(&new_key, &key, sizeof(new_key));
     348           0 :                         new_key.offset = args->start;
     349           0 :                         ret = btrfs_duplicate_item(trans, root, path,
     350             :                                                    &new_key);
     351           0 :                         if (ret == -EAGAIN) {
     352           0 :                                 btrfs_release_path(path);
     353           0 :                                 continue;
     354             :                         }
     355           0 :                         if (ret < 0)
     356             :                                 break;
     357             : 
     358           0 :                         leaf = path->nodes[0];
     359           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     360             :                                             struct btrfs_file_extent_item);
     361           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     362           0 :                                                         args->start - key.offset);
     363             : 
     364           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     365             :                                             struct btrfs_file_extent_item);
     366             : 
     367           0 :                         extent_offset += args->start - key.offset;
     368           0 :                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
     369           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     370           0 :                                                         extent_end - args->start);
     371           0 :                         btrfs_mark_buffer_dirty(leaf);
     372             : 
     373           0 :                         if (update_refs && disk_bytenr > 0) {
     374           0 :                                 btrfs_init_generic_ref(&ref,
     375             :                                                 BTRFS_ADD_DELAYED_REF,
     376             :                                                 disk_bytenr, num_bytes, 0);
     377           0 :                                 btrfs_init_data_ref(&ref,
     378             :                                                 root->root_key.objectid,
     379             :                                                 new_key.objectid,
     380           0 :                                                 args->start - extent_offset,
     381             :                                                 0, false);
     382           0 :                                 ret = btrfs_inc_extent_ref(trans, &ref);
     383           0 :                                 if (ret) {
     384           0 :                                         btrfs_abort_transaction(trans, ret);
     385           0 :                                         break;
     386             :                                 }
     387             :                         }
     388           0 :                         key.offset = args->start;
     389             :                 }
     390             :                 /*
     391             :                  * From here on out we will have actually dropped something, so
     392             :                  * last_end can be updated.
     393             :                  */
     394           0 :                 last_end = extent_end;
     395             : 
     396             :                 /*
     397             :                  *  | ---- range to drop ----- |
     398             :                  *      | -------- extent -------- |
     399             :                  */
     400           0 :                 if (args->start <= key.offset && args->end < extent_end) {
     401           0 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     402             :                                 ret = -EOPNOTSUPP;
     403             :                                 break;
     404             :                         }
     405             : 
     406           0 :                         memcpy(&new_key, &key, sizeof(new_key));
     407           0 :                         new_key.offset = args->end;
     408           0 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     409             : 
     410           0 :                         extent_offset += args->end - key.offset;
     411           0 :                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
     412           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     413           0 :                                                         extent_end - args->end);
     414           0 :                         btrfs_mark_buffer_dirty(leaf);
     415           0 :                         if (update_refs && disk_bytenr > 0)
     416           0 :                                 args->bytes_found += args->end - key.offset;
     417             :                         break;
     418             :                 }
     419             : 
     420           0 :                 search_start = extent_end;
     421             :                 /*
     422             :                  *       | ---- range to drop ----- |
     423             :                  *  | -------- extent -------- |
     424             :                  */
     425           0 :                 if (args->start > key.offset && args->end >= extent_end) {
     426           0 :                         BUG_ON(del_nr > 0);
     427           0 :                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
     428             :                                 ret = -EOPNOTSUPP;
     429             :                                 break;
     430             :                         }
     431             : 
     432           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     433             :                                                         args->start - key.offset);
     434           0 :                         btrfs_mark_buffer_dirty(leaf);
     435           0 :                         if (update_refs && disk_bytenr > 0)
     436           0 :                                 args->bytes_found += extent_end - args->start;
     437           0 :                         if (args->end == extent_end)
     438             :                                 break;
     439             : 
     440           0 :                         path->slots[0]++;
     441           0 :                         goto next_slot;
     442             :                 }
     443             : 
     444             :                 /*
     445             :                  *  | ---- range to drop ----- |
     446             :                  *    | ------ extent ------ |
     447             :                  */
     448           0 :                 if (args->start <= key.offset && args->end >= extent_end) {
     449           0 : delete_extent_item:
     450           0 :                         if (del_nr == 0) {
     451           0 :                                 del_slot = path->slots[0];
     452           0 :                                 del_nr = 1;
     453             :                         } else {
     454           0 :                                 BUG_ON(del_slot + del_nr != path->slots[0]);
     455           0 :                                 del_nr++;
     456             :                         }
     457             : 
     458           0 :                         if (update_refs &&
     459           0 :                             extent_type == BTRFS_FILE_EXTENT_INLINE) {
     460           0 :                                 args->bytes_found += extent_end - key.offset;
     461           0 :                                 extent_end = ALIGN(extent_end,
     462             :                                                    fs_info->sectorsize);
     463           0 :                         } else if (update_refs && disk_bytenr > 0) {
     464           0 :                                 btrfs_init_generic_ref(&ref,
     465             :                                                 BTRFS_DROP_DELAYED_REF,
     466             :                                                 disk_bytenr, num_bytes, 0);
     467           0 :                                 btrfs_init_data_ref(&ref,
     468             :                                                 root->root_key.objectid,
     469             :                                                 key.objectid,
     470           0 :                                                 key.offset - extent_offset, 0,
     471             :                                                 false);
     472           0 :                                 ret = btrfs_free_extent(trans, &ref);
     473           0 :                                 if (ret) {
     474           0 :                                         btrfs_abort_transaction(trans, ret);
     475           0 :                                         break;
     476             :                                 }
     477           0 :                                 args->bytes_found += extent_end - key.offset;
     478             :                         }
     479             : 
     480           0 :                         if (args->end == extent_end)
     481             :                                 break;
     482             : 
     483           0 :                         if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
     484           0 :                                 path->slots[0]++;
     485           0 :                                 goto next_slot;
     486             :                         }
     487             : 
     488           0 :                         ret = btrfs_del_items(trans, root, path, del_slot,
     489             :                                               del_nr);
     490           0 :                         if (ret) {
     491           0 :                                 btrfs_abort_transaction(trans, ret);
     492           0 :                                 break;
     493             :                         }
     494             : 
     495           0 :                         del_nr = 0;
     496           0 :                         del_slot = 0;
     497             : 
     498           0 :                         btrfs_release_path(path);
     499           0 :                         continue;
     500             :                 }
     501             : 
     502           0 :                 BUG();
     503             :         }
     504             : 
     505           0 :         if (!ret && del_nr > 0) {
     506             :                 /*
     507             :                  * Set path->slots[0] to first slot, so that after the delete
     508             :                  * if items are move off from our leaf to its immediate left or
     509             :                  * right neighbor leafs, we end up with a correct and adjusted
     510             :                  * path->slots[0] for our insertion (if args->replace_extent).
     511             :                  */
     512           0 :                 path->slots[0] = del_slot;
     513           0 :                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
     514           0 :                 if (ret)
     515           0 :                         btrfs_abort_transaction(trans, ret);
     516             :         }
     517             : 
     518           0 :         leaf = path->nodes[0];
     519             :         /*
     520             :          * If btrfs_del_items() was called, it might have deleted a leaf, in
     521             :          * which case it unlocked our path, so check path->locks[0] matches a
     522             :          * write lock.
     523             :          */
     524           0 :         if (!ret && args->replace_extent &&
     525           0 :             path->locks[0] == BTRFS_WRITE_LOCK &&
     526           0 :             btrfs_leaf_free_space(leaf) >=
     527           0 :             sizeof(struct btrfs_item) + args->extent_item_size) {
     528             : 
     529           0 :                 key.objectid = ino;
     530           0 :                 key.type = BTRFS_EXTENT_DATA_KEY;
     531           0 :                 key.offset = args->start;
     532           0 :                 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
     533           0 :                         struct btrfs_key slot_key;
     534             : 
     535           0 :                         btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
     536           0 :                         if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
     537           0 :                                 path->slots[0]++;
     538             :                 }
     539           0 :                 btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
     540           0 :                 args->extent_inserted = true;
     541             :         }
     542             : 
     543           0 :         if (!args->path)
     544           0 :                 btrfs_free_path(path);
     545           0 :         else if (!args->extent_inserted)
     546           0 :                 btrfs_release_path(path);
     547           0 : out:
     548           0 :         args->drop_end = found ? min(args->end, last_end) : args->end;
     549             : 
     550           0 :         return ret;
     551             : }
     552             : /*
                        * Check whether the item at @slot of @leaf is a file extent item that can
                        * be merged with a neighbouring range of the same on-disk extent.
                        *
                        * The item qualifies only if it is an EXTENT_DATA item of inode @objectid,
                        * is a regular (BTRFS_FILE_EXTENT_REG) extent pointing at disk bytenr
                        * @bytenr, its extent offset equals (key.offset - @orig_offset), and it
                        * has no compression, encryption or other encoding set.
                        *
                        * @start and @end are in/out parameters.  A non-zero value on entry is a
                        * required boundary: the item must begin exactly at *@start and/or end
                        * exactly at *@end.  A zero value on entry means "no constraint".  On
                        * success both are overwritten with the item's file range
                        * [key.offset, key.offset + num_bytes).
                        *
                        * Returns 1 if the item is mergeable, 0 otherwise (including when @slot
                        * lies outside the leaf).
                        */
     553           0 : static int extent_mergeable(struct extent_buffer *leaf, int slot,
     554             :                             u64 objectid, u64 bytenr, u64 orig_offset,
     555             :                             u64 *start, u64 *end)
     556             : {
     557           0 :         struct btrfs_file_extent_item *fi;
     558           0 :         struct btrfs_key key;
     559           0 :         u64 extent_end;
     560             : 
                               /* Callers pass slot - 1 / slot + 1 without range checking. */
     561           0 :         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
     562             :                 return 0;
     563             : 
     564           0 :         btrfs_item_key_to_cpu(leaf, &key, slot);
     565           0 :         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
     566             :                 return 0;
     567             : 
     568           0 :         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
     569           0 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
     570           0 :             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
     571           0 :             btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
     572           0 :             btrfs_file_extent_compression(leaf, fi) ||
     573           0 :             btrfs_file_extent_encryption(leaf, fi) ||
     574             :             btrfs_file_extent_other_encoding(leaf, fi))
     575           0 :                 return 0;
     576             : 
     577           0 :         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
                               /* A non-zero *start / *end must match the item boundary exactly. */
     578           0 :         if ((*start && *start != key.offset) || (*end && *end != extent_end))
     579             :                 return 0;
     580             : 
     581           0 :         *start = key.offset;
     582           0 :         *end = extent_end;
     583           0 :         return 1;
     584             : }
     585             : 
     586             : /*
     587             :  * Mark the file range [start, end) of a preallocated extent as written.
     588             :  *
     589             :  * This changes the extent type from 'pre-allocated' to 'regular'. If only
     590             :  * part of the extent is marked as written, the extent item is split into
     591             :  * two or three items.
                        *
                        * Where possible the written range is merged with an adjacent regular
                        * item that references the same on-disk extent (see extent_mergeable()),
                        * either by simply moving the boundary between the two items or by
                        * deleting the now redundant item(s).
                        *
                        * The item found for @start must be a BTRFS_FILE_EXTENT_PREALLOC item of
                        * @inode that fully covers [start, end); otherwise the transaction is
                        * aborted and -EINVAL is returned.
                        *
                        * Returns -ENOMEM if no path could be allocated, otherwise the value left
                        * in ret by the last tree operation (negative errno on failure).
     592             :  */
     593           0 : int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
     594             :                               struct btrfs_inode *inode, u64 start, u64 end)
     595             : {
     596           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
     597           0 :         struct btrfs_root *root = inode->root;
     598           0 :         struct extent_buffer *leaf;
     599           0 :         struct btrfs_path *path;
     600           0 :         struct btrfs_file_extent_item *fi;
     601           0 :         struct btrfs_ref ref = { 0 };
     602           0 :         struct btrfs_key key;
     603           0 :         struct btrfs_key new_key;
     604           0 :         u64 bytenr;
     605           0 :         u64 num_bytes;
     606           0 :         u64 extent_end;
     607           0 :         u64 orig_offset;
     608           0 :         u64 other_start;
     609           0 :         u64 other_end;
     610           0 :         u64 split;
     611           0 :         int del_nr = 0;
     612           0 :         int del_slot = 0;
     613           0 :         int recow;
     614           0 :         int ret = 0;
     615           0 :         u64 ino = btrfs_ino(inode);
     616             : 
     617           0 :         path = btrfs_alloc_path();
     618           0 :         if (!path)
     619             :                 return -ENOMEM;
                       /*
                        * Restart point.  We come back here with a released path when
                        * btrfs_duplicate_item() returns -EAGAIN, or when a merge with a
                        * neighbour is possible but the leaf was already modified by a split
                        * in this pass (recow != 0).
                        */
     620           0 : again:
     621           0 :         recow = 0;
     622           0 :         split = start;
     623           0 :         key.objectid = ino;
     624           0 :         key.type = BTRFS_EXTENT_DATA_KEY;
     625           0 :         key.offset = split;
     626             : 
     627           0 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
     628           0 :         if (ret < 0)
     629           0 :                 goto out;
                               /*
                                * No item with exactly this key: step back one slot, which is
                                * expected to hold the extent item that contains @start (verified
                                * by the checks below).
                                */
     630           0 :         if (ret > 0 && path->slots[0] > 0)
     631           0 :                 path->slots[0]--;
     632             : 
     633           0 :         leaf = path->nodes[0];
     634           0 :         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     635           0 :         if (key.objectid != ino ||
     636           0 :             key.type != BTRFS_EXTENT_DATA_KEY) {
     637           0 :                 ret = -EINVAL;
     638           0 :                 btrfs_abort_transaction(trans, ret);
     639           0 :                 goto out;
     640             :         }
     641           0 :         fi = btrfs_item_ptr(leaf, path->slots[0],
     642             :                             struct btrfs_file_extent_item);
     643           0 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
     644           0 :                 ret = -EINVAL;
     645           0 :                 btrfs_abort_transaction(trans, ret);
     646           0 :                 goto out;
     647             :         }
     648           0 :         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
                               /* The prealloc item must cover the whole of [start, end). */
     649           0 :         if (key.offset > start || extent_end < end) {
     650           0 :                 ret = -EINVAL;
     651           0 :                 btrfs_abort_transaction(trans, ret);
     652           0 :                 goto out;
     653             :         }
     654             : 
     655           0 :         bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
     656           0 :         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
                               /* File offset that corresponds to extent offset 0 of this item. */
     657           0 :         orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
     658           0 :         memcpy(&new_key, &key, sizeof(new_key));
     659             : 
                               /*
                                * Fast path 1: the written range is the head of the prealloc item
                                * and the previous item is a mergeable regular extent ending at
                                * @start.  Move the boundary: the prealloc item now starts at
                                * @end and the previous item grows to end at @end.  No item is
                                * added or removed.
                                */
     660           0 :         if (start == key.offset && end < extent_end) {
     661           0 :                 other_start = 0;
     662           0 :                 other_end = start;
     663           0 :                 if (extent_mergeable(leaf, path->slots[0] - 1,
     664             :                                      ino, bytenr, orig_offset,
     665             :                                      &other_start, &other_end)) {
     666           0 :                         new_key.offset = end;
     667           0 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     668           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     669             :                                             struct btrfs_file_extent_item);
     670           0 :                         btrfs_set_file_extent_generation(leaf, fi,
     671             :                                                          trans->transid);
     672           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     673             :                                                         extent_end - end);
     674           0 :                         btrfs_set_file_extent_offset(leaf, fi,
     675             :                                                      end - orig_offset);
     676           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     677             :                                             struct btrfs_file_extent_item);
     678           0 :                         btrfs_set_file_extent_generation(leaf, fi,
     679             :                                                          trans->transid);
     680           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     681             :                                                         end - other_start);
     682           0 :                         btrfs_mark_buffer_dirty(leaf);
     683           0 :                         goto out;
     684             :                 }
     685             :         }
     686             : 
                               /*
                                * Fast path 2: the written range is the tail of the prealloc item
                                * and the next item is a mergeable regular extent starting at
                                * @end.  Shrink the prealloc item to end at @start and move the
                                * next item's start back to @start.
                                */
     687           0 :         if (start > key.offset && end == extent_end) {
     688           0 :                 other_start = end;
     689           0 :                 other_end = 0;
     690           0 :                 if (extent_mergeable(leaf, path->slots[0] + 1,
     691             :                                      ino, bytenr, orig_offset,
     692             :                                      &other_start, &other_end)) {
     693           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     694             :                                             struct btrfs_file_extent_item);
     695           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     696           0 :                                                         start - key.offset);
     697           0 :                         btrfs_set_file_extent_generation(leaf, fi,
     698             :                                                          trans->transid);
     699           0 :                         path->slots[0]++;
     700           0 :                         new_key.offset = start;
     701           0 :                         btrfs_set_item_key_safe(fs_info, path, &new_key);
     702             : 
     703           0 :                         fi = btrfs_item_ptr(leaf, path->slots[0],
     704             :                                             struct btrfs_file_extent_item);
     705           0 :                         btrfs_set_file_extent_generation(leaf, fi,
     706             :                                                          trans->transid);
     707           0 :                         btrfs_set_file_extent_num_bytes(leaf, fi,
     708             :                                                         other_end - start);
     709           0 :                         btrfs_set_file_extent_offset(leaf, fi,
     710             :                                                      start - orig_offset);
     711           0 :                         btrfs_mark_buffer_dirty(leaf);
                                               /*
                                                * NOTE(review): start > key.offset here, so the
                                                * search above did not find an exact key.  If
                                                * btrfs_search_slot() reports that with a positive
                                                * return value, ret is still positive at this goto
                                                * and is returned as is - confirm that callers
                                                * only test for ret < 0.
                                                */
     712           0 :                         goto out;
     713             :                 }
     714             :         }
     715             : 
                               /*
                                * Split the prealloc item until one item covers exactly
                                * [start, end): first at @start (if the item begins earlier), then
                                * at @end (if the item ends later).  Each split duplicates the
                                * item, trims both halves and takes one more reference on the
                                * data extent for the new item.
                                */
     716           0 :         while (start > key.offset || end < extent_end) {
     717           0 :                 if (key.offset == start)
     718           0 :                         split = end;
     719             : 
     720           0 :                 new_key.offset = split;
     721           0 :                 ret = btrfs_duplicate_item(trans, root, path, &new_key);
     722           0 :                 if (ret == -EAGAIN) {
     723           0 :                         btrfs_release_path(path);
     724           0 :                         goto again;
     725             :                 }
     726           0 :                 if (ret < 0) {
     727           0 :                         btrfs_abort_transaction(trans, ret);
     728           0 :                         goto out;
     729             :                 }
     730             : 
     731           0 :                 leaf = path->nodes[0];
                                       /* Left half: slot - 1 now ends at the split point. */
     732           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
     733             :                                     struct btrfs_file_extent_item);
     734           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     735           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     736           0 :                                                 split - key.offset);
     737             : 
                                       /* Right half: the current slot starts at the split point. */
     738           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     739             :                                     struct btrfs_file_extent_item);
     740             : 
     741           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     742           0 :                 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
     743           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     744             :                                                 extent_end - split);
     745           0 :                 btrfs_mark_buffer_dirty(leaf);
     746             : 
     747           0 :                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
     748             :                                        num_bytes, 0);
     749           0 :                 btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
     750             :                                     orig_offset, 0, false);
     751           0 :                 ret = btrfs_inc_extent_ref(trans, &ref);
     752           0 :                 if (ret) {
     753           0 :                         btrfs_abort_transaction(trans, ret);
     754           0 :                         goto out;
     755             :                 }
     756             : 
     757           0 :                 if (split == start) {
                                               /* Continue with the right half, which starts at @start. */
     758           0 :                         key.offset = start;
     759             :                 } else {
     760           0 :                         if (start != key.offset) {
     761           0 :                                 ret = -EINVAL;
     762           0 :                                 btrfs_abort_transaction(trans, ret);
     763           0 :                                 goto out;
     764             :                         }
                                               /*
                                                * Split at @end: step back to the left half, which
                                                * is now exactly [start, end).
                                                */
     765           0 :                         path->slots[0]--;
     766           0 :                         extent_end = end;
     767             :                 }
                                       /* The leaf was changed by a split in this pass. */
     768             :                 recow = 1;
     769             :         }
     770             : 
                               /*
                                * path->slots[0] now covers exactly [start, end).  Try to merge it
                                * with the next and/or previous item.  Each merged neighbour makes
                                * one item redundant, so one reference on the data extent is
                                * dropped per merge.  If a split happened in this pass, restart
                                * from 'again' instead of merging on the modified leaf.
                                */
     771           0 :         other_start = end;
     772           0 :         other_end = 0;
     773           0 :         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
     774             :                                num_bytes, 0);
     775           0 :         btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
     776             :                             0, false);
     777           0 :         if (extent_mergeable(leaf, path->slots[0] + 1,
     778             :                              ino, bytenr, orig_offset,
     779             :                              &other_start, &other_end)) {
     780           0 :                 if (recow) {
     781           0 :                         btrfs_release_path(path);
     782           0 :                         goto again;
     783             :                 }
                                       /* Absorb the next item: it is the one to delete. */
     784           0 :                 extent_end = other_end;
     785           0 :                 del_slot = path->slots[0] + 1;
     786           0 :                 del_nr++;
     787           0 :                 ret = btrfs_free_extent(trans, &ref);
     788           0 :                 if (ret) {
     789           0 :                         btrfs_abort_transaction(trans, ret);
     790           0 :                         goto out;
     791             :                 }
     792             :         }
     793           0 :         other_start = 0;
     794           0 :         other_end = start;
     795           0 :         if (extent_mergeable(leaf, path->slots[0] - 1,
     796             :                              ino, bytenr, orig_offset,
     797             :                              &other_start, &other_end)) {
     798           0 :                 if (recow) {
     799           0 :                         btrfs_release_path(path);
     800           0 :                         goto again;
     801             :                 }
                                       /*
                                        * Absorbed by the previous item: the current item is
                                        * deleted (together with the next one if it was merged
                                        * above, since del_nr is then 2).
                                        */
     802           0 :                 key.offset = other_start;
     803           0 :                 del_slot = path->slots[0];
     804           0 :                 del_nr++;
     805           0 :                 ret = btrfs_free_extent(trans, &ref);
     806           0 :                 if (ret) {
     807           0 :                         btrfs_abort_transaction(trans, ret);
     808           0 :                         goto out;
     809             :                 }
     810             :         }
     811           0 :         if (del_nr == 0) {
                                       /* No merge: convert the item to a regular extent in place. */
     812           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
     813             :                            struct btrfs_file_extent_item);
     814           0 :                 btrfs_set_file_extent_type(leaf, fi,
     815             :                                            BTRFS_FILE_EXTENT_REG);
     816           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     817           0 :                 btrfs_mark_buffer_dirty(leaf);
     818             :         } else {
                                       /*
                                        * The item just before the deleted run survives: make it a
                                        * regular extent spanning the whole merged range
                                        * [key.offset, extent_end), then delete the del_nr items
                                        * starting at del_slot.
                                        */
     819           0 :                 fi = btrfs_item_ptr(leaf, del_slot - 1,
     820             :                            struct btrfs_file_extent_item);
     821           0 :                 btrfs_set_file_extent_type(leaf, fi,
     822             :                                            BTRFS_FILE_EXTENT_REG);
     823           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
     824           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi,
     825           0 :                                                 extent_end - key.offset);
     826           0 :                 btrfs_mark_buffer_dirty(leaf);
     827             : 
     828           0 :                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
     829           0 :                 if (ret < 0) {
     830           0 :                         btrfs_abort_transaction(trans, ret);
     831           0 :                         goto out;
     832             :                 }
     833             :         }
     834           0 : out:
     835           0 :         btrfs_free_path(path);
     836           0 :         return ret;
     837             : }
     838             : 
     839             : /*
                        * Make sure @page is up to date before it is partially overwritten.
                        *
                        * The page is read only when it is not already uptodate and either @pos
                        * is not page aligned or @force_uptodate is set.
                        *
     840             :  * On error we return an unlocked page and the error value.
     841             :  * On success we return a locked page and 0.
                        *
                        * Returns -EIO if the page is still not uptodate after the read, -EAGAIN
                        * if the page was released while it was unlocked for the read, or the
                        * error returned by btrfs_read_folio().
     842             :  */
     843           0 : static int prepare_uptodate_page(struct inode *inode,
     844             :                                  struct page *page, u64 pos,
     845             :                                  bool force_uptodate)
     846             : {
     847           0 :         struct folio *folio = page_folio(page);
     848           0 :         int ret = 0;
     849             : 
     850           0 :         if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
     851           0 :             !PageUptodate(page)) {
     852           0 :                 ret = btrfs_read_folio(NULL, folio);
     853           0 :                 if (ret)
     854             :                         return ret;
                                       /* The read unlocked the page (see below); take the lock again. */
     855           0 :                 lock_page(page);
     856           0 :                 if (!PageUptodate(page)) {
     857           0 :                         unlock_page(page);
     858           0 :                         return -EIO;
     859             :                 }
     860             : 
     861             :                 /*
     862             :                  * Since btrfs_read_folio() will unlock the folio before it
     863             :                  * returns, there is a window where btrfs_release_folio() can be
     864             :                  * called to release the page.  Here we check both inode
     865             :                  * mapping and PagePrivate() to make sure the page was not
     866             :                  * released.
     867             :                  *
     868             :                  * The private flag check is essential for subpage as we need
     869             :                  * to store extra bitmap using page->private.
     870             :                  */
     871           0 :                 if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
     872           0 :                         unlock_page(page);
     873           0 :                         return -EAGAIN;
     874             :                 }
     875             :         }
     876             :         return 0;
     877             : }
     878             : /*
                        * Build the page cache lookup flags used by prepare_pages(): always
                        * FGP_LOCK | FGP_ACCESSED | FGP_CREAT, plus FGP_NOWAIT when @nowait is
                        * set.
                        */
     879             : static fgf_t get_prepare_fgp_flags(bool nowait)
     880             : {
     881           0 :         fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
     882             : 
     883           0 :         if (nowait)
     884           0 :                 fgp_flags |= FGP_NOWAIT;
     885             : 
     886           0 :         return fgp_flags;
     887             : }
     888             : /*
                        * Build the allocation mask used by prepare_pages().  It starts from
                        * btrfs_alloc_write_mask() for the inode's mapping; when @nowait is set,
                        * __GFP_DIRECT_RECLAIM is cleared and GFP_NOWAIT is ORed in.
                        */
     889             : static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
     890             : {
     891           0 :         gfp_t gfp;
     892             : 
     893           0 :         gfp = btrfs_alloc_write_mask(inode->i_mapping);
     894           0 :         if (nowait) {
     895           0 :                 gfp &= ~__GFP_DIRECT_RECLAIM;
     896           0 :                 gfp |= GFP_NOWAIT;
     897             :         }
     898             : 
     899           0 :         return gfp;
     900             : }
     901             : 
     902             : /*
     903             :  * This just gets pages into the page cache and locks them down.
                        *
                        * Fills @pages with @num_pages locked pages starting at the page index of
                        * @pos.  The first page is brought up to date according to @pos and
                        * @force_uptodate, the last page according to @pos + @write_bytes (see
                        * prepare_uptodate_page()).  Writeback on each page is waited for before
                        * moving on.
                        *
                        * Returns 0 with all pages locked.  On failure every page obtained so far
                        * is unlocked and released and an error is returned: -EAGAIN (@nowait) or
                        * -ENOMEM when a page cannot be obtained, otherwise the error from
                        * set_page_extent_mapped() or prepare_uptodate_page().
     904             :  */
     905           0 : static noinline int prepare_pages(struct inode *inode, struct page **pages,
     906             :                                   size_t num_pages, loff_t pos,
     907             :                                   size_t write_bytes, bool force_uptodate,
     908             :                                   bool nowait)
     909             : {
     910           0 :         int i;
     911           0 :         unsigned long index = pos >> PAGE_SHIFT;
     912           0 :         gfp_t mask = get_prepare_gfp_flags(inode, nowait);
     913           0 :         fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
     914           0 :         int err = 0;
                               /* Index of the last page that the fail path must unlock and put. */
     915           0 :         int faili;
     916             : 
     917           0 :         for (i = 0; i < num_pages; i++) {
     918           0 : again:
     919           0 :                 pages[i] = pagecache_get_page(inode->i_mapping, index + i,
     920             :                                               fgp_flags, mask | __GFP_WRITE);
     921           0 :                 if (!pages[i]) {
     922           0 :                         faili = i - 1;
     923           0 :                         if (nowait)
     924             :                                 err = -EAGAIN;
     925             :                         else
     926           0 :                                 err = -ENOMEM;
     927           0 :                         goto fail;
     928             :                 }
     929             : 
     930           0 :                 err = set_page_extent_mapped(pages[i]);
     931           0 :                 if (err < 0) {
                                               /* Page i is still locked and referenced: clean it up too. */
     932           0 :                         faili = i;
     933           0 :                         goto fail;
     934             :                 }
     935             : 
     936           0 :                 if (i == 0)
     937           0 :                         err = prepare_uptodate_page(inode, pages[i], pos,
     938             :                                                     force_uptodate);
     939           0 :                 if (!err && i == num_pages - 1)
     940           0 :                         err = prepare_uptodate_page(inode, pages[i],
     941             :                                                     pos + write_bytes, false);
     942           0 :                 if (err) {
                                               /*
                                                * prepare_uptodate_page() returns the page unlocked
                                                * on error, so only the reference is dropped here.
                                                * For blocking callers -EAGAIN means the page was
                                                * released meanwhile: look the same index up again.
                                                */
     943           0 :                         put_page(pages[i]);
     944           0 :                         if (!nowait && err == -EAGAIN) {
     945           0 :                                 err = 0;
     946           0 :                                 goto again;
     947             :                         }
     948           0 :                         faili = i - 1;
     949           0 :                         goto fail;
     950             :                 }
     951           0 :                 wait_on_page_writeback(pages[i]);
     952             :         }
     953             : 
     954             :         return 0;
     955           0 : fail:
     956           0 :         while (faili >= 0) {
     957           0 :                 unlock_page(pages[faili]);
     958           0 :                 put_page(pages[faili]);
     959           0 :                 faili--;
     960             :         }
     961             :         return err;
     962             : 
     963             : }
     964             : 
     965             : /*
     966             :  * This function locks the extent range and, if needed, makes sure ordered
     967             :  * extents in it are dealt with before the pages are allowed to be modified.
     968             :  *
                        * The range is [@pos, @pos + @write_bytes) widened to sector size
                        * boundaries.  It is only locked when it starts below the current i_size;
                        * in that case *@lockstart and *@lockend are set to the locked range.
                        *
     969             :  * The return value:
     970             :  * 1 - the extent is locked
     971             :  * 0 - the extent is not locked, and everything is OK
     972             :  * -EAGAIN - the pages were unlocked and released, they must be prepared again
     973             :  * any other value < 0 - something went wrong
     974             :  */
     975             : static noinline int
     976           0 : lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
     977             :                                 size_t num_pages, loff_t pos,
     978             :                                 size_t write_bytes,
     979             :                                 u64 *lockstart, u64 *lockend, bool nowait,
     980             :                                 struct extent_state **cached_state)
     981             : {
     982           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
     983           0 :         u64 start_pos;
     984           0 :         u64 last_pos;
     985           0 :         int i;
     986           0 :         int ret = 0;
     987             : 
                               /* Sector aligned range; last_pos is inclusive. */
     988           0 :         start_pos = round_down(pos, fs_info->sectorsize);
     989           0 :         last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
     990             : 
     991           0 :         if (start_pos < inode->vfs_inode.i_size) {
     992           0 :                 struct btrfs_ordered_extent *ordered;
     993             : 
     994           0 :                 if (nowait) {
                                               /*
                                                * Must not block: if the range cannot be locked right
                                                * away, drop all pages and let the caller retry.
                                                */
     995           0 :                         if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
     996             :                                              cached_state)) {
     997           0 :                                 for (i = 0; i < num_pages; i++) {
     998           0 :                                         unlock_page(pages[i]);
     999           0 :                                         put_page(pages[i]);
     1000           0 :                                         pages[i] = NULL;
     1001             :                                 }
     1002             : 
     1003             :                                 return -EAGAIN;
     1004             :                         }
     1005             :                 } else {
     1006           0 :                         lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
     1007             :                 }
     1008             : 
     1009           0 :                 ordered = btrfs_lookup_ordered_range(inode, start_pos,
     1010           0 :                                                      last_pos - start_pos + 1);
                                       /*
                                        * An ordered extent overlaps the range: unlock the extent
                                        * range, release every page, hand the ordered extent to
                                        * btrfs_start_ordered_extent() and ask the caller to
                                        * prepare the pages again.
                                        *
                                        * NOTE(review): unlike the nowait path above, the pages[]
                                        * entries are not reset to NULL here - confirm callers do
                                        * not rely on that.
                                        */
     1011           0 :                 if (ordered &&
     1012           0 :                     ordered->file_offset + ordered->num_bytes > start_pos &&
     1013             :                     ordered->file_offset <= last_pos) {
     1014           0 :                         unlock_extent(&inode->io_tree, start_pos, last_pos,
     1015             :                                       cached_state);
     1016           0 :                         for (i = 0; i < num_pages; i++) {
     1017           0 :                                 unlock_page(pages[i]);
     1018           0 :                                 put_page(pages[i]);
     1019             :                         }
     1020           0 :                         btrfs_start_ordered_extent(ordered);
     1021           0 :                         btrfs_put_ordered_extent(ordered);
     1022           0 :                         return -EAGAIN;
     1023             :                 }
                                       /* Non-overlapping result: just drop the lookup reference. */
     1024           0 :                 if (ordered)
     1025           0 :                         btrfs_put_ordered_extent(ordered);
     1026             : 
     1027           0 :                 *lockstart = start_pos;
     1028           0 :                 *lockend = last_pos;
     1029           0 :                 ret = 1;
     1030             :         }
     1031             : 
     1032             :         /*
     1033             :          * We should be called after prepare_pages() which should have locked
     1034             :          * all pages in the range.
     1035             :          */
     1036           0 :         for (i = 0; i < num_pages; i++)
     1037           0 :                 WARN_ON(!PageLocked(pages[i]));
     1038             : 
     1039             :         return ret;
     1040             : }
    1041             : 
    1042             : /*
    1043             :  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
    1044             :  *
    1045             :  * @pos:         File offset.
    1046             :  * @write_bytes: The length to write; updated to the nocow writeable range.
    1047             :  * @nowait:      If true, try-lock the range instead of locking and flushing.
    1048             :  *
    1049             :  * When @nowait is false, this function will flush ordered extents in the range
    1050             :  * to ensure proper nocow checks.
    1051             :  *
    1052             :  * Return:
    1053             :  * > 0          If we can nocow, and updates @write_bytes.
    1054             :  *  0           If we can't do a nocow write.
    1055             :  * -EAGAIN      If snapshotting of the inode's root is in progress, or if @nowait
    1056             :  *              is set and btrfs_try_lock_ordered_range() failed.
    1057             :  * < 0          If an error happened.
    1058             :  *
    1059             :  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
    1060             :  */
    1061           0 : int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
    1062             :                            size_t *write_bytes, bool nowait)
    1063             : {
    1064           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1065           0 :         struct btrfs_root *root = inode->root;
    1066           0 :         struct extent_state *cached_state = NULL;
    1067           0 :         u64 lockstart, lockend;
    1068           0 :         u64 num_bytes;
    1069           0 :         int ret;
    1070             :         /* Without NODATACOW or PREALLOC we report "can't nocow" right away. */
    1071           0 :         if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
    1072             :                 return 0;
    1073             :         /* Try-lock only; released again below unless we end up returning > 0. */
    1074           0 :         if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
    1075             :                 return -EAGAIN;
    1076             :         /* Widen [pos, pos + *write_bytes) outwards to sector boundaries. */
    1077           0 :         lockstart = round_down(pos, fs_info->sectorsize);
    1078           0 :         lockend = round_up(pos + *write_bytes,
    1079             :                            fs_info->sectorsize) - 1;
    1080           0 :         num_bytes = lockend - lockstart + 1;
    1081             :         /* Lock the range; with @nowait use the try-lock helper and bail out if it fails. */
    1082           0 :         if (nowait) {
    1083           0 :                 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
    1084             :                                                   &cached_state)) {
    1085           0 :                         btrfs_drew_write_unlock(&root->snapshot_lock);
    1086           0 :                         return -EAGAIN;
    1087             :                 }
    1088             :         } else {
    1089           0 :                 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
    1090             :                                                    &cached_state);
    1091             :         }
    1092           0 :         ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
    1093             :                         NULL, NULL, NULL, nowait, false);
    1094           0 :         if (ret <= 0) /* no nocow (or error): the snapshot lock is not kept */
    1095           0 :                 btrfs_drew_write_unlock(&root->snapshot_lock);
    1096             :         else /* num_bytes is presumably counted from lockstart; subtract the head before @pos */
    1097           0 :                 *write_bytes = min_t(size_t, *write_bytes ,
    1098             :                                      num_bytes - pos + lockstart);
    1099           0 :         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    1100             :         /* Only the extent range is unlocked here; on ret > 0 snapshot_lock stays held. */
    1101           0 :         return ret;
    1102             : }
    1103             : /* Drop the snapshot write lock that btrfs_check_nocow_lock() leaves held when it returns > 0. */
    1104           0 : void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
    1105             : {
    1106           0 :         btrfs_drew_write_unlock(&inode->root->snapshot_lock);
    1107           0 : }
    1108             : /* Set mtime and ctime to current_time() and bump i_version if IS_I_VERSION(); no-op for IS_NOCMTIME() inodes. */
    1109           0 : static void update_time_for_write(struct inode *inode)
    1110             : {
    1111           0 :         struct timespec64 now;
    1112             : 
    1113           0 :         if (IS_NOCMTIME(inode))
    1114             :                 return;
    1115             :         /* Each timestamp is only stored when it differs from the value already set. */
    1116           0 :         now = current_time(inode);
    1117           0 :         if (!timespec64_equal(&inode->i_mtime, &now))
    1118           0 :                 inode->i_mtime = now;
    1119             : 
    1120           0 :         if (!timespec64_equal(&inode->i_ctime, &now))
    1121           0 :                 inode->i_ctime = now;
    1122             : 
    1123           0 :         if (IS_I_VERSION(inode))
    1124           0 :                 inode_inc_iversion(inode);
    1125             : }
    1126             : /* Pre-write checks for a write of @count bytes at iocb->ki_pos; returns 0 on success or a negative errno. */
    1127           0 : static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
    1128             :                              size_t count)
    1129             : {
    1130           0 :         struct file *file = iocb->ki_filp;
    1131           0 :         struct inode *inode = file_inode(file);
    1132           0 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1133           0 :         loff_t pos = iocb->ki_pos;
    1134           0 :         int ret;
    1135           0 :         loff_t oldsize;
    1136           0 :         loff_t start_pos;
    1137             :         /* NOTE(review): @from is not referenced anywhere in this function body. */
    1138             :         /*
    1139             :          * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
    1140             :          * prealloc flags, as without those flags we always have to COW. We will
    1141             :          * later check if we can really NOCOW into the target range (using
    1142             :          * can_nocow_extent() at btrfs_get_blocks_direct_write()).
    1143             :          */
    1144           0 :         if ((iocb->ki_flags & IOCB_NOWAIT) &&
    1145           0 :             !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
    1146             :                 return -EAGAIN;
    1147             : 
    1148           0 :         ret = file_remove_privs(file);
    1149           0 :         if (ret)
    1150             :                 return ret;
    1151             : 
    1152             :         /*
    1153             :          * We reserve space for updating the inode when we reserve space for the
    1154             :          * extent we are going to write, so we will enospc out there.  We don't
    1155             :          * need to start yet another transaction to update the inode as we will
    1156             :          * update the inode when we finish writing whatever data we write.
    1157             :          */
    1158           0 :         update_time_for_write(inode);
    1159             :         /* Compare the sector-aligned start of the write against the current i_size. */
    1160           0 :         start_pos = round_down(pos, fs_info->sectorsize);
    1161           0 :         oldsize = i_size_read(inode);
    1162           0 :         if (start_pos > oldsize) {
    1163             :                 /* Expand hole size to cover write data, preventing empty gap */
    1164           0 :                 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
    1165             : 
    1166           0 :                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
    1167           0 :                 if (ret)
    1168           0 :                         return ret;
    1169             :         }
    1170             : 
    1171             :         return 0;
    1172             : }
    1173             : /* Buffered write of @i at iocb->ki_pos, done in page batches under the inode lock; returns bytes written, or ret when none were. */
    1174           0 : static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
    1175             :                                                struct iov_iter *i)
    1176             : {
    1177           0 :         struct file *file = iocb->ki_filp;
    1178           0 :         loff_t pos;
    1179           0 :         struct inode *inode = file_inode(file);
    1180           0 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1181           0 :         struct page **pages = NULL;
    1182           0 :         struct extent_changeset *data_reserved = NULL;
    1183           0 :         u64 release_bytes = 0;
    1184           0 :         u64 lockstart;
    1185           0 :         u64 lockend;
    1186           0 :         size_t num_written = 0;
    1187           0 :         int nrptrs;
    1188           0 :         ssize_t ret;
    1189           0 :         bool only_release_metadata = false;
    1190           0 :         bool force_page_uptodate = false;
    1191           0 :         loff_t old_isize = i_size_read(inode);
    1192           0 :         unsigned int ilock_flags = 0;
    1193           0 :         const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
    1194           0 :         unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
    1195             :         /* NOWAIT writers ask btrfs_inode_lock() for BTRFS_ILOCK_TRY. */
    1196           0 :         if (nowait)
    1197           0 :                 ilock_flags |= BTRFS_ILOCK_TRY;
    1198             : 
    1199           0 :         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
    1200           0 :         if (ret < 0)
    1201             :                 return ret;
    1202             :         /* A positive result is handed to btrfs_write_check() as the byte count; <= 0 exits via out. */
    1203           0 :         ret = generic_write_checks(iocb, i);
    1204           0 :         if (ret <= 0)
    1205           0 :                 goto out;
    1206             : 
    1207           0 :         ret = btrfs_write_check(iocb, i, ret);
    1208           0 :         if (ret < 0)
    1209           0 :                 goto out;
    1210             :         /* Size pages[]: enough for the iter, at most PAGE_SIZE bytes of pointers, capped by nr_dirtied_pause - nr_dirtied, min 8. */
    1211           0 :         pos = iocb->ki_pos;
    1212           0 :         nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
    1213             :                         PAGE_SIZE / (sizeof(struct page *)));
    1214           0 :         nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
    1215           0 :         nrptrs = max(nrptrs, 8);
    1216           0 :         pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
    1217           0 :         if (!pages) {
    1218           0 :                 ret = -ENOMEM;
    1219           0 :                 goto out;
    1220             :         }
    1221             :         /* Each iteration writes one batch of at most nrptrs pages starting at pos. */
    1222           0 :         while (iov_iter_count(i) > 0) {
    1223           0 :                 struct extent_state *cached_state = NULL;
    1224           0 :                 size_t offset = offset_in_page(pos);
    1225           0 :                 size_t sector_offset;
    1226           0 :                 size_t write_bytes = min(iov_iter_count(i),
    1227             :                                          nrptrs * (size_t)PAGE_SIZE -
    1228             :                                          offset);
    1229           0 :                 size_t num_pages;
    1230           0 :                 size_t reserve_bytes;
    1231           0 :                 size_t dirty_pages;
    1232           0 :                 size_t copied;
    1233           0 :                 size_t dirty_sectors;
    1234           0 :                 size_t num_sectors;
    1235           0 :                 int extents_locked;
    1236             : 
    1237             :                 /*
    1238             :                  * Fault pages before locking them in prepare_pages
    1239             :                  * to avoid recursive lock
    1240             :                  */
    1241           0 :                 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
    1242             :                         ret = -EFAULT;
    1243           0 :                         break;
    1244             :                 }
    1245             :                 /* sector_offset is pos masked with (sectorsize - 1), i.e. the offset inside its sector. */
    1246           0 :                 only_release_metadata = false;
    1247           0 :                 sector_offset = pos & (fs_info->sectorsize - 1);
    1248             :                 /* Try to reserve data space first; if that fails, see whether a nocow write is possible. */
    1249           0 :                 extent_changeset_release(data_reserved);
    1250           0 :                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
    1251             :                                                   &data_reserved, pos,
    1252             :                                                   write_bytes, nowait);
    1253           0 :                 if (ret < 0) {
    1254           0 :                         int can_nocow;
    1255             : 
    1256           0 :                         if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
    1257             :                                 ret = -EAGAIN;
    1258             :                                 break;
    1259             :                         }
    1260             : 
    1261             :                         /*
    1262             :                          * If we don't have to COW at the offset, reserve
    1263             :                          * metadata only. write_bytes may get smaller than
    1264             :                          * requested here.
    1265             :                          */
    1266           0 :                         can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
    1267             :                                                            &write_bytes, nowait);
    1268           0 :                         if (can_nocow < 0)
    1269           0 :                                 ret = can_nocow;
    1270           0 :                         if (can_nocow > 0)
    1271             :                                 ret = 0;
    1272             :                         if (ret) /* can_nocow == 0 leaves the data-reservation error in ret */
    1273             :                                 break;
    1274             :                         only_release_metadata = true;
    1275             :                 }
    1276             :                 /* Metadata is reserved for the sector-aligned span that covers this batch. */
    1277           0 :                 num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
    1278           0 :                 WARN_ON(num_pages > nrptrs);
    1279           0 :                 reserve_bytes = round_up(write_bytes + sector_offset,
    1280             :                                          fs_info->sectorsize);
    1281           0 :                 WARN_ON(reserve_bytes == 0);
    1282           0 :                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
    1283             :                                                       reserve_bytes,
    1284             :                                                       reserve_bytes, nowait);
    1285           0 :                 if (ret) {
    1286           0 :                         if (!only_release_metadata)
    1287           0 :                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
    1288             :                                                 data_reserved, pos,
    1289             :                                                 write_bytes);
    1290             :                         else
    1291           0 :                                 btrfs_check_nocow_unlock(BTRFS_I(inode));
    1292             : 
    1293           0 :                         if (nowait && ret == -ENOSPC)
    1294           0 :                                 ret = -EAGAIN;
    1295             :                         break;
    1296             :                 }
    1297             :                 /* From here on a break leaves release_bytes set, and the code after the loop gives it back. */
    1298             :                 release_bytes = reserve_bytes;
    1299           0 : again:
    1300           0 :                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
    1301           0 :                 if (ret) {
    1302           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
    1303           0 :                         break;
    1304             :                 }
    1305             : 
    1306             :                 /*
    1307             :                  * This is going to setup the pages array with the number of
    1308             :                  * pages we want, so we don't really need to worry about the
    1309             :                  * contents of pages from loop to loop
    1310             :                  */
    1311           0 :                 ret = prepare_pages(inode, pages, num_pages,
    1312             :                                     pos, write_bytes, force_page_uptodate, false);
    1313           0 :                 if (ret) {
    1314           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode),
    1315             :                                                        reserve_bytes);
    1316           0 :                         break;
    1317             :                 }
    1318             :                 /* Result: -EAGAIN, 0 (no range locked) or 1 (range locked, bounds in lockstart/lockend). */
    1319           0 :                 extents_locked = lock_and_cleanup_extent_if_need(
    1320             :                                 BTRFS_I(inode), pages,
    1321             :                                 num_pages, pos, write_bytes, &lockstart,
    1322             :                                 &lockend, nowait, &cached_state);
    1323           0 :                 if (extents_locked < 0) {
    1324           0 :                         if (!nowait && extents_locked == -EAGAIN) /* callee already unlocked and put the pages */
    1325           0 :                                 goto again;
    1326             : 
    1327           0 :                         btrfs_delalloc_release_extents(BTRFS_I(inode),
    1328             :                                                        reserve_bytes);
    1329           0 :                         ret = extents_locked;
    1330           0 :                         break;
    1331             :                 }
    1332             : 
    1333           0 :                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
    1334             :                 /* Express the reserved span and the span actually copied in BTRFS_BYTES_TO_BLKS() units. */
    1335           0 :                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
    1336           0 :                 dirty_sectors = round_up(copied + sector_offset,
    1337             :                                         fs_info->sectorsize);
    1338           0 :                 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
    1339             : 
    1340             :                 /*
    1341             :                  * if we have trouble faulting in the pages, fall
    1342             :                  * back to one page at a time
    1343             :                  */
    1344           0 :                 if (copied < write_bytes)
    1345           0 :                         nrptrs = 1;
    1346             : 
    1347           0 :                 if (copied == 0) {
    1348             :                         force_page_uptodate = true;
    1349             :                         dirty_sectors = 0;
    1350             :                         dirty_pages = 0;
    1351             :                 } else {
    1352           0 :                         force_page_uptodate = false;
    1353           0 :                         dirty_pages = DIV_ROUND_UP(copied + offset,
    1354             :                                                    PAGE_SIZE);
    1355             :                 }
    1356             :                 /* Short copy: the reservation beyond what was dirtied is returned right away. */
    1357           0 :                 if (num_sectors > dirty_sectors) {
    1358             :                         /* release everything except the sectors we dirtied */
    1359           0 :                         release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
    1360           0 :                         if (only_release_metadata) {
    1361           0 :                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
    1362             :                                                         release_bytes, true);
    1363             :                         } else {
    1364           0 :                                 u64 __pos;
    1365             : 
    1366           0 :                                 __pos = round_down(pos,
    1367             :                                                    fs_info->sectorsize) +
    1368           0 :                                         (dirty_pages << PAGE_SHIFT);
    1369           0 :                                 btrfs_delalloc_release_space(BTRFS_I(inode),
    1370             :                                                 data_reserved, __pos,
    1371             :                                                 release_bytes, true);
    1372             :                         }
    1373             :                 }
    1374             :                 /* What is still outstanding now is only the sector-aligned span that was copied. */
    1375           0 :                 release_bytes = round_up(copied + sector_offset,
    1376             :                                         fs_info->sectorsize);
    1377             : 
    1378           0 :                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
    1379             :                                         dirty_pages, pos, copied,
    1380             :                                         &cached_state, only_release_metadata);
    1381             : 
    1382             :                 /*
    1383             :                  * If we have not locked the extent range, because the range's
    1384             :                  * start offset is >= i_size, we might still have a non-NULL
    1385             :                  * cached extent state, acquired while marking the extent range
    1386             :                  * as delalloc through btrfs_dirty_pages(). Therefore free any
    1387             :                  * possible cached extent state to avoid a memory leak.
    1388             :                  */
    1389           0 :                 if (extents_locked)
    1390           0 :                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
    1391             :                                       lockend, &cached_state);
    1392             :                 else
    1393           0 :                         free_extent_state(cached_state);
    1394             : 
    1395           0 :                 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
    1396           0 :                 if (ret) {
    1397           0 :                         btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
    1398           0 :                         break;
    1399             :                 }
    1400             :                 /* Batch succeeded: nothing is left for the code after the loop to release. */
    1401           0 :                 release_bytes = 0;
    1402           0 :                 if (only_release_metadata)
    1403           0 :                         btrfs_check_nocow_unlock(BTRFS_I(inode));
    1404             : 
    1405           0 :                 btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
    1406             : 
    1407           0 :                 cond_resched();
    1408             : 
    1409           0 :                 pos += copied;
    1410           0 :                 num_written += copied;
    1411             :         }
    1412             : 
    1413           0 :         kfree(pages);
    1414             :         /* Non-zero only when a batch left the loop via break after reserving; undo that reservation. */
    1415           0 :         if (release_bytes) {
    1416           0 :                 if (only_release_metadata) {
    1417           0 :                         btrfs_check_nocow_unlock(BTRFS_I(inode));
    1418           0 :                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
    1419             :                                         release_bytes, true);
    1420             :                 } else {
    1421           0 :                         btrfs_delalloc_release_space(BTRFS_I(inode),
    1422             :                                         data_reserved,
    1423           0 :                                         round_down(pos, fs_info->sectorsize),
    1424             :                                         release_bytes, true);
    1425             :                 }
    1426             :         }
    1427             :         /* iocb->ki_pos is still the offset where this write started; it is advanced only below. */
    1428           0 :         extent_changeset_free(data_reserved);
    1429           0 :         if (num_written > 0) {
    1430           0 :                 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
    1431           0 :                 iocb->ki_pos += num_written;
    1432             :         }
    1433           0 : out:
    1434           0 :         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1435           0 :         return num_written ? num_written : ret;
    1436             : }
    1437             : /* Return 0 when @offset and iov_iter_alignment(@iter) have no bits below the sector size set, else -EINVAL. */
    1438           0 : static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
    1439             :                                const struct iov_iter *iter, loff_t offset)
    1440             : {
    1441           0 :         const u32 blocksize_mask = fs_info->sectorsize - 1;
    1442             :         /* The file offset must start on a sector boundary. */
    1443           0 :         if (offset & blocksize_mask)
    1444             :                 return -EINVAL;
    1445             :         /* Same mask test applied to whatever iov_iter_alignment() reports for @iter. */
    1446           0 :         if (iov_iter_alignment(iter) & blocksize_mask)
    1447           0 :                 return -EINVAL;
    1448             : 
    1449             :         return 0;
    1450             : }
    1451             : 
    1452           0 : static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
    1453             : {
    1454           0 :         struct file *file = iocb->ki_filp;
    1455           0 :         struct inode *inode = file_inode(file);
    1456           0 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1457           0 :         loff_t pos;
    1458           0 :         ssize_t written = 0;
    1459           0 :         ssize_t written_buffered;
    1460           0 :         size_t prev_left = 0;
    1461           0 :         loff_t endbyte;
    1462           0 :         ssize_t err;
    1463           0 :         unsigned int ilock_flags = 0;
    1464           0 :         struct iomap_dio *dio;
    1465             : 
    1466           0 :         if (iocb->ki_flags & IOCB_NOWAIT)
    1467           0 :                 ilock_flags |= BTRFS_ILOCK_TRY;
    1468             : 
    1469             :         /* If the write DIO is within EOF, use a shared lock */
    1470           0 :         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
    1471           0 :                 ilock_flags |= BTRFS_ILOCK_SHARED;
    1472             : 
    1473           0 : relock:
    1474           0 :         err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
    1475           0 :         if (err < 0)
    1476           0 :                 return err;
    1477             : 
    1478           0 :         err = generic_write_checks(iocb, from);
    1479           0 :         if (err <= 0) {
    1480           0 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1481           0 :                 return err;
    1482             :         }
    1483             : 
    1484           0 :         err = btrfs_write_check(iocb, from, err);
    1485           0 :         if (err < 0) {
    1486           0 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1487           0 :                 goto out;
    1488             :         }
    1489             : 
    1490           0 :         pos = iocb->ki_pos;
    1491             :         /*
    1492             :          * Re-check since file size may have changed just before taking the
    1493             :          * lock or pos may have changed because of O_APPEND in generic_write_check()
    1494             :          */
    1495           0 :         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
    1496           0 :             pos + iov_iter_count(from) > i_size_read(inode)) {
    1497           0 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1498           0 :                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
    1499           0 :                 goto relock;
    1500             :         }
    1501             : 
    1502           0 :         if (check_direct_IO(fs_info, from, pos)) {
    1503           0 :                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1504           0 :                 goto buffered;
    1505             :         }
    1506             : 
    1507             :         /*
    1508             :          * The iov_iter can be mapped to the same file range we are writing to.
    1509             :          * If that's the case, then we will deadlock in the iomap code, because
    1510             :          * it first calls our callback btrfs_dio_iomap_begin(), which will create
    1511             :          * an ordered extent, and after that it will fault in the pages that the
    1512             :          * iov_iter refers to. During the fault in we end up in the readahead
    1513             :          * pages code (starting at btrfs_readahead()), which will lock the range,
    1514             :          * find that ordered extent and then wait for it to complete (at
    1515             :          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
    1516             :          * obviously the ordered extent can never complete as we didn't submit
    1517             :          * yet the respective bio(s). This always happens when the buffer is
    1518             :          * memory mapped to the same file range, since the iomap DIO code always
    1519             :          * invalidates pages in the target file range (after starting and waiting
    1520             :          * for any writeback).
    1521             :          *
    1522             :          * So here we disable page faults in the iov_iter and then retry if we
    1523             :          * got -EFAULT, faulting in the pages before the retry.
    1524             :          */
    1525           0 :         from->nofault = true;
    1526           0 :         dio = btrfs_dio_write(iocb, from, written);
    1527           0 :         from->nofault = false;
    1528             : 
    1529             :         /*
    1530             :          * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
    1531             :          * iocb, and that needs to lock the inode. So unlock it before calling
    1532             :          * iomap_dio_complete() to avoid a deadlock.
    1533             :          */
    1534           0 :         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
    1535             : 
    1536           0 :         if (IS_ERR_OR_NULL(dio))
    1537           0 :                 err = PTR_ERR_OR_ZERO(dio);
    1538             :         else
    1539           0 :                 err = iomap_dio_complete(dio);
    1540             : 
    1541             :         /* No increment (+=) because iomap returns a cumulative value. */
    1542           0 :         if (err > 0)
    1543           0 :                 written = err;
    1544             : 
    1545           0 :         if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
    1546           0 :                 const size_t left = iov_iter_count(from);
    1547             :                 /*
    1548             :                  * We have more data left to write. Try to fault in as many as
    1549             :                  * possible of the remainder pages and retry. We do this without
    1550             :                  * releasing and locking again the inode, to prevent races with
    1551             :                  * truncate.
    1552             :                  *
    1553             :                  * Also, in case the iov refers to pages in the file range of the
    1554             :                  * file we want to write to (due to a mmap), we could enter an
    1555             :                  * infinite loop if we retry after faulting the pages in, since
    1556             :                  * iomap will invalidate any pages in the range early on, before
    1557             :                  * it tries to fault in the pages of the iov. So we keep track of
    1558             :                  * how much was left of iov in the previous EFAULT and fallback
    1559             :                  * to buffered IO in case we haven't made any progress.
    1560             :                  */
    1561           0 :                 if (left == prev_left) {
    1562             :                         err = -ENOTBLK;
    1563             :                 } else {
    1564           0 :                         fault_in_iov_iter_readable(from, left);
    1565           0 :                         prev_left = left;
    1566           0 :                         goto relock;
    1567             :                 }
    1568             :         }
    1569             : 
    1570             :         /*
    1571             :          * If 'err' is -ENOTBLK or we have not written all data, then it means
    1572             :          * we must fallback to buffered IO.
    1573             :          */
    1574           0 :         if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
    1575           0 :                 goto out;
    1576             : 
    1577           0 : buffered:
    1578             :         /*
    1579             :          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
    1580             :          * it must retry the operation in a context where blocking is acceptable,
    1581             :          * because even if we end up not blocking during the buffered IO attempt
    1582             :          * below, we will block when flushing and waiting for the IO.
    1583             :          */
    1584           0 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    1585           0 :                 err = -EAGAIN;
    1586           0 :                 goto out;
    1587             :         }
    1588             : 
    1589           0 :         pos = iocb->ki_pos;
    1590           0 :         written_buffered = btrfs_buffered_write(iocb, from);
    1591           0 :         if (written_buffered < 0) {
    1592           0 :                 err = written_buffered;
    1593           0 :                 goto out;
    1594             :         }
    1595             :         /*
    1596             :          * Ensure all data is persisted. We want the next direct IO read to be
    1597             :          * able to read what was just written.
    1598             :          */
    1599           0 :         endbyte = pos + written_buffered - 1;
    1600           0 :         err = btrfs_fdatawrite_range(inode, pos, endbyte);
    1601           0 :         if (err)
    1602           0 :                 goto out;
    1603           0 :         err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
    1604           0 :         if (err)
    1605           0 :                 goto out;
    1606           0 :         written += written_buffered;
    1607           0 :         iocb->ki_pos = pos + written_buffered;
    1608           0 :         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
    1609           0 :                                  endbyte >> PAGE_SHIFT);
    1610           0 : out:
    1611           0 :         return err < 0 ? err : written;
    1612             : }
    1613             : /*
                     :  * Encoded write path: write the data in @from, as described by @encoded, at
                     :  * the position carried by @iocb.
                     :  *
                     :  * The length used for every check is encoded->len. If
                     :  * generic_write_checks_count() shrinks it, the write is rejected with -EFBIG
                     :  * rather than being done partially.
                     :  *
                     :  * Returns the result of btrfs_do_encoded_write(), 0 when encoded->len is 0
                     :  * and the checks passed, or the error reported by one of the checks.
                     :  */
    1614           0 : static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
    1615             :                         const struct btrfs_ioctl_encoded_io_args *encoded)
    1616             : {
    1617           0 :         struct file *file = iocb->ki_filp;
    1618           0 :         struct inode *inode = file_inode(file);
    1619           0 :         loff_t count;
    1620           0 :         ssize_t ret;
    1621             :         /* The inode lock taken here is held until the 'out' label on every path. */
    1622           0 :         btrfs_inode_lock(BTRFS_I(inode), 0);
    1623           0 :         count = encoded->len;
    1624           0 :         ret = generic_write_checks_count(iocb, &count);
    1625           0 :         if (ret == 0 && count != encoded->len) {
    1626             :                 /*
    1627             :                  * The write got truncated by generic_write_checks_count(). We
    1628             :                  * can't do a partial encoded write.
    1629             :                  */
    1630             :                 ret = -EFBIG;
    1631             :         }
    1632           0 :         if (ret || encoded->len == 0)
    1633           0 :                 goto out;
    1634             : 
    1635           0 :         ret = btrfs_write_check(iocb, from, encoded->len);
    1636           0 :         if (ret < 0)
    1637           0 :                 goto out;
    1638             : 
    1639           0 :         ret = btrfs_do_encoded_write(iocb, from, encoded);
    1640           0 : out:
    1641           0 :         btrfs_inode_unlock(BTRFS_I(inode), 0);
    1642           0 :         return ret;
    1643             : }
    1644             : /*
                     :  * Common write entry point. Dispatches to the encoded write path when
                     :  * @encoded is not NULL, to the direct IO path when IOCB_DIRECT is set in
                     :  * iocb->ki_flags, and to the buffered path otherwise.
                     :  *
                     :  * Returns -EROFS if BTRFS_FS_ERROR() is true for the filesystem and
                     :  * -EOPNOTSUPP for an encoded write with IOCB_NOWAIT set. Otherwise returns
                     :  * the result of the selected write path, unless generic_write_sync() fails,
                     :  * in which case its negative result is returned instead.
                     :  *
                     :  * Note that for encoded writes the value handed to generic_write_sync() is
                     :  * encoded->len and not the return value of btrfs_encoded_write().
                     :  */
    1645           0 : ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
    1646             :                             const struct btrfs_ioctl_encoded_io_args *encoded)
    1647             : {
    1648           0 :         struct file *file = iocb->ki_filp;
    1649           0 :         struct btrfs_inode *inode = BTRFS_I(file_inode(file));
    1650           0 :         ssize_t num_written, num_sync;
    1651             : 
    1652             :         /*
    1653             :          * If the fs flips readonly due to some impossible error, although we
    1654             :          * have opened a file as writable, we have to stop this write operation
    1655             :          * to ensure consistency.
    1656             :          */
    1657           0 :         if (BTRFS_FS_ERROR(inode->root->fs_info))
    1658             :                 return -EROFS;
    1659             :         /* Encoded writes are refused when the caller asked for IOCB_NOWAIT. */
    1660           0 :         if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
    1661             :                 return -EOPNOTSUPP;
    1662             :         /* Pick the write path; num_sync is what gets passed to generic_write_sync(). */
    1663           0 :         if (encoded) {
    1664           0 :                 num_written = btrfs_encoded_write(iocb, from, encoded);
    1665           0 :                 num_sync = encoded->len;
    1666           0 :         } else if (iocb->ki_flags & IOCB_DIRECT) {
    1667           0 :                 num_written = btrfs_direct_write(iocb, from);
    1668           0 :                 num_sync = num_written;
    1669             :         } else {
    1670           0 :                 num_written = btrfs_buffered_write(iocb, from);
    1671           0 :                 num_sync = num_written;
    1672             :         }
    1673             : 
    1674           0 :         btrfs_set_inode_last_sub_trans(inode);
    1675             : 
    1676           0 :         if (num_sync > 0) {
    1677           0 :                 num_sync = generic_write_sync(iocb, num_sync);
    1678           0 :                 if (num_sync < 0)
    1679           0 :                         num_written = num_sync;
    1680             :         }
    1681             : 
    1682             :         return num_written;
    1683             : }
    1684             : /* Regular (non-encoded) write: calls btrfs_do_write_iter() with a NULL @encoded. */
    1685           0 : static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    1686             : {
    1687           0 :         return btrfs_do_write_iter(iocb, from, NULL);
    1688             : }
    1689             : /*
                     :  * Release callback for @filp.
                     :  *
                     :  * Frees the btrfs_file_private attached to filp->private_data (its
                     :  * filldir_buf, its llseek_cached_state and the structure itself) and resets
                     :  * the pointer to NULL. If BTRFS_INODE_FLUSH_ON_CLOSE is set in the inode's
                     :  * runtime flags, the bit is cleared and filemap_flush() is called on the
                     :  * inode's mapping.
                     :  *
                     :  * Always returns 0; the return value of filemap_flush() is not checked.
                     :  */
    1690           0 : int btrfs_release_file(struct inode *inode, struct file *filp)
    1691             : {
    1692           0 :         struct btrfs_file_private *private = filp->private_data;
    1693             : 
    1694           0 :         if (private) {
    1695           0 :                 kfree(private->filldir_buf);
    1696           0 :                 free_extent_state(private->llseek_cached_state);
    1697           0 :                 kfree(private);
    1698           0 :                 filp->private_data = NULL;
    1699             :         }
    1700             : 
    1701             :         /*
    1702             :          * Set by setattr when we are about to truncate a file from a non-zero
    1703             :          * size to a zero size.  This tries to flush down new bytes that may
    1704             :          * have been written if the application were using truncate to replace
    1705             :          * a file in place.
    1706             :          */
    1707           0 :         if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
    1708           0 :                                &BTRFS_I(inode)->runtime_flags))
    1709           0 :                         filemap_flush(inode->i_mapping);
    1710           0 :         return 0;
    1711             : }
    1712             : /*
                     :  * Call btrfs_fdatawrite_range() on [@start, @end] of @inode between
                     :  * blk_start_plug() and blk_finish_plug(), and return its result.
                     :  */
    1713           0 : static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
    1714             : {
    1715           0 :         int ret;
    1716           0 :         struct blk_plug plug;
    1717             : 
    1718             :         /*
    1719             :          * This is only called in fsync, which would do synchronous writes, so
    1720             :          * a plug can merge adjacent IOs as much as possible.  Esp. in case of
    1721             :          * multiple disks using raid profile, a large IO can be split to
    1722             :          * several segments of stripe length (currently 64K).
    1723             :          */
    1724           0 :         blk_start_plug(&plug);
    1725           0 :         ret = btrfs_fdatawrite_range(inode, start, end);
    1726           0 :         blk_finish_plug(&plug);
    1727             : 
    1728           0 :         return ret;
    1729             : }
    1730             : /*
                     :  * Tell the fsync path whether logging the inode of @ctx can be skipped.
                     :  *
                     :  * Returns true when either:
                     :  *  - btrfs_inode_in_log() is true for fs_info->generation and @ctx has no
                     :  *    ordered extents attached, or
                     :  *  - the inode's last_trans is not newer than last_trans_committed and the
                     :  *    inode needs a full sync or @ctx has no ordered extents attached.
                     :  * Returns false otherwise.
                     :  */
    1731           0 : static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
    1732             : {
    1733           0 :         struct btrfs_inode *inode = BTRFS_I(ctx->inode);
    1734           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1735             : 
    1736           0 :         if (btrfs_inode_in_log(inode, fs_info->generation) &&
    1737           0 :             list_empty(&ctx->ordered_extents))
    1738             :                 return true;
    1739             : 
    1740             :         /*
    1741             :          * If we are doing a fast fsync we can not bail out if the inode's
    1742             :          * last_trans is <= than the last committed transaction, because we only
    1743             :          * update the last_trans of the inode during ordered extent completion,
    1744             :          * and for a fast fsync we don't wait for that, we only wait for the
    1745             :          * writeback to complete.
    1746             :          */
    1747           0 :         if (inode->last_trans <= fs_info->last_trans_committed &&
    1748           0 :             (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
    1749           0 :              list_empty(&ctx->ordered_extents)))
    1750           0 :                 return true;
    1751             : 
    1752             :         return false;
    1753             : }
    1754             : 
    1755             : /*
    1756             :  * fsync call for both files and directories.  This logs the inode into
    1757             :  * the tree log instead of forcing full commits whenever possible.
    1758             :  *
    1759             :  * It needs to call filemap_fdatawait so that all ordered extent updates
    1760             :  * in the metadata btree are up to date for copying to the log.
    1761             :  *
    1762             :  * It drops the inode mutex before doing the tree log commit.  This is an
    1763             :  * important optimization for directories because holding the mutex prevents
    1764             :  * new operations on the dir while we write to disk.
                     :  *
                     :  * The @start and @end passed by the caller are overwritten: the range is
                     :  * always widened to the whole file (see the comment in the body). In this
                     :  * function @datasync is only passed to the tracepoint.
                     :  *
                     :  * Returns 0 on success or a negative errno. A positive value left in 'ret'
                     :  * is reported as -EIO. When nothing else failed, the result of
                     :  * file_check_and_advance_wb_err() is returned.
    1765             :  */
    1766           0 : int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
    1767             : {
    1768           0 :         struct dentry *dentry = file_dentry(file);
    1769           0 :         struct inode *inode = d_inode(dentry);
    1770           0 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    1771           0 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    1772           0 :         struct btrfs_trans_handle *trans;
    1773           0 :         struct btrfs_log_ctx ctx;
    1774           0 :         int ret = 0, err;
    1775           0 :         u64 len;
    1776           0 :         bool full_sync;
    1777             : 
    1778           0 :         trace_btrfs_sync_file(file, datasync);
    1779             : 
    1780           0 :         btrfs_init_log_ctx(&ctx, inode);
    1781             : 
    1782             :         /*
    1783             :          * Always set the range to a full range, otherwise we can get into
    1784             :          * several problems, from missing file extent items to represent holes
    1785             :          * when not using the NO_HOLES feature, to log tree corruption due to
    1786             :          * races between hole detection during logging and completion of ordered
    1787             :          * extents outside the range, to missing checksums due to ordered extents
    1788             :          * for which we flushed only a subset of their pages.
    1789             :          */
    1790           0 :         start = 0;
    1791           0 :         end = LLONG_MAX;
    1792           0 :         len = (u64)LLONG_MAX + 1;
    1793             : 
    1794             :         /*
    1795             :          * Start writeback of the dirty pages in the range before taking the
    1796             :          * inode lock, so that several tasks can flush dirty pages in
    1797             :          * parallel, which helps performance.  See
    1798             :          * btrfs_wait_ordered_range for an explanation of the ASYNC check.
    1799             :          */
    1800           0 :         ret = start_ordered_ops(inode, start, end);
    1801           0 :         if (ret)
    1802           0 :                 goto out;
    1803             : 
    1804           0 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1805             : 
    1806           0 :         atomic_inc(&root->log_batch);
    1807             : 
    1808             :         /*
    1809             :          * Before we acquired the inode's lock and the mmap lock, someone may
    1810             :          * have dirtied more pages in the target range. We need to make sure
    1811             :          * that writeback for any such pages does not start while we are logging
    1812             :          * the inode, because if it does, any of the following might happen when
    1813             :          * we are not doing a full inode sync:
    1814             :          *
    1815             :          * 1) We log an extent after its writeback finishes but before its
    1816             :          *    checksums are added to the csum tree, leading to -EIO errors
    1817             :          *    when attempting to read the extent after a log replay.
    1818             :          *
    1819             :          * 2) We can end up logging an extent before its writeback finishes.
    1820             :          *    Therefore after the log replay we will have a file extent item
    1821             :          *    pointing to an unwritten extent (and no data checksums as well).
    1822             :          *
    1823             :          * So trigger writeback for any eventual new dirty pages and then we
    1824             :          * wait for all ordered extents to complete below.
    1825             :          */
    1826           0 :         ret = start_ordered_ops(inode, start, end);
    1827           0 :         if (ret) {
    1828           0 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1829           0 :                 goto out;
    1830             :         }
    1831             : 
    1832             :         /*
    1833             :          * Always check for the full sync flag while holding the inode's lock,
    1834             :          * to avoid races with other tasks. The flag must be either set all the
    1835             :          * time during logging or always off all the time while logging.
    1836             :          * We check the flag here after starting delalloc above, because when
    1837             :          * running delalloc the full sync flag may be set if we need to drop
    1838             :          * extra extent map ranges due to temporary memory allocation failures.
    1839             :          */
    1840           0 :         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    1841             :                              &BTRFS_I(inode)->runtime_flags);
    1842             : 
    1843             :         /*
    1844             :          * We have to do this here to avoid the priority inversion of waiting on
    1845             :          * IO of a lower priority task while holding a transaction open.
    1846             :          *
    1847             :          * For a full fsync we wait for the ordered extents to complete while
    1848             :          * for a fast fsync we wait just for writeback to complete, and then
    1849             :          * attach the ordered extents to the transaction so that a transaction
    1850             :          * commit waits for their completion, to avoid data loss if we fsync,
    1851             :          * the current transaction commits before the ordered extents complete
    1852             :          * and a power failure happens right after that.
    1853             :          *
    1854             :          * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
    1855             :          * logical address recorded in the ordered extent may change. We need
    1856             :          * to wait for the IO to stabilize the logical address.
    1857             :          */
    1858           0 :         if (full_sync || btrfs_is_zoned(fs_info)) {
    1859           0 :                 ret = btrfs_wait_ordered_range(inode, start, len);
    1860             :         } else {
    1861             :                 /*
    1862             :                  * Get our ordered extents as soon as possible to avoid doing
    1863             :                  * checksum lookups in the csum tree, and use instead the
    1864             :                  * checksums attached to the ordered extents.
    1865             :                  */
    1866           0 :                 btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
    1867             :                                                       &ctx.ordered_extents);
    1868           0 :                 ret = filemap_fdatawait_range(inode->i_mapping, start, end);
    1869             :         }
    1870             : 
    1871           0 :         if (ret)
    1872           0 :                 goto out_release_extents;
    1873             : 
    1874           0 :         atomic_inc(&root->log_batch);
    1875             : 
    1876           0 :         smp_mb();
    1877           0 :         if (skip_inode_logging(&ctx)) {
    1878             :                 /*
    1879             :                  * We've had everything committed since the last time we were
    1880             :                  * modified so clear this flag in case it was set for whatever
    1881             :                  * reason, it's no longer relevant.
    1882             :                  */
    1883           0 :                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
    1884           0 :                           &BTRFS_I(inode)->runtime_flags);
    1885             :                 /*
    1886             :                  * An ordered extent might have started before and completed
    1887             :                  * already with io errors, in which case the inode was not
    1888             :                  * updated and we end up here. So check the inode's mapping
    1889             :                  * for any errors that might have happened since we last
    1890             :                  * called fsync.
    1891             :                  */
    1892           0 :                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
    1893           0 :                 goto out_release_extents;
    1894             :         }
    1895             : 
    1896             :         /*
    1897             :          * We use start here because we will need to wait on the IO to complete
    1898             :          * in btrfs_sync_log, which could require joining a transaction (for
    1899             :          * example checking cross references in the nocow path).  If we use join
    1900             :          * here we could get into a situation where we're waiting on IO to
    1901             :          * happen that is blocked on a transaction trying to commit.  With start
    1902             :          * we inc the extwriter counter, so we wait for all extwriters to exit
    1903             :          * before we start blocking joiners.  This comment is to keep somebody
    1904             :          * from thinking they are super smart and changing this to
    1905             :          * btrfs_join_transaction *cough*Josef*cough*.
    1906             :          */
    1907           0 :         trans = btrfs_start_transaction(root, 0);
    1908           0 :         if (IS_ERR(trans)) {
    1909           0 :                 ret = PTR_ERR(trans);
    1910           0 :                 goto out_release_extents;
    1911             :         }
    1912           0 :         trans->in_fsync = true;
    1913             : 
    1914           0 :         ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
    1915           0 :         btrfs_release_log_ctx_extents(&ctx);
    1916           0 :         if (ret < 0) {
    1917             :                 /* Fallthrough and commit/free transaction. */
    1918           0 :                 ret = BTRFS_LOG_FORCE_COMMIT;
    1919             :         }
    1920             : 
    1921             :         /* we've logged all the items and now have a consistent
    1922             :          * version of the file in the log.  It is possible that
    1923             :          * someone will come in and modify the file, but that's
    1924             :          * fine because the log is consistent on disk, and we
    1925             :          * have references to all of the file's extents
    1926             :          *
    1927             :          * It is possible that someone will come in and log the
    1928             :          * file again, but that will end up using the synchronization
    1929             :          * inside btrfs_sync_log to keep things safe.
    1930             :          */
    1931           0 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    1932             : 
    1933           0 :         if (ret == BTRFS_NO_LOG_SYNC) {
    1934           0 :                 ret = btrfs_end_transaction(trans);
    1935           0 :                 goto out;
    1936             :         }
    1937             : 
    1938             :         /* We successfully logged the inode, attempt to sync the log. */
    1939           0 :         if (!ret) {
    1940           0 :                 ret = btrfs_sync_log(trans, root, &ctx);
    1941           0 :                 if (!ret) {
    1942           0 :                         ret = btrfs_end_transaction(trans);
    1943           0 :                         goto out;
    1944             :                 }
    1945             :         }
    1946             : 
    1947             :         /*
    1948             :          * At this point we need to commit the transaction because we had
    1949             :          * btrfs_need_log_full_commit() or some other error.
    1950             :          *
    1951             :          * If we didn't do a full sync we have to stop the trans handle, wait on
    1952             :          * the ordered extents, start it again and commit the transaction.  If
    1953             :          * we attempt to wait on the ordered extents here we could deadlock with
    1954             :          * something like fallocate() that is holding the extent lock trying to
    1955             :          * start a transaction while some other thread is trying to commit the
    1956             :          * transaction while we (fsync) are currently holding the transaction
    1957             :          * open.
    1958             :          */
    1959           0 :         if (!full_sync) {
    1960           0 :                 ret = btrfs_end_transaction(trans);
    1961           0 :                 if (ret)
    1962           0 :                         goto out;
    1963           0 :                 ret = btrfs_wait_ordered_range(inode, start, len);
    1964           0 :                 if (ret)
    1965           0 :                         goto out;
    1966             : 
    1967             :                 /*
    1968             :                  * This is safe to use here because we're only interested in
    1969             :                  * making sure the transaction that had the ordered extents is
    1970             :                  * committed.  We aren't waiting on anything past this point,
    1971             :                  * we're purely getting the transaction and committing it.
    1972             :                  */
    1973           0 :                 trans = btrfs_attach_transaction_barrier(root);
    1974           0 :                 if (IS_ERR(trans)) {
    1975           0 :                         ret = PTR_ERR(trans);
    1976             : 
    1977             :                         /*
    1978             :                          * We committed the transaction and there's no currently
    1979             :                          * running transaction, this means everything we care
    1980             :                          * about made it to disk and we are done.
    1981             :                          */
    1982           0 :                         if (ret == -ENOENT)
    1983           0 :                                 ret = 0;
    1984           0 :                         goto out;
    1985             :                 }
    1986             :         }
    1987             : 
    1988           0 :         ret = btrfs_commit_transaction(trans);
    1989           0 : out:
    1990           0 :         ASSERT(list_empty(&ctx.list));
    1991           0 :         ASSERT(list_empty(&ctx.conflict_inodes));
    1992           0 :         err = file_check_and_advance_wb_err(file);
    1993           0 :         if (!ret)
    1994           0 :                 ret = err;
    1995           0 :         return ret > 0 ? -EIO : ret;
    1996             : 
    1997           0 : out_release_extents:
    1998           0 :         btrfs_release_log_ctx_extents(&ctx);
    1999           0 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2000           0 :         goto out;
    2001             : }
    2002             : /* VM operations that btrfs_file_mmap() installs on a vma mapping a btrfs file. */
    2003             : static const struct vm_operations_struct btrfs_file_vm_ops = {
    2004             :         .fault          = filemap_fault,
    2005             :         .map_pages      = filemap_map_pages,
    2006             :         .page_mkwrite   = btrfs_page_mkwrite,
    2007             : };
    2008             : /*
                     :  * mmap handler: returns -ENOEXEC when the mapping's address space
                     :  * operations have no read_folio callback. Otherwise it calls
                     :  * file_accessed() on @filp, points vma->vm_ops at btrfs_file_vm_ops and
                     :  * returns 0.
                     :  */
    2009           0 : static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
    2010             : {
    2011           0 :         struct address_space *mapping = filp->f_mapping;
    2012             : 
    2013           0 :         if (!mapping->a_ops->read_folio)
    2014             :                 return -ENOEXEC;
    2015             : 
    2016           0 :         file_accessed(filp);
    2017           0 :         vma->vm_ops = &btrfs_file_vm_ops;
    2018             : 
    2019           0 :         return 0;
    2020             : }
    2021             : /*
                     :  * Check whether the item at @slot of @leaf is an extent item that borders
                     :  * the range described by @start and @end and has no disk extent behind it.
                     :  *
                     :  * Returns 1 when the item is a BTRFS_EXTENT_DATA_KEY item of @inode, of type
                     :  * BTRFS_FILE_EXTENT_REG, with a disk bytenr of 0, and it either starts at
                     :  * @end or ends exactly at @start. Returns 0 in every other case, including
                     :  * when @slot is outside the items of @leaf.
                     :  */
    2022           0 : static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
    2023             :                           int slot, u64 start, u64 end)
    2024             : {
    2025           0 :         struct btrfs_file_extent_item *fi;
    2026           0 :         struct btrfs_key key;
    2027             : 
    2028           0 :         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
    2029             :                 return 0;
    2030             : 
    2031           0 :         btrfs_item_key_to_cpu(leaf, &key, slot);
    2032           0 :         if (key.objectid != btrfs_ino(inode) ||
    2033           0 :             key.type != BTRFS_EXTENT_DATA_KEY)
    2034             :                 return 0;
    2035             : 
    2036           0 :         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    2037             : 
    2038           0 :         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
    2039             :                 return 0;
    2040             : 
    2041           0 :         if (btrfs_file_extent_disk_bytenr(leaf, fi))
    2042             :                 return 0;
    2043             : 
    2044           0 :         if (key.offset == end)
    2045             :                 return 1;
    2046           0 :         if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
    2047           0 :                 return 1;
    2048             :         return 0;
    2049             : }
    2050             : 
    2051           0 : static int fill_holes(struct btrfs_trans_handle *trans,
    2052             :                 struct btrfs_inode *inode,
    2053             :                 struct btrfs_path *path, u64 offset, u64 end)
    2054             : {
                               /*
                                * Record the range [@offset, @end) of @inode as a hole.
                                *
                                * When the NO_HOLES feature is enabled no file extent item is
                                * touched. Otherwise an adjacent hole item (the previous slot
                                * first, then the slot found by the search) is extended to
                                * cover the range, or a new hole extent item is inserted.
                                * In all of those cases the cached extent maps for the range
                                * are then replaced by a single hole extent map.
                                *
                                * Returns 0 on success, -EINVAL if an item already exists at
                                * @offset, or the error returned by btrfs_search_slot() or
                                * btrfs_insert_hole_extent().
                                */
    2055           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2056           0 :         struct btrfs_root *root = inode->root;
    2057           0 :         struct extent_buffer *leaf;
    2058           0 :         struct btrfs_file_extent_item *fi;
    2059           0 :         struct extent_map *hole_em;
    2060           0 :         struct btrfs_key key;
    2061           0 :         int ret;
    2062             : 
    2063           0 :         if (btrfs_fs_incompat(fs_info, NO_HOLES))
    2064           0 :                 goto out;
    2065             : 
    2066           0 :         key.objectid = btrfs_ino(inode);
    2067           0 :         key.type = BTRFS_EXTENT_DATA_KEY;
    2068           0 :         key.offset = offset;
    2069             : 
    2070           0 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2071           0 :         if (ret <= 0) {
    2072             :                 /*
    2073             :                  * We should have dropped this offset, so if we find it then
    2074             :                  * something has gone horribly wrong.
    2075             :                  */
    2076           0 :                 if (ret == 0)
    2077           0 :                         ret = -EINVAL;
    2078           0 :                 return ret;
    2079             :         }
    2080             : 
    2081           0 :         leaf = path->nodes[0];
                               /* Try to grow the hole item that ends where our range starts. */
    2082           0 :         if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
    2083           0 :                 u64 num_bytes;
    2084             : 
    2085           0 :                 path->slots[0]--;
    2086           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
    2087             :                                     struct btrfs_file_extent_item);
    2088           0 :                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
    2089             :                         end - offset;
    2090           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
    2091           0 :                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
    2092           0 :                 btrfs_set_file_extent_offset(leaf, fi, 0);
    2093           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
    2094           0 :                 btrfs_mark_buffer_dirty(leaf);
    2095           0 :                 goto out;
    2096             :         }
    2097             : 
                               /*
                                * Otherwise try the item at the search slot: move its key back
                                * to @offset and grow it by the length of our range.
                                */
    2098           0 :         if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
    2099           0 :                 u64 num_bytes;
    2100             : 
    2101           0 :                 key.offset = offset;
    2102           0 :                 btrfs_set_item_key_safe(fs_info, path, &key);
    2103           0 :                 fi = btrfs_item_ptr(leaf, path->slots[0],
    2104             :                                     struct btrfs_file_extent_item);
    2105           0 :                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
    2106             :                         offset;
    2107           0 :                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
    2108           0 :                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
    2109           0 :                 btrfs_set_file_extent_offset(leaf, fi, 0);
    2110           0 :                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
    2111           0 :                 btrfs_mark_buffer_dirty(leaf);
    2112           0 :                 goto out;
    2113             :         }
    2114           0 :         btrfs_release_path(path);
    2115             : 
                               /* Nothing to merge with, insert a brand new hole item. */
    2116           0 :         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
    2117             :                                        end - offset);
    2118           0 :         if (ret)
    2119             :                 return ret;
    2120             : 
    2121           0 : out:
    2122           0 :         btrfs_release_path(path);
    2123             : 
                               /*
                                * Update the cached extent maps. Failures here are not
                                * returned to the caller: the inode is flagged as needing a
                                * full sync instead and 0 is returned.
                                */
    2124           0 :         hole_em = alloc_extent_map();
    2125           0 :         if (!hole_em) {
    2126           0 :                 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
    2127           0 :                 btrfs_set_inode_full_sync(inode);
    2128             :         } else {
    2129           0 :                 hole_em->start = offset;
    2130           0 :                 hole_em->len = end - offset;
    2131           0 :                 hole_em->ram_bytes = hole_em->len;
    2132           0 :                 hole_em->orig_start = offset;
    2133             : 
    2134           0 :                 hole_em->block_start = EXTENT_MAP_HOLE;
    2135           0 :                 hole_em->block_len = 0;
    2136           0 :                 hole_em->orig_block_len = 0;
    2137           0 :                 hole_em->compress_type = BTRFS_COMPRESS_NONE;
    2138           0 :                 hole_em->generation = trans->transid;
    2139             : 
    2140           0 :                 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
    2141           0 :                 free_extent_map(hole_em);
    2142           0 :                 if (ret)
    2143           0 :                         btrfs_set_inode_full_sync(inode);
    2144             :         }
    2145             : 
    2146             :         return 0;
    2147             : }
    2148             : 
    2149             : /*
    2150             :  * Fetch the extent map for the sector aligned range described by *start and
    2151             :  * *len. If it is a hole, move *start to the end of that hole and set *len to
    2152             :  * what is left of the original range past it (0 if nothing is left).
    2153             :  * Return 1 if a hole was found, 0 if not, or a negative errno on failure.
    2154             :  */
    2155           0 : static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
    2156             : {
    2157           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    2158           0 :         struct extent_map *em;
    2159           0 :         int ret = 0;
    2160             : 
    2161           0 :         em = btrfs_get_extent(inode, NULL, 0,
    2162           0 :                               round_down(*start, fs_info->sectorsize),
    2163           0 :                               round_up(*len, fs_info->sectorsize));
    2164           0 :         if (IS_ERR(em))
    2165           0 :                 return PTR_ERR(em);
    2166             : 
    2167             :         /* Hole or vacuum extent(only exists in no-hole mode) */
    2168           0 :         if (em->block_start == EXTENT_MAP_HOLE) {
    2169           0 :                 ret = 1;
                                       /*
                                        * Compute the new length before *start is updated, the
                                        * old range end (*start + *len) is needed for it.
                                        */
    2170           0 :                 *len = em->start + em->len > *start + *len ?
    2171           0 :                        0 : *start + *len - em->start - em->len;
    2172           0 :                 *start = em->start + em->len;
    2173             :         }
    2174           0 :         free_extent_map(em);
    2175           0 :         return ret;
    2176             : }
    2177             : 
    2178           0 : static void btrfs_punch_hole_lock_range(struct inode *inode,
    2179             :                                         const u64 lockstart,
    2180             :                                         const u64 lockend,
    2181             :                                         struct extent_state **cached_state)
    2182             : {
                               /*
                                * Truncate the page cache for [lockstart, lockend] and lock that
                                * range in the inode's io_tree, repeating both steps until no
                                * page is left in the page aligned part of the range. The
                                * extent range is still locked when this function returns.
                                */
    2183             :         /*
    2184             :          * For subpage case, if the range is not at page boundary, we could
    2185             :          * have pages at the leading/trailing part of the range.
    2186             :          * This could lead to an endless loop since filemap_range_has_page()
    2187             :          * will always return true.
    2188             :          * So here we need to do extra page alignment for
    2189             :          * filemap_range_has_page().
    2190             :          */
    2191           0 :         const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
    2192           0 :         const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
    2193             : 
    2194           0 :         while (1) {
    2195           0 :                 truncate_pagecache_range(inode, lockstart, lockend);
    2196             : 
    2197           0 :                 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2198             :                             cached_state);
    2199             :                 /*
    2200             :                  * We can't have ordered extents in the range, nor dirty/writeback
    2201             :                  * pages, because we have locked the inode's VFS lock in exclusive
    2202             :                  * mode, we have locked the inode's i_mmap_lock in exclusive mode,
    2203             :                  * we have flushed all delalloc in the range and we have waited
    2204             :                  * for any ordered extents in the range to complete.
    2205             :                  * We can race with anyone reading pages from this range, so after
    2206             :                  * locking the range check if we have pages in the range, and if
    2207             :                  * we do, unlock the range and retry.
    2208             :                  */
    2209           0 :                 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
    2210             :                                             page_lockend))
    2211             :                         break;
    2212             : 
    2213           0 :                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2214             :                               cached_state);
    2215             :         }
    2216             : 
    2217           0 :         btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
    2218           0 : }
    2219             : 
    2220           0 : static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
    2221             :                                      struct btrfs_inode *inode,
    2222             :                                      struct btrfs_path *path,
    2223             :                                      struct btrfs_replace_extent_info *extent_info,
    2224             :                                      const u64 replace_len,
    2225             :                                      const u64 bytes_to_drop)
    2226             : {
                               /*
                                * Insert a file extent item for @inode at
                                * @extent_info->file_offset, using @extent_info->extent_buf as
                                * the template and @replace_len as its number of bytes, then
                                * update the inode's byte accounting and the extent's
                                * reference.
                                *
                                * Nothing is done when @replace_len is 0. When the extent is a
                                * hole (disk_offset == 0) and NO_HOLES is enabled, no item is
                                * inserted and only the inode's byte accounting is updated.
                                *
                                * Returns 0 on success or the error of the first failing step.
                                */
    2227           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2228           0 :         struct btrfs_root *root = inode->root;
    2229           0 :         struct btrfs_file_extent_item *extent;
    2230           0 :         struct extent_buffer *leaf;
    2231           0 :         struct btrfs_key key;
    2232           0 :         int slot;
    2233           0 :         struct btrfs_ref ref = { 0 };
    2234           0 :         int ret;
    2235             : 
    2236           0 :         if (replace_len == 0)
    2237             :                 return 0;
    2238             : 
    2239           0 :         if (extent_info->disk_offset == 0 &&
    2240           0 :             btrfs_fs_incompat(fs_info, NO_HOLES)) {
    2241           0 :                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
    2242           0 :                 return 0;
    2243             :         }
    2244             : 
    2245           0 :         key.objectid = btrfs_ino(inode);
    2246           0 :         key.type = BTRFS_EXTENT_DATA_KEY;
    2247           0 :         key.offset = extent_info->file_offset;
    2248           0 :         ret = btrfs_insert_empty_item(trans, root, path, &key,
    2249             :                                       sizeof(struct btrfs_file_extent_item));
    2250           0 :         if (ret)
    2251             :                 return ret;
    2252           0 :         leaf = path->nodes[0];
    2253           0 :         slot = path->slots[0];
                               /* Copy the template item, then fix up offset and length. */
    2254           0 :         write_extent_buffer(leaf, extent_info->extent_buf,
    2255           0 :                             btrfs_item_ptr_offset(leaf, slot),
    2256             :                             sizeof(struct btrfs_file_extent_item));
    2257           0 :         extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
    2258           0 :         ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
    2259           0 :         btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
    2260           0 :         btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
    2261           0 :         if (extent_info->is_new_extent)
    2262           0 :                 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
    2263           0 :         btrfs_mark_buffer_dirty(leaf);
    2264           0 :         btrfs_release_path(path);
    2265             : 
    2266           0 :         ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
    2267             :                                                 replace_len);
    2268           0 :         if (ret)
    2269             :                 return ret;
    2270             : 
    2271             :         /* If it's a hole, nothing more needs to be done. */
    2272           0 :         if (extent_info->disk_offset == 0) {
    2273           0 :                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
    2274           0 :                 return 0;
    2275             :         }
    2276             : 
    2277           0 :         btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
    2278             : 
                               /*
                                * The first insertion of a new extent goes through
                                * btrfs_alloc_reserved_file_extent(); every other case adds a
                                * delayed data reference to the existing extent.
                                */
    2279           0 :         if (extent_info->is_new_extent && extent_info->insertions == 0) {
    2280           0 :                 key.objectid = extent_info->disk_offset;
    2281           0 :                 key.type = BTRFS_EXTENT_ITEM_KEY;
    2282           0 :                 key.offset = extent_info->disk_len;
    2283           0 :                 ret = btrfs_alloc_reserved_file_extent(trans, root,
    2284             :                                                        btrfs_ino(inode),
    2285             :                                                        extent_info->file_offset,
    2286           0 :                                                        extent_info->qgroup_reserved,
    2287             :                                                        &key);
    2288             :         } else {
    2289           0 :                 u64 ref_offset;
    2290             : 
    2291           0 :                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
    2292             :                                        extent_info->disk_offset,
    2293             :                                        extent_info->disk_len, 0);
    2294           0 :                 ref_offset = extent_info->file_offset - extent_info->data_offset;
    2295           0 :                 btrfs_init_data_ref(&ref, root->root_key.objectid,
    2296             :                                     btrfs_ino(inode), ref_offset, 0, false);
    2297           0 :                 ret = btrfs_inc_extent_ref(trans, &ref);
    2298             :         }
    2299             : 
                               /* Note: the counter is bumped even when ret is an error. */
    2300           0 :         extent_info->insertions++;
    2301             : 
    2302           0 :         return ret;
    2303             : }
    2304             : 
    2305             : /*
    2306             :  * The respective range must have been previously locked, as well as the inode.
    2307             :  * The end offset is inclusive (last byte of the range).
    2308             :  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
    2309             :  * the file range with an extent.
    2310             :  * When not punching a hole, we don't want to end up in a state where we dropped
    2311             :  * extents without inserting a new one, so we must abort the transaction to avoid
    2312             :  * a corruption.
    2313             :  */
    2314           0 : int btrfs_replace_file_extents(struct btrfs_inode *inode,
    2315             :                                struct btrfs_path *path, const u64 start,
    2316             :                                const u64 end,
    2317             :                                struct btrfs_replace_extent_info *extent_info,
    2318             :                                struct btrfs_trans_handle **trans_out)
    2319             : {
    2320           0 :         struct btrfs_drop_extents_args drop_args = { 0 };
    2321           0 :         struct btrfs_root *root = inode->root;
    2322           0 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2323           0 :         u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
    2324           0 :         u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
    2325           0 :         struct btrfs_trans_handle *trans = NULL;
    2326           0 :         struct btrfs_block_rsv *rsv;
    2327           0 :         unsigned int rsv_count;
    2328           0 :         u64 cur_offset;
    2329           0 :         u64 len = end - start;
    2330           0 :         int ret = 0;
    2331             : 
    2332           0 :         if (end <= start)
    2333             :                 return -EINVAL;
    2334             : 
    2335           0 :         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
    2336           0 :         if (!rsv) {
    2337           0 :                 ret = -ENOMEM;
    2338           0 :                 goto out;
    2339             :         }
    2340           0 :         rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
    2341           0 :         rsv->failfast = true;
    2342             : 
    2343             :         /*
    2344             :          * 1 - update the inode
    2345             :          * 1 - removing the extents in the range
    2346             :          * 1 - adding the hole extent if no_holes isn't set or if we are
    2347             :          *     replacing the range with a new extent
    2348             :          */
    2349           0 :         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
    2350             :                 rsv_count = 3;
    2351             :         else
    2352           0 :                 rsv_count = 2;
    2353             : 
    2354           0 :         trans = btrfs_start_transaction(root, rsv_count);
    2355           0 :         if (IS_ERR(trans)) {
    2356           0 :                 ret = PTR_ERR(trans);
    2357           0 :                 trans = NULL;
    2358           0 :                 goto out_free;
    2359             :         }
    2360             : 
    2361           0 :         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
    2362             :                                       min_size, false);
    2363           0 :         if (WARN_ON(ret))
    2364           0 :                 goto out_trans;
    2365           0 :         trans->block_rsv = rsv;
    2366             : 
    2367           0 :         cur_offset = start;
    2368           0 :         drop_args.path = path;
    2369           0 :         drop_args.end = end + 1;
    2370           0 :         drop_args.drop_cache = true;
    2371           0 :         while (cur_offset < end) {
    2372           0 :                 drop_args.start = cur_offset;
    2373           0 :                 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
    2374             :                 /* If we are punching a hole decrement the inode's byte count */
    2375           0 :                 if (!extent_info)
    2376           0 :                         btrfs_update_inode_bytes(inode, 0,
    2377             :                                                  drop_args.bytes_found);
    2378           0 :                 if (ret != -ENOSPC) {
    2379             :                         /*
    2380             :                          * The only time we don't want to abort is if we are
    2381             :                          * attempting to clone a partial inline extent, in which
    2382             :                          * case we'll get EOPNOTSUPP.  However if we aren't
    2383             :                          * clone we need to abort no matter what, because if we
    2384             :                          * got EOPNOTSUPP via prealloc then we messed up and
    2385             :                          * need to abort.
    2386             :                          */
    2387           0 :                         if (ret &&
    2388           0 :                             (ret != -EOPNOTSUPP ||
    2389           0 :                              (extent_info && extent_info->is_new_extent)))
    2390           0 :                                 btrfs_abort_transaction(trans, ret);
    2391             :                         break;
    2392             :                 }
    2393             : 
    2394           0 :                 trans->block_rsv = &fs_info->trans_block_rsv;
    2395             : 
    2396           0 :                 if (!extent_info && cur_offset < drop_args.drop_end &&
    2397             :                     cur_offset < ino_size) {
    2398           0 :                         ret = fill_holes(trans, inode, path, cur_offset,
    2399             :                                          drop_args.drop_end);
    2400           0 :                         if (ret) {
    2401             :                                 /*
    2402             :                                  * If we failed then we didn't insert our hole
    2403             :                                  * entries for the area we dropped, so now the
    2404             :                                  * fs is corrupted, so we must abort the
    2405             :                                  * transaction.
    2406             :                                  */
    2407           0 :                                 btrfs_abort_transaction(trans, ret);
    2408           0 :                                 break;
    2409             :                         }
    2410           0 :                 } else if (!extent_info && cur_offset < drop_args.drop_end) {
    2411             :                         /*
    2412             :                          * We are past the i_size here, but since we didn't
    2413             :                          * insert holes we need to clear the mapped area so we
    2414             :                          * know to not set disk_i_size in this area until a new
    2415             :                          * file extent is inserted here.
    2416             :                          */
    2417           0 :                         ret = btrfs_inode_clear_file_extent_range(inode,
    2418             :                                         cur_offset,
    2419             :                                         drop_args.drop_end - cur_offset);
    2420           0 :                         if (ret) {
    2421             :                                 /*
    2422             :                                  * We couldn't clear our area, so we could
    2423             :                                  * presumably adjust up and corrupt the fs, so
    2424             :                                  * we need to abort.
    2425             :                                  */
    2426           0 :                                 btrfs_abort_transaction(trans, ret);
    2427           0 :                                 break;
    2428             :                         }
    2429             :                 }
    2430             : 
    2431           0 :                 if (extent_info &&
    2432           0 :                     drop_args.drop_end > extent_info->file_offset) {
    2433           0 :                         u64 replace_len = drop_args.drop_end -
    2434             :                                           extent_info->file_offset;
    2435             : 
    2436           0 :                         ret = btrfs_insert_replace_extent(trans, inode, path,
    2437             :                                         extent_info, replace_len,
    2438             :                                         drop_args.bytes_found);
    2439           0 :                         if (ret) {
    2440           0 :                                 btrfs_abort_transaction(trans, ret);
    2441           0 :                                 break;
    2442             :                         }
    2443           0 :                         extent_info->data_len -= replace_len;
    2444           0 :                         extent_info->data_offset += replace_len;
    2445           0 :                         extent_info->file_offset += replace_len;
    2446             :                 }
    2447             : 
    2448             :                 /*
    2449             :                  * We are releasing our handle on the transaction, balance the
    2450             :                  * dirty pages of the btree inode and flush delayed items, and
    2451             :                  * then get a new transaction handle, which may now point to a
    2452             :                  * new transaction in case someone else may have committed the
    2453             :                  * transaction we used to replace/drop file extent items. So
    2454             :                  * bump the inode's iversion and update mtime and ctime except
    2455             :                  * if we are called from a dedupe context. This is because a
    2456             :                  * power failure/crash may happen after the transaction is
    2457             :                  * committed and before we finish replacing/dropping all the
    2458             :                  * file extent items we need.
    2459             :                  */
    2460           0 :                 inode_inc_iversion(&inode->vfs_inode);
    2461             : 
    2462           0 :                 if (!extent_info || extent_info->update_times) {
    2463           0 :                         inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
    2464           0 :                         inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
    2465             :                 }
    2466             : 
    2467           0 :                 ret = btrfs_update_inode(trans, root, inode);
    2468           0 :                 if (ret)
    2469             :                         break;
    2470             : 
    2471           0 :                 btrfs_end_transaction(trans);
    2472           0 :                 btrfs_btree_balance_dirty(fs_info);
    2473             : 
    2474           0 :                 trans = btrfs_start_transaction(root, rsv_count);
    2475           0 :                 if (IS_ERR(trans)) {
    2476           0 :                         ret = PTR_ERR(trans);
    2477           0 :                         trans = NULL;
    2478           0 :                         break;
    2479             :                 }
    2480             : 
    2481           0 :                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
    2482             :                                               rsv, min_size, false);
    2483           0 :                 if (WARN_ON(ret))
    2484             :                         break;
    2485           0 :                 trans->block_rsv = rsv;
    2486             : 
    2487           0 :                 cur_offset = drop_args.drop_end;
    2488           0 :                 len = end - cur_offset;
    2489           0 :                 if (!extent_info && len) {
    2490           0 :                         ret = find_first_non_hole(inode, &cur_offset, &len);
    2491           0 :                         if (unlikely(ret < 0))
    2492             :                                 break;
    2493           0 :                         if (ret && !len) {
    2494             :                                 ret = 0;
    2495             :                                 break;
    2496             :                         }
    2497             :                 }
    2498             :         }
    2499             : 
    2500             :         /*
    2501             :          * If we were cloning, force the next fsync to be a full one since we
    2502             :          * replaced (or just dropped in the case of cloning holes when
    2503             :          * NO_HOLES is enabled) file extent items and did not setup new extent
    2504             :          * maps for the replacement extents (or holes).
    2505             :          */
    2506           0 :         if (extent_info && !extent_info->is_new_extent)
    2507           0 :                 btrfs_set_inode_full_sync(inode);
    2508             : 
    2509           0 :         if (ret)
    2510           0 :                 goto out_trans;
    2511             : 
    2512           0 :         trans->block_rsv = &fs_info->trans_block_rsv;
    2513             :         /*
    2514             :          * If we are using the NO_HOLES feature we might already have had a
    2515             :          * hole that overlaps a part of the region [lockstart, lockend] and
    2516             :          * ends at (or beyond) lockend. Since we have no file extent items to
    2517             :          * represent holes, drop_end can be less than lockend and so we must
    2518             :          * make sure we have an extent map representing the existing hole (the
    2519             :          * call to __btrfs_drop_extents() might have dropped the existing extent
    2520             :          * map representing the existing hole), otherwise the fast fsync path
    2521             :          * will not record the existence of the hole region
    2522             :          * [existing_hole_start, lockend].
    2523             :          */
    2524           0 :         if (drop_args.drop_end <= end)
    2525           0 :                 drop_args.drop_end = end + 1;
    2526             :         /*
    2527             :          * Don't insert file hole extent item if it's for a range beyond eof
    2528             :          * (because it's useless) or if it represents a 0 bytes range (when
    2529             :          * cur_offset == drop_end).
    2530             :          */
    2531           0 :         if (!extent_info && cur_offset < ino_size &&
    2532           0 :             cur_offset < drop_args.drop_end) {
    2533           0 :                 ret = fill_holes(trans, inode, path, cur_offset,
    2534             :                                  drop_args.drop_end);
    2535           0 :                 if (ret) {
    2536             :                         /* Same comment as above. */
    2537           0 :                         btrfs_abort_transaction(trans, ret);
    2538           0 :                         goto out_trans;
    2539             :                 }
    2540           0 :         } else if (!extent_info && cur_offset < drop_args.drop_end) {
    2541             :                 /* See the comment in the loop above for the reasoning here. */
    2542           0 :                 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
    2543             :                                         drop_args.drop_end - cur_offset);
    2544           0 :                 if (ret) {
    2545           0 :                         btrfs_abort_transaction(trans, ret);
    2546           0 :                         goto out_trans;
    2547             :                 }
    2548             : 
    2549             :         }
    2550           0 :         if (extent_info) {
    2551           0 :                 ret = btrfs_insert_replace_extent(trans, inode, path,
    2552             :                                 extent_info, extent_info->data_len,
    2553             :                                 drop_args.bytes_found);
    2554           0 :                 if (ret) {
    2555           0 :                         btrfs_abort_transaction(trans, ret);
    2556           0 :                         goto out_trans;
    2557             :                 }
    2558             :         }
    2559             : 
    2560           0 : out_trans:
    2561           0 :         if (!trans)
    2562           0 :                 goto out_free;
    2563             : 
    2564           0 :         trans->block_rsv = &fs_info->trans_block_rsv;
    2565           0 :         if (ret)
    2566           0 :                 btrfs_end_transaction(trans);
    2567             :         else
    2568           0 :                 *trans_out = trans;
    2569           0 : out_free:
    2570           0 :         btrfs_free_block_rsv(fs_info, rsv);
    2571             : out:
    2572             :         return ret;
    2573             : }
    2574             : 
                       /*
                        * Deallocate the byte range [offset, offset + len) of @file. Called by
                        * btrfs_fallocate() when the mode has FALLOC_FL_PUNCH_HOLE set.
                        *
                        * Partial blocks at either end of the range are zeroed in place with
                        * btrfs_truncate_block(); the sector aligned interior [lockstart, lockend]
                        * has its file extents dropped through btrfs_replace_file_extents().
                        *
                        * The inode is locked with BTRFS_ILOCK_MMAP for the whole operation and
                        * unlocked again on every return path.
                        *
                        * Returns 0 on success or a negative errno on failure.
                        */
    2575           0 : static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
    2576             : {
    2577           0 :         struct inode *inode = file_inode(file);
    2578           0 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
    2579           0 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    2580           0 :         struct extent_state *cached_state = NULL;
    2581           0 :         struct btrfs_path *path;
    2582           0 :         struct btrfs_trans_handle *trans = NULL;
    2583           0 :         u64 lockstart;
    2584           0 :         u64 lockend;
    2585           0 :         u64 tail_start;
    2586           0 :         u64 tail_len;
    2587           0 :         u64 orig_start = offset;
    2588           0 :         int ret = 0;
    2589           0 :         bool same_block;
    2590           0 :         u64 ino_size;
    2591           0 :         bool truncated_block = false;
    2592           0 :         bool updated_inode = false;
    2593             : 
    2594           0 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2595             : 
    2596           0 :         ret = btrfs_wait_ordered_range(inode, offset, len);
    2597           0 :         if (ret)
    2598           0 :                 goto out_only_mutex;
    2599             : 
                               /*
                                * i_size rounded up to a sector boundary. Used below to skip zeroing
                                * of blocks that start at or beyond it.
                                */
    2600           0 :         ino_size = round_up(inode->i_size, fs_info->sectorsize);
                               /*
                                * NOTE(review): find_first_non_hole() appears to advance offset/len
                                * past a leading hole (see the offset == orig_start check further
                                * down) - confirm against its definition.
                                */
    2601           0 :         ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
    2602           0 :         if (ret < 0)
    2603           0 :                 goto out_only_mutex;
    2604           0 :         if (ret && !len) {
    2605             :                 /* Already in a large hole */
    2606           0 :                 ret = 0;
    2607           0 :                 goto out_only_mutex;
    2608             :         }
    2609             : 
    2610           0 :         ret = file_modified(file);
    2611           0 :         if (ret)
    2612           0 :                 goto out_only_mutex;
    2613             : 
                               /*
                                * [lockstart, lockend] is the sector aligned sub-range fully contained
                                * in [offset, offset + len). lockend ends up below lockstart when the
                                * range does not contain a whole block.
                                */
    2614           0 :         lockstart = round_up(offset, fs_info->sectorsize);
    2615           0 :         lockend = round_down(offset + len, fs_info->sectorsize) - 1;
    2616           0 :         same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
    2617           0 :                 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
    2618             :         /*
    2619             :          * We needn't truncate any block which is beyond the end of the file
    2620             :          * because we are sure there is no data there.
    2621             :          */
    2622             :         /*
    2623             :          * Only do this if we are in the same block and we aren't doing the
    2624             :          * entire block.
    2625             :          */
    2626           0 :         if (same_block && len < fs_info->sectorsize) {
    2627           0 :                 if (offset < ino_size) {
    2628           0 :                         truncated_block = true;
    2629           0 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
    2630             :                                                    0);
    2631             :                 } else {
    2632             :                         ret = 0;
    2633             :                 }
    2634           0 :                 goto out_only_mutex;
    2635             :         }
    2636             : 
    2637             :         /* zero back part of the first block */
    2638           0 :         if (offset < ino_size) {
    2639           0 :                 truncated_block = true;
    2640           0 :                 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
    2641           0 :                 if (ret) {
                                               /*
                                                * Equivalent to "goto out_only_mutex": with ret != 0
                                                * that label skips the inode update and only unlocks.
                                                */
    2642           0 :                         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2643           0 :                         return ret;
    2644             :                 }
    2645             :         }
    2646             : 
    2647             :         /* Re-check for holes in the aligned pages after the first unaligned
    2648             :          * page. If offset != orig_start, the first unaligned page and several
    2649             :          * following pages were already found to be in holes, so the extra
    2650             :          * check can be skipped. */
    2651           0 :         if (offset == orig_start) {
    2652             :                 /* after truncate page, check hole again */
    2653           0 :                 len = offset + len - lockstart;
    2654           0 :                 offset = lockstart;
    2655           0 :                 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
    2656           0 :                 if (ret < 0)
    2657           0 :                         goto out_only_mutex;
    2658           0 :                 if (ret && !len) {
    2659           0 :                         ret = 0;
    2660           0 :                         goto out_only_mutex;
    2661             :                 }
    2662           0 :                 lockstart = offset;
    2663             :         }
    2664             : 
    2665             :         /* Check the tail unaligned part is in a hole */
                               /* tail_len is what remains of the range after the last whole block. */
    2666           0 :         tail_start = lockend + 1;
    2667           0 :         tail_len = offset + len - tail_start;
    2668           0 :         if (tail_len) {
    2669           0 :                 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
    2670           0 :                 if (unlikely(ret < 0))
    2671           0 :                         goto out_only_mutex;
    2672           0 :                 if (!ret) {
    2673             :                         /* zero the front end of the last page */
    2674           0 :                         if (tail_start + tail_len < ino_size) {
    2675           0 :                                 truncated_block = true;
    2676           0 :                                 ret = btrfs_truncate_block(BTRFS_I(inode),
    2677             :                                                         tail_start + tail_len,
    2678             :                                                         0, 1);
    2679           0 :                                 if (ret)
    2680           0 :                                         goto out_only_mutex;
    2681             :                         }
    2682             :                 }
    2683             :         }
    2684             : 
                               /* No whole block left in the range, so there are no extents to drop. */
    2685           0 :         if (lockend < lockstart) {
    2686           0 :                 ret = 0;
    2687           0 :                 goto out_only_mutex;
    2688             :         }
    2689             : 
    2690           0 :         btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
    2691             : 
    2692           0 :         path = btrfs_alloc_path();
    2693           0 :         if (!path) {
    2694           0 :                 ret = -ENOMEM;
    2695           0 :                 goto out;
    2696             :         }
    2697             : 
                               /*
                                * A NULL extent_info is passed, i.e. no replacement extent. On
                                * success the transaction is handed back through trans and must be
                                * ended here.
                                */
    2698           0 :         ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
    2699             :                                          lockend, NULL, &trans);
    2700           0 :         btrfs_free_path(path);
    2701           0 :         if (ret)
    2702           0 :                 goto out;
    2703             : 
    2704           0 :         ASSERT(trans != NULL);
    2705           0 :         inode_inc_iversion(inode);
    2706           0 :         inode->i_mtime = current_time(inode);
    2707           0 :         inode->i_ctime = inode->i_mtime;
    2708           0 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    2709           0 :         updated_inode = true;
                               /*
                                * NOTE(review): the return value of btrfs_end_transaction() is not
                                * checked here, unlike in the out_only_mutex path below.
                                */
    2710           0 :         btrfs_end_transaction(trans);
    2711           0 :         btrfs_btree_balance_dirty(fs_info);
                       /* Only reached after btrfs_punch_hole_lock_range() locked the range. */
    2712           0 : out:
    2713           0 :         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2714             :                       &cached_state);
    2715           0 : out_only_mutex:
    2716           0 :         if (!updated_inode && truncated_block && !ret) {
    2717             :                 /*
    2718             :                  * If we only end up zeroing part of a page, we still need to
    2719             :                  * update the inode item, so that all the time fields are
    2720             :                  * updated as well as the necessary btrfs inode in memory fields
    2721             :                  * for detecting, at fsync time, if the inode isn't yet in the
    2722             :                  * log tree or it's there but not up to date.
    2723             :                  */
    2724           0 :                 struct timespec64 now = current_time(inode);
    2725             : 
    2726           0 :                 inode_inc_iversion(inode);
    2727           0 :                 inode->i_mtime = now;
    2728           0 :                 inode->i_ctime = now;
    2729           0 :                 trans = btrfs_start_transaction(root, 1);
    2730           0 :                 if (IS_ERR(trans)) {
    2731           0 :                         ret = PTR_ERR(trans);
    2732             :                 } else {
    2733           0 :                         int ret2;
    2734             : 
                                               /* Report the first error of the two calls. */
    2735           0 :                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    2736           0 :                         ret2 = btrfs_end_transaction(trans);
    2737           0 :                         if (!ret)
    2738           0 :                                 ret = ret2;
    2739             :                 }
    2740             :         }
    2741           0 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    2742           0 :         return ret;
    2743             : }
    2744             : 
    2745             : /*
                        * Helper structure to record which range is already reserved.
                        *
                        * Entries are allocated, linked and merged by add_falloc_range().
                        */
    2746             : struct falloc_range {
    2747             :         struct list_head list;  /* link in the list of recorded ranges */
    2748             :         u64 start;              /* first offset covered by this range */
    2749             :         u64 len;                /* length of the range, so it ends at start + len */
    2750             : };
    2751             : 
    2752             : /*
    2753             :  * Helper function to add falloc range
    2754             :  *
    2755             :  * Caller should have locked the larger range of extent containing
    2756             :  * [start, len)
                        *
                        * If the last entry on @head ends exactly at @start, that entry is grown
                        * by @len. Otherwise a new entry is allocated with GFP_KERNEL and appended
                        * to the tail of @head. Entries are never freed here.
                        *
                        * Returns 0 on success or -ENOMEM if a new entry could not be allocated.
    2757             :  */
    2758           0 : static int add_falloc_range(struct list_head *head, u64 start, u64 len)
    2759             : {
    2760           0 :         struct falloc_range *range = NULL;
    2761             : 
    2762           0 :         if (!list_empty(head)) {
    2763             :                 /*
    2764             :                  * As fallocate iterates by bytenr order, we only need to check
    2765             :                  * the last range.
    2766             :                  */
    2767           0 :                 range = list_last_entry(head, struct falloc_range, list);
                                       /* Contiguous with the last range: merge instead of allocating. */
    2768           0 :                 if (range->start + range->len == start) {
    2769           0 :                         range->len += len;
    2770           0 :                         return 0;
    2771             :                 }
    2772             :         }
    2773             : 
    2774           0 :         range = kmalloc(sizeof(*range), GFP_KERNEL);
    2775           0 :         if (!range)
    2776             :                 return -ENOMEM;
    2777           0 :         range->start = start;
    2778           0 :         range->len = len;
    2779           0 :         list_add_tail(&range->list, head);
    2780           0 :         return 0;
    2781             : }
    2782             : 
                       /*
                        * Grow the inode's i_size to @end after a fallocate operation, if needed.
                        *
                        * Nothing is done when @mode has FALLOC_FL_KEEP_SIZE set or when @end does
                        * not exceed the current i_size. Otherwise ctime and i_size are updated and
                        * the inode is passed to btrfs_update_inode() inside a new transaction.
                        *
                        * Returns 0 on success or when there was nothing to do. On failure returns
                        * the error from btrfs_start_transaction(), or else from
                        * btrfs_update_inode(), or else from btrfs_end_transaction().
                        */
    2783           0 : static int btrfs_fallocate_update_isize(struct inode *inode,
    2784             :                                         const u64 end,
    2785             :                                         const int mode)
    2786             : {
    2787           0 :         struct btrfs_trans_handle *trans;
    2788           0 :         struct btrfs_root *root = BTRFS_I(inode)->root;
    2789           0 :         int ret;
    2790           0 :         int ret2;
    2791             : 
    2792           0 :         if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
    2793             :                 return 0;
    2794             : 
    2795           0 :         trans = btrfs_start_transaction(root, 1);
    2796           0 :         if (IS_ERR(trans))
    2797           0 :                 return PTR_ERR(trans);
    2798             : 
    2799           0 :         inode->i_ctime = current_time(inode);
    2800           0 :         i_size_write(inode, end);
    2801           0 :         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
    2802           0 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
                               /* The transaction is always ended, even if the update failed. */
    2803           0 :         ret2 = btrfs_end_transaction(trans);
    2804             : 
    2805           0 :         return ret ? ret : ret2;
    2806             : }
    2807             : 
                       /*
                        * Classification returned by btrfs_zero_range_check_range_boundary() for the
                        * block that contains a range boundary. All values are non-negative, so the
                        * caller can tell them apart from a negative error return.
                        */
    2808             : enum {
    2809             :         RANGE_BOUNDARY_WRITTEN_EXTENT,
    2810             :         RANGE_BOUNDARY_PREALLOC_EXTENT,
    2811             :         RANGE_BOUNDARY_HOLE,
    2812             : };
    2813             : 
                       /*
                        * Classify the block of @inode that contains @offset.
                        *
                        * @offset is rounded down to a sector boundary and the extent map covering
                        * that single sector is looked up with btrfs_get_extent().
                        *
                        * Returns RANGE_BOUNDARY_HOLE if the map is a hole,
                        * RANGE_BOUNDARY_PREALLOC_EXTENT if it has EXTENT_FLAG_PREALLOC set,
                        * RANGE_BOUNDARY_WRITTEN_EXTENT otherwise, or a negative errno if the
                        * lookup failed.
                        */
    2814           0 : static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
    2815             :                                                  u64 offset)
    2816             : {
    2817           0 :         const u64 sectorsize = inode->root->fs_info->sectorsize;
    2818           0 :         struct extent_map *em;
    2819           0 :         int ret;
    2820             : 
    2821           0 :         offset = round_down(offset, sectorsize);
    2822           0 :         em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
    2823           0 :         if (IS_ERR(em))
    2824           0 :                 return PTR_ERR(em);
    2825             : 
                               /* The hole check comes first, then the prealloc flag. */
    2826           0 :         if (em->block_start == EXTENT_MAP_HOLE)
    2827             :                 ret = RANGE_BOUNDARY_HOLE;
    2828           0 :         else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
    2829             :                 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
    2830             :         else
    2831           0 :                 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
    2832             : 
    2833           0 :         free_extent_map(em);
    2834           0 :         return ret;
    2835             : }
    2836             : 
                       /*
                        * Zero the byte range [offset, offset + len) of @inode.
                        * NOTE(review): presumably the FALLOC_FL_ZERO_RANGE path of
                        * btrfs_fallocate(); the call site is not visible from here - confirm.
                        *
                        * A leading part of the range that is already a prealloc extent is skipped.
                        * Partial blocks at the boundaries that map to written extents are zeroed
                        * with btrfs_truncate_block(). The remaining [alloc_start, alloc_end) is
                        * handed to btrfs_prealloc_file_range() after reserving data space and
                        * qgroup space for it. Finally i_size is updated through
                        * btrfs_fallocate_update_isize(), which honours FALLOC_FL_KEEP_SIZE in
                        * @mode.
                        *
                        * Returns 0 on success or a negative errno on failure.
                        */
    2837           0 : static int btrfs_zero_range(struct inode *inode,
    2838             :                             loff_t offset,
    2839             :                             loff_t len,
    2840             :                             const int mode)
    2841             : {
    2842           0 :         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
    2843           0 :         struct extent_map *em;
    2844           0 :         struct extent_changeset *data_reserved = NULL;
    2845           0 :         int ret;
    2846           0 :         u64 alloc_hint = 0;
    2847           0 :         const u64 sectorsize = fs_info->sectorsize;
    2848           0 :         u64 alloc_start = round_down(offset, sectorsize);
    2849           0 :         u64 alloc_end = round_up(offset + len, sectorsize);
    2850           0 :         u64 bytes_to_reserve = 0;
    2851           0 :         bool space_reserved = false;
    2852             : 
    2853           0 :         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
    2854             :                               alloc_end - alloc_start);
    2855           0 :         if (IS_ERR(em)) {
    2856           0 :                 ret = PTR_ERR(em);
    2857           0 :                 goto out;
    2858             :         }
    2859             : 
    2860             :         /*
    2861             :          * Avoid hole punching and extent allocation for some cases. More cases
    2862             :          * could be considered, but these are unlikely common and we keep things
    2863             :          * as simple as possible for now. Also, intentionally, if the target
    2864             :          * range contains one or more prealloc extents together with regular
    2865             :          * extents and holes, we drop all the existing extents and allocate a
    2866             :          * new prealloc extent, so that we get a larger contiguous disk extent.
    2867             :          */
    2868           0 :         if (em->start <= alloc_start &&
    2869           0 :             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
    2870           0 :                 const u64 em_end = em->start + em->len;
    2871             : 
    2872           0 :                 if (em_end >= offset + len) {
    2873             :                         /*
    2874             :                          * The whole range is already a prealloc extent,
    2875             :                          * do nothing except updating the inode's i_size if
    2876             :                          * needed.
    2877             :                          */
    2878           0 :                         free_extent_map(em);
    2879           0 :                         ret = btrfs_fallocate_update_isize(inode, offset + len,
    2880             :                                                            mode);
    2881           0 :                         goto out;
    2882             :                 }
    2883             :                 /*
    2884             :                  * Part of the range is already a prealloc extent, so operate
    2885             :                  * only on the remaining part of the range.
    2886             :                  */
    2887           0 :                 alloc_start = em_end;
    2888           0 :                 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
    2889           0 :                 len = offset + len - alloc_start;
    2890           0 :                 offset = alloc_start;
                                       /* Later passed to btrfs_prealloc_file_range() as the hint. */
    2891           0 :                 alloc_hint = em->block_start + em->len;
    2892             :         }
    2893           0 :         free_extent_map(em);
    2894             : 
                               /* The (remaining) range starts and ends within the same block. */
    2895           0 :         if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
    2896           0 :             BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
    2897           0 :                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
    2898             :                                       sectorsize);
    2899           0 :                 if (IS_ERR(em)) {
    2900           0 :                         ret = PTR_ERR(em);
    2901           0 :                         goto out;
    2902             :                 }
    2903             : 
                                       /* Block is a prealloc extent: only i_size may need updating. */
    2904           0 :                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
    2905           0 :                         free_extent_map(em);
    2906           0 :                         ret = btrfs_fallocate_update_isize(inode, offset + len,
    2907             :                                                            mode);
    2908           0 :                         goto out;
    2909             :                 }
                                       /*
                                        * Less than a full block and not a hole: handle it in place
                                        * with btrfs_truncate_block() instead of allocating.
                                        */
    2910           0 :                 if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
    2911           0 :                         free_extent_map(em);
    2912           0 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
    2913             :                                                    0);
    2914           0 :                         if (!ret)
    2915           0 :                                 ret = btrfs_fallocate_update_isize(inode,
    2916             :                                                                    offset + len,
    2917             :                                                                    mode);
                                               /*
                                                * Skipping the out label is harmless here: nothing has
                                                * been reserved yet (space_reserved is false and
                                                * data_reserved is still NULL).
                                                */
    2918           0 :                         return ret;
    2919             :                 }
    2920           0 :                 free_extent_map(em);
                                       /* Otherwise allocate the whole block containing the range. */
    2921           0 :                 alloc_start = round_down(offset, sectorsize);
    2922           0 :                 alloc_end = alloc_start + sectorsize;
    2923           0 :                 goto reserve_space;
    2924             :         }
    2925             : 
                               /* Start from the blocks fully inside the range; widened below. */
    2926           0 :         alloc_start = round_up(offset, sectorsize);
    2927           0 :         alloc_end = round_down(offset + len, sectorsize);
    2928             : 
    2929             :         /*
    2930             :          * For unaligned ranges, check the pages at the boundaries, they might
    2931             :          * map to an extent, in which case we need to partially zero them, or
    2932             :          * they might map to a hole, in which case we need our allocation range
    2933             :          * to cover them.
    2934             :          */
    2935           0 :         if (!IS_ALIGNED(offset, sectorsize)) {
    2936           0 :                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
    2937             :                                                             offset);
    2938           0 :                 if (ret < 0)
    2939           0 :                         goto out;
    2940           0 :                 if (ret == RANGE_BOUNDARY_HOLE) {
    2941           0 :                         alloc_start = round_down(offset, sectorsize);
    2942           0 :                         ret = 0;
    2943           0 :                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
    2944           0 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
    2945           0 :                         if (ret)
    2946           0 :                                 goto out;
    2947             :                 } else {
                                               /* RANGE_BOUNDARY_PREALLOC_EXTENT: block left untouched. */
    2948             :                         ret = 0;
    2949             :                 }
    2950             :         }
    2951             : 
    2952           0 :         if (!IS_ALIGNED(offset + len, sectorsize)) {
    2953           0 :                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
    2954             :                                                             offset + len);
    2955           0 :                 if (ret < 0)
    2956           0 :                         goto out;
    2957           0 :                 if (ret == RANGE_BOUNDARY_HOLE) {
    2958           0 :                         alloc_end = round_up(offset + len, sectorsize);
    2959           0 :                         ret = 0;
    2960           0 :                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
    2961           0 :                         ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
    2962             :                                                    0, 1);
    2963           0 :                         if (ret)
    2964           0 :                                 goto out;
    2965             :                 } else {
                                               /* RANGE_BOUNDARY_PREALLOC_EXTENT: block left untouched. */
    2966             :                         ret = 0;
    2967             :                 }
    2968             :         }
    2969             : 
    2970           0 : reserve_space:
                               /* An empty allocation range only needs the i_size update below. */
    2971           0 :         if (alloc_start < alloc_end) {
    2972           0 :                 struct extent_state *cached_state = NULL;
    2973           0 :                 const u64 lockstart = alloc_start;
    2974           0 :                 const u64 lockend = alloc_end - 1;
    2975             : 
    2976           0 :                 bytes_to_reserve = alloc_end - alloc_start;
    2977           0 :                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
    2978             :                                                       bytes_to_reserve);
    2979           0 :                 if (ret < 0)
    2980           0 :                         goto out;
                                       /* From here on the out label releases the space on error. */
    2981           0 :                 space_reserved = true;
    2982           0 :                 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
    2983             :                                             &cached_state);
    2984           0 :                 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
    2985             :                                                 alloc_start, bytes_to_reserve);
    2986           0 :                 if (ret) {
    2987           0 :                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
    2988             :                                       lockend, &cached_state);
    2989           0 :                         goto out;
    2990             :                 }
    2991           0 :                 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
    2992             :                                                 alloc_end - alloc_start,
    2993             :                                                 i_blocksize(inode),
    2994             :                                                 offset + len, &alloc_hint);
    2995           0 :                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
    2996             :                               &cached_state);
    2997             :                 /* btrfs_prealloc_file_range releases reserved space on error */
    2998           0 :                 if (ret) {
    2999           0 :                         space_reserved = false;
    3000           0 :                         goto out;
    3001             :                 }
    3002             :         }
    3003           0 :         ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
    3004           0 :  out:
                               /*
                                * Also reached when the final i_size update fails after a successful
                                * preallocation, in which case space_reserved is still true.
                                */
    3005           0 :         if (ret && space_reserved)
    3006           0 :                 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
    3007             :                                                alloc_start, bytes_to_reserve);
    3008           0 :         extent_changeset_free(data_reserved);
    3009             : 
    3010           0 :         return ret;
    3011             : }
    3012             : 
    3013           0 : static long btrfs_fallocate(struct file *file, int mode,
    3014             :                             loff_t offset, loff_t len)
    3015             : {
    3016           0 :         struct inode *inode = file_inode(file);
    3017           0 :         struct extent_state *cached_state = NULL;
    3018           0 :         struct extent_changeset *data_reserved = NULL;
    3019           0 :         struct falloc_range *range;
    3020           0 :         struct falloc_range *tmp;
    3021           0 :         struct list_head reserve_list;
    3022           0 :         u64 cur_offset;
    3023           0 :         u64 last_byte;
    3024           0 :         u64 alloc_start;
    3025           0 :         u64 alloc_end;
    3026           0 :         u64 alloc_hint = 0;
    3027           0 :         u64 locked_end;
    3028           0 :         u64 actual_end = 0;
    3029           0 :         u64 data_space_needed = 0;
    3030           0 :         u64 data_space_reserved = 0;
    3031           0 :         u64 qgroup_reserved = 0;
    3032           0 :         struct extent_map *em;
    3033           0 :         int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
    3034           0 :         int ret;
    3035             : 
    3036             :         /* Do not allow fallocate in ZONED mode */
    3037           0 :         if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
    3038             :                 return -EOPNOTSUPP;
    3039             : 
    3040           0 :         alloc_start = round_down(offset, blocksize);
    3041           0 :         alloc_end = round_up(offset + len, blocksize);
    3042           0 :         cur_offset = alloc_start;
    3043             : 
    3044             :         /* Make sure we aren't being give some crap mode */
    3045           0 :         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
    3046             :                      FALLOC_FL_ZERO_RANGE))
    3047             :                 return -EOPNOTSUPP;
    3048             : 
    3049           0 :         if (mode & FALLOC_FL_PUNCH_HOLE)
    3050           0 :                 return btrfs_punch_hole(file, offset, len);
    3051             : 
    3052           0 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3053             : 
    3054           0 :         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
    3055           0 :                 ret = inode_newsize_ok(inode, offset + len);
    3056           0 :                 if (ret)
    3057           0 :                         goto out;
    3058             :         }
    3059             : 
    3060           0 :         ret = file_modified(file);
    3061           0 :         if (ret)
    3062           0 :                 goto out;
    3063             : 
    3064             :         /*
    3065             :          * TODO: Move these two operations after we have checked
    3066             :          * accurate reserved space, or fallocate can still fail but
    3067             :          * with page truncated or size expanded.
    3068             :          *
    3069             :          * But that's a minor problem and won't do much harm BTW.
    3070             :          */
    3071           0 :         if (alloc_start > inode->i_size) {
    3072           0 :                 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
    3073             :                                         alloc_start);
    3074           0 :                 if (ret)
    3075           0 :                         goto out;
    3076           0 :         } else if (offset + len > inode->i_size) {
    3077             :                 /*
    3078             :                  * If we are fallocating from the end of the file onward we
    3079             :                  * need to zero out the end of the block if i_size lands in the
    3080             :                  * middle of a block.
    3081             :                  */
    3082           0 :                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
    3083           0 :                 if (ret)
    3084           0 :                         goto out;
    3085             :         }
    3086             : 
    3087             :         /*
    3088             :          * We have locked the inode at the VFS level (in exclusive mode) and we
    3089             :          * have locked the i_mmap_lock lock (in exclusive mode). Now before
    3090             :          * locking the file range, flush all delalloc in the range and wait for
    3091             :          * all ordered extents in the range to complete. After this we can lock
    3092             :          * the file range and, due to the previous locking we did, we know there
    3093             :          * can't be more delalloc or ordered extents in the range.
    3094             :          */
    3095           0 :         ret = btrfs_wait_ordered_range(inode, alloc_start,
    3096             :                                        alloc_end - alloc_start);
    3097           0 :         if (ret)
    3098           0 :                 goto out;
    3099             : 
    3100           0 :         if (mode & FALLOC_FL_ZERO_RANGE) {
    3101           0 :                 ret = btrfs_zero_range(inode, offset, len, mode);
    3102           0 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3103           0 :                 return ret;
    3104             :         }
    3105             : 
    3106           0 :         locked_end = alloc_end - 1;
    3107           0 :         lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
    3108             :                     &cached_state);
    3109             : 
    3110           0 :         btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
    3111             : 
    3112             :         /* First, check if we exceed the qgroup limit */
    3113           0 :         INIT_LIST_HEAD(&reserve_list);
    3114           0 :         while (cur_offset < alloc_end) {
    3115           0 :                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
    3116             :                                       alloc_end - cur_offset);
    3117           0 :                 if (IS_ERR(em)) {
    3118           0 :                         ret = PTR_ERR(em);
    3119           0 :                         break;
    3120             :                 }
    3121           0 :                 last_byte = min(extent_map_end(em), alloc_end);
    3122           0 :                 actual_end = min_t(u64, extent_map_end(em), offset + len);
    3123           0 :                 last_byte = ALIGN(last_byte, blocksize);
    3124           0 :                 if (em->block_start == EXTENT_MAP_HOLE ||
    3125           0 :                     (cur_offset >= inode->i_size &&
    3126           0 :                      !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
    3127           0 :                         const u64 range_len = last_byte - cur_offset;
    3128             : 
    3129           0 :                         ret = add_falloc_range(&reserve_list, cur_offset, range_len);
    3130           0 :                         if (ret < 0) {
    3131           0 :                                 free_extent_map(em);
    3132           0 :                                 break;
    3133             :                         }
    3134           0 :                         ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
    3135             :                                         &data_reserved, cur_offset, range_len);
    3136           0 :                         if (ret < 0) {
    3137           0 :                                 free_extent_map(em);
    3138           0 :                                 break;
    3139             :                         }
    3140           0 :                         qgroup_reserved += range_len;
    3141           0 :                         data_space_needed += range_len;
    3142             :                 }
    3143           0 :                 free_extent_map(em);
    3144           0 :                 cur_offset = last_byte;
    3145             :         }
    3146             : 
    3147           0 :         if (!ret && data_space_needed > 0) {
    3148             :                 /*
    3149             :                  * We are safe to reserve space here as we can't have delalloc
    3150             :                  * in the range, see above.
    3151             :                  */
    3152           0 :                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
    3153             :                                                       data_space_needed);
    3154           0 :                 if (!ret)
    3155           0 :                         data_space_reserved = data_space_needed;
    3156             :         }
    3157             : 
    3158             :         /*
    3159             :          * If ret is still 0, means we're OK to fallocate.
    3160             :          * Or just cleanup the list and exit.
    3161             :          */
    3162           0 :         list_for_each_entry_safe(range, tmp, &reserve_list, list) {
    3163           0 :                 if (!ret) {
    3164           0 :                         ret = btrfs_prealloc_file_range(inode, mode,
    3165             :                                         range->start,
    3166             :                                         range->len, i_blocksize(inode),
    3167             :                                         offset + len, &alloc_hint);
    3168             :                         /*
    3169             :                          * btrfs_prealloc_file_range() releases space even
    3170             :                          * if it returns an error.
    3171             :                          */
    3172           0 :                         data_space_reserved -= range->len;
    3173           0 :                         qgroup_reserved -= range->len;
    3174           0 :                 } else if (data_space_reserved > 0) {
    3175           0 :                         btrfs_free_reserved_data_space(BTRFS_I(inode),
    3176             :                                                data_reserved, range->start,
    3177             :                                                range->len);
    3178           0 :                         data_space_reserved -= range->len;
    3179           0 :                         qgroup_reserved -= range->len;
    3180           0 :                 } else if (qgroup_reserved > 0) {
    3181           0 :                         btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
    3182             :                                                range->start, range->len);
    3183           0 :                         qgroup_reserved -= range->len;
    3184             :                 }
    3185           0 :                 list_del(&range->list);
    3186           0 :                 kfree(range);
    3187             :         }
    3188           0 :         if (ret < 0)
    3189           0 :                 goto out_unlock;
    3190             : 
    3191             :         /*
    3192             :          * We didn't need to allocate any more space, but we still extended the
    3193             :          * size of the file so we need to update i_size and the inode item.
    3194             :          */
    3195           0 :         ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
    3196           0 : out_unlock:
    3197           0 :         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
    3198             :                       &cached_state);
    3199           0 : out:
    3200           0 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
    3201           0 :         extent_changeset_free(data_reserved);
    3202           0 :         return ret;
    3203             : }
    3204             : 
    3205             : /*
    3206             :  * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
    3207             :  * that has unflushed and/or flushing delalloc. There might be other adjacent
    3208             :  * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
    3209             :  * looping while it gets adjacent subranges, and merging them together.
    3210             :  */
    3211           0 : static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
    3212             :                                    struct extent_state **cached_state,
    3213             :                                    bool *search_io_tree,
    3214             :                                    u64 *delalloc_start_ret, u64 *delalloc_end_ret)
    3215             : {
    3216           0 :         u64 len = end + 1 - start;
    3217           0 :         u64 delalloc_len = 0;
    3218           0 :         struct btrfs_ordered_extent *oe;
    3219           0 :         u64 oe_start;
    3220           0 :         u64 oe_end;
    3221             : 
    3222             :         /*
    3223             :          * Search the io tree first for EXTENT_DELALLOC. If we find any, it
    3224             :          * means we have delalloc (dirty pages) for which writeback has not
    3225             :          * started yet.
    3226             :          */
    3227           0 :         if (*search_io_tree) {
    3228           0 :                 spin_lock(&inode->lock);
    3229           0 :                 if (inode->delalloc_bytes > 0) {
    3230           0 :                         spin_unlock(&inode->lock);
    3231           0 :                         *delalloc_start_ret = start;
    3232           0 :                         delalloc_len = count_range_bits(&inode->io_tree,
    3233             :                                                         delalloc_start_ret, end,
    3234             :                                                         len, EXTENT_DELALLOC, 1,
    3235             :                                                         cached_state);
    3236             :                 } else {
    3237           0 :                         spin_unlock(&inode->lock);
    3238             :                 }
    3239             :         }
    3240             : 
    3241           0 :         if (delalloc_len > 0) {
    3242             :                 /*
    3243             :                  * If delalloc was found then *delalloc_start_ret has a sector size
    3244             :                  * aligned value (rounded down).
    3245             :                  */
    3246           0 :                 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
    3247             : 
    3248           0 :                 if (*delalloc_start_ret == start) {
    3249             :                         /* Delalloc for the whole range, nothing more to do. */
    3250           0 :                         if (*delalloc_end_ret == end)
    3251             :                                 return true;
    3252             :                         /* Else trim our search range for ordered extents. */
    3253           0 :                         start = *delalloc_end_ret + 1;
    3254           0 :                         len = end + 1 - start;
    3255             :                 }
    3256             :         } else {
    3257             :                 /* No delalloc, future calls don't need to search again. */
    3258           0 :                 *search_io_tree = false;
    3259             :         }
    3260             : 
    3261             :         /*
    3262             :          * Now also check if there's any ordered extent in the range.
    3263             :          * We do this because:
    3264             :          *
    3265             :          * 1) When delalloc is flushed, the file range is locked, we clear the
    3266             :          *    EXTENT_DELALLOC bit from the io tree and create an extent map and
    3267             :          *    an ordered extent for the write. So we might just have been called
    3268             :          *    after delalloc is flushed and before the ordered extent completes
    3269             :          *    and inserts the new file extent item in the subvolume's btree;
    3270             :          *
    3271             :          * 2) We may have an ordered extent created by flushing delalloc for a
    3272             :          *    subrange that starts before the subrange we found marked with
    3273             :          *    EXTENT_DELALLOC in the io tree.
    3274             :          *
    3275             :          * We could also use the extent map tree to find such delalloc that is
    3276             :          * being flushed, but using the ordered extents tree is more efficient
    3277             :          * because it's usually much smaller as ordered extents are removed from
    3278             :          * the tree once they complete. With the extent maps, we may have them
    3279             :          * in the extent map tree for a very long time, and they were either
    3280             :          * created by previous writes or loaded by read operations.
    3281             :          */
    3282           0 :         oe = btrfs_lookup_first_ordered_range(inode, start, len);
    3283           0 :         if (!oe)
    3284           0 :                 return (delalloc_len > 0);
    3285             : 
    3286             :         /* The ordered extent may span beyond our search range. */
    3287           0 :         oe_start = max(oe->file_offset, start);
    3288           0 :         oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
    3289             : 
    3290           0 :         btrfs_put_ordered_extent(oe);
    3291             : 
    3292             :         /* Don't have unflushed delalloc, return the ordered extent range. */
    3293           0 :         if (delalloc_len == 0) {
    3294           0 :                 *delalloc_start_ret = oe_start;
    3295           0 :                 *delalloc_end_ret = oe_end;
    3296           0 :                 return true;
    3297             :         }
    3298             : 
    3299             :         /*
    3300             :          * We have both unflushed delalloc (io_tree) and an ordered extent.
    3301             :          * If the ranges are adjacent, return a combined range, otherwise
    3302             :          * return the leftmost range.
    3303             :          */
    3304           0 :         if (oe_start < *delalloc_start_ret) {
    3305           0 :                 if (oe_end < *delalloc_start_ret)
    3306           0 :                         *delalloc_end_ret = oe_end;
    3307           0 :                 *delalloc_start_ret = oe_start;
    3308           0 :         } else if (*delalloc_end_ret + 1 == oe_start) {
    3309           0 :                 *delalloc_end_ret = oe_end;
    3310             :         }
    3311             : 
    3312             :         return true;
    3313             : }
    3314             : 
    3315             : /*
    3316             :  * Check if there's delalloc in a given range.
    3317             :  *
    3318             :  * @inode:               The inode.
    3319             :  * @start:               The start offset of the range. It does not need to be
    3320             :  *                       sector size aligned.
    3321             :  * @end:                 The end offset (inclusive value) of the search range.
    3322             :  *                       It does not need to be sector size aligned.
    3323             :  * @cached_state:        Extent state record used for speeding up delalloc
    3324             :  *                       searches in the inode's io_tree. Can be NULL.
    3325             :  * @delalloc_start_ret:  Output argument, set to the start offset of the
    3326             :  *                       subrange found with delalloc (may not be sector size
    3327             :  *                       aligned).
    3328             :  * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
    3329             :  *                       of the subrange found with delalloc.
    3330             :  *
    3331             :  * Returns true if a subrange with delalloc is found within the given range, and
    3332             :  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
    3333             :  * end offsets of the subrange.
    3334             :  */
    3335           0 : bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
    3336             :                                   struct extent_state **cached_state,
    3337             :                                   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
    3338             : {
    3339           0 :         u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
    3340           0 :         u64 prev_delalloc_end = 0;
    3341           0 :         bool search_io_tree = true;
    3342           0 :         bool ret = false;
    3343             : 
    3344           0 :         while (cur_offset <= end) {
    3345           0 :                 u64 delalloc_start;
    3346           0 :                 u64 delalloc_end;
    3347           0 :                 bool delalloc;
    3348             : 
    3349           0 :                 delalloc = find_delalloc_subrange(inode, cur_offset, end,
    3350             :                                                   cached_state, &search_io_tree,
    3351             :                                                   &delalloc_start,
    3352             :                                                   &delalloc_end);
    3353           0 :                 if (!delalloc)
    3354             :                         break;
    3355             : 
    3356           0 :                 if (prev_delalloc_end == 0) {
    3357             :                         /* First subrange found; clamp its start to the caller's @start. */
    3358           0 :                         *delalloc_start_ret = max(delalloc_start, start);
    3359           0 :                         *delalloc_end_ret = delalloc_end;
    3360           0 :                         ret = true;
    3361           0 :                 } else if (delalloc_start == prev_delalloc_end + 1) {
    3362             :                         /* Subrange adjacent to the previous one, merge them. */
    3363           0 :                         *delalloc_end_ret = delalloc_end;
    3364             :                 } else {
    3365             :                         /* Subrange not adjacent to the previous one, exit. */
    3366             :                         break;
    3367             :                 }
    3368             : 
    3369           0 :                 prev_delalloc_end = delalloc_end;
    3370           0 :                 cur_offset = delalloc_end + 1;
    3371           0 :                 cond_resched();
    3372             :         }
    3373             : 
    3374           0 :         return ret;
    3375             : }
    3376             : 
    3377             : /*
    3378             :  * Check if there's a hole or delalloc range in a range representing a hole (or
    3379             :  * prealloc extent) found in the inode's subvolume btree.
    3380             :  *
    3381             :  * @inode:      The inode.
    3382             :  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
    3383             :  * @start:      Start offset of the hole region. It does not need to be sector
    3384             :  *              size aligned.
    3385             :  * @end:        End offset (inclusive value) of the hole region. It does not
    3386             :  *              need to be sector size aligned.
    3387             :  * @start_ret:  Return parameter, used to set the start of the subrange in the
    3388             :  *              hole that matches the search criteria (seek mode), if such
    3389             :  *              subrange is found (return value of the function is true).
    3390             :  *              The value returned here may not be sector size aligned.
    3391             :  *
    3392             :  * Returns true if a subrange matching the given seek mode is found, and if one
    3393             :  * is found, it updates @start_ret with the start of the subrange.
    3394             :  */
    3395           0 : static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
    3396             :                                         struct extent_state **cached_state,
    3397             :                                         u64 start, u64 end, u64 *start_ret)
    3398             : {
    3399           0 :         u64 delalloc_start;
    3400           0 :         u64 delalloc_end;
    3401           0 :         bool delalloc;
    3402             : 
    3403           0 :         delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
    3404             :                                                 &delalloc_start, &delalloc_end);
    3405           0 :         if (delalloc && whence == SEEK_DATA) {
    3406           0 :                 *start_ret = delalloc_start;
    3407           0 :                 return true;
    3408             :         }
    3409             : 
    3410           0 :         if (delalloc && whence == SEEK_HOLE) {
    3411             :                 /*
    3412             :                  * We found delalloc but it starts after our start offset. So we
    3413             :                  * have a hole between our start offset and the delalloc start.
    3414             :                  */
    3415           0 :                 if (start < delalloc_start) {
    3416           0 :                         *start_ret = start;
    3417           0 :                         return true;
    3418             :                 }
    3419             :                 /*
    3420             :                  * Delalloc range starts at our start offset.
    3421             :                  * If the delalloc range's length is smaller than our range,
    3422             :                  * then it means we have a hole that starts where the delalloc
    3423             :                  * subrange ends.
    3424             :                  */
    3425           0 :                 if (delalloc_end < end) {
    3426           0 :                         *start_ret = delalloc_end + 1;
    3427           0 :                         return true;
    3428             :                 }
    3429             : 
    3430             :                 /* There's delalloc for the whole range. */
    3431             :                 return false;
    3432             :         }
    3433             : 
    3434           0 :         if (!delalloc && whence == SEEK_HOLE) {
    3435           0 :                 *start_ret = start;
    3436           0 :                 return true;
    3437             :         }
    3438             : 
    3439             :         /*
    3440             :          * No delalloc in the range and we are seeking for data. The caller has
    3441             :          * to iterate to the next extent item in the subvolume btree.
    3442             :          */
    3443             :         return false;
    3444             : }
    3445             : 
    3446           0 : static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
    3447             : {
    3448           0 :         struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
    3449           0 :         struct btrfs_file_private *private = file->private_data;
    3450           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    3451           0 :         struct extent_state *cached_state = NULL;
    3452           0 :         struct extent_state **delalloc_cached_state;
    3453           0 :         const loff_t i_size = i_size_read(&inode->vfs_inode);
    3454           0 :         const u64 ino = btrfs_ino(inode);
    3455           0 :         struct btrfs_root *root = inode->root;
    3456           0 :         struct btrfs_path *path;
    3457           0 :         struct btrfs_key key;
    3458           0 :         u64 last_extent_end;
    3459           0 :         u64 lockstart;
    3460           0 :         u64 lockend;
    3461           0 :         u64 start;
    3462           0 :         int ret;
    3463           0 :         bool found = false;
    3464             : 
    3465           0 :         if (i_size == 0 || offset >= i_size)
    3466             :                 return -ENXIO;
    3467             : 
    3468             :         /*
    3469             :          * Quick path. If the inode has no prealloc extents and its number of
    3470             :          * bytes used matches its i_size, then it can not have holes.
    3471             :          */
    3472           0 :         if (whence == SEEK_HOLE &&
    3473           0 :             !(inode->flags & BTRFS_INODE_PREALLOC) &&
    3474           0 :             inode_get_bytes(&inode->vfs_inode) == i_size)
    3475             :                 return i_size;
    3476             : 
    3477           0 :         if (!private) {
    3478           0 :                 private = kzalloc(sizeof(*private), GFP_KERNEL);
    3479             :                 /*
    3480             :                  * No worries if memory allocation failed.
    3481             :                  * The private structure is used only for speeding up multiple
    3482             :                  * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
    3483             :                  * so everything will still be correct.
    3484             :                  */
    3485           0 :                 file->private_data = private;
    3486             :         }
    3487             : 
    3488           0 :         if (private)
    3489           0 :                 delalloc_cached_state = &private->llseek_cached_state;
    3490             :         else
    3491             :                 delalloc_cached_state = NULL;
    3492             : 
    3493             :         /*
    3494             :          * offset can be negative, in this case we start finding DATA/HOLE from
    3495             :          * the very start of the file.
    3496             :          */
    3497           0 :         start = max_t(loff_t, 0, offset);
    3498             : 
    3499           0 :         lockstart = round_down(start, fs_info->sectorsize);
    3500           0 :         lockend = round_up(i_size, fs_info->sectorsize);
    3501           0 :         if (lockend <= lockstart)
    3502           0 :                 lockend = lockstart + fs_info->sectorsize;
    3503           0 :         lockend--;
    3504             : 
    3505           0 :         path = btrfs_alloc_path();
    3506           0 :         if (!path)
    3507             :                 return -ENOMEM;
    3508           0 :         path->reada = READA_FORWARD;
    3509             : 
    3510           0 :         key.objectid = ino;
    3511           0 :         key.type = BTRFS_EXTENT_DATA_KEY;
    3512           0 :         key.offset = start;
    3513             : 
    3514           0 :         last_extent_end = lockstart;
    3515             : 
    3516           0 :         lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    3517             : 
    3518           0 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    3519           0 :         if (ret < 0) {
    3520           0 :                 goto out;
    3521           0 :         } else if (ret > 0 && path->slots[0] > 0) {
    3522           0 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
    3523           0 :                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
    3524           0 :                         path->slots[0]--;
    3525             :         }
    3526             : 
    3527           0 :         while (start < i_size) {
    3528           0 :                 struct extent_buffer *leaf = path->nodes[0];
    3529           0 :                 struct btrfs_file_extent_item *extent;
    3530           0 :                 u64 extent_end;
    3531           0 :                 u8 type;
    3532             : 
    3533           0 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    3534           0 :                         ret = btrfs_next_leaf(root, path);
    3535           0 :                         if (ret < 0)
    3536           0 :                                 goto out;
    3537           0 :                         else if (ret > 0)
    3538             :                                 break;
    3539             : 
    3540           0 :                         leaf = path->nodes[0];
    3541             :                 }
    3542             : 
    3543           0 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    3544           0 :                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
    3545             :                         break;
    3546             : 
    3547           0 :                 extent_end = btrfs_file_extent_end(path);
    3548             : 
    3549             :                 /*
    3550             :                  * In the first iteration we may have a slot that points to an
    3551             :                  * extent that ends before our start offset, so skip it.
    3552             :                  */
    3553           0 :                 if (extent_end <= start) {
    3554           0 :                         path->slots[0]++;
    3555           0 :                         continue;
    3556             :                 }
    3557             : 
    3558             :                 /* We have an implicit hole, NO_HOLES feature is likely set. */
    3559           0 :                 if (last_extent_end < key.offset) {
    3560           0 :                         u64 search_start = last_extent_end;
    3561           0 :                         u64 found_start;
    3562             : 
    3563             :                         /*
    3564             :                          * First iteration, @start matches @offset and it's
    3565             :                          * within the hole.
    3566             :                          */
    3567           0 :                         if (start == offset)
    3568           0 :                                 search_start = offset;
    3569             : 
    3570           0 :                         found = find_desired_extent_in_hole(inode, whence,
    3571             :                                                             delalloc_cached_state,
    3572             :                                                             search_start,
    3573             :                                                             key.offset - 1,
    3574             :                                                             &found_start);
    3575           0 :                         if (found) {
    3576           0 :                                 start = found_start;
    3577           0 :                                 break;
    3578             :                         }
    3579             :                         /*
    3580             :                          * Didn't find data or a hole (due to delalloc) in the
    3581             :                          * implicit hole range, so need to analyze the extent.
    3582             :                          */
    3583             :                 }
    3584             : 
    3585           0 :                 extent = btrfs_item_ptr(leaf, path->slots[0],
    3586             :                                         struct btrfs_file_extent_item);
    3587           0 :                 type = btrfs_file_extent_type(leaf, extent);
    3588             : 
    3589             :                 /*
    3590             :                  * Can't access the extent's disk_bytenr field if this is an
    3591             :                  * inline extent, since at that offset, it's where the extent
    3592             :                  * data starts.
    3593             :                  */
    3594           0 :                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
    3595           0 :                     (type == BTRFS_FILE_EXTENT_REG &&
    3596           0 :                      btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
    3597             :                         /*
    3598             :                          * Explicit hole or prealloc extent, search for delalloc.
    3599             :                          * A prealloc extent is treated like a hole.
    3600             :                          */
    3601           0 :                         u64 search_start = key.offset;
    3602           0 :                         u64 found_start;
    3603             : 
    3604             :                         /*
    3605             :                          * First iteration, @start matches @offset and it's
    3606             :                          * within the hole.
    3607             :                          */
    3608           0 :                         if (start == offset)
    3609           0 :                                 search_start = offset;
    3610             : 
    3611           0 :                         found = find_desired_extent_in_hole(inode, whence,
    3612             :                                                             delalloc_cached_state,
    3613             :                                                             search_start,
    3614             :                                                             extent_end - 1,
    3615             :                                                             &found_start);
    3616           0 :                         if (found) {
    3617           0 :                                 start = found_start;
    3618           0 :                                 break;
    3619             :                         }
    3620             :                         /*
    3621             :                          * Didn't find data or a hole (due to delalloc) in the
    3622             :                          * explicit hole or prealloc extent range, so need to
    3623             :                          * analyze the next extent item.
    3624             :                          */
    3625             :                 } else {
    3626             :                         /*
    3627             :                          * Found a regular or inline extent.
    3628             :                          * If we are seeking for data, adjust the start offset
    3629             :                          * and stop, we're done.
    3630             :                          */
    3631           0 :                         if (whence == SEEK_DATA) {
    3632           0 :                                 start = max_t(u64, key.offset, offset);
    3633           0 :                                 found = true;
    3634           0 :                                 break;
    3635             :                         }
    3636             :                         /*
    3637             :                          * Else, we are seeking for a hole, check the next file
    3638             :                          * extent item.
    3639             :                          */
    3640             :                 }
    3641             : 
    3642           0 :                 start = extent_end;
    3643           0 :                 last_extent_end = extent_end;
    3644           0 :                 path->slots[0]++;
    3645           0 :                 if (fatal_signal_pending(current)) {
    3646           0 :                         ret = -EINTR;
    3647           0 :                         goto out;
    3648             :                 }
    3649           0 :                 cond_resched();
    3650             :         }
    3651             : 
    3652             :         /* We have an implicit hole from the last extent found up to i_size. */
    3653           0 :         if (!found && start < i_size) {
    3654           0 :                 found = find_desired_extent_in_hole(inode, whence,
    3655             :                                                     delalloc_cached_state, start,
    3656             :                                                     i_size - 1, &start);
    3657           0 :                 if (!found)
    3658           0 :                         start = i_size;
    3659             :         }
    3660             : 
    3661           0 : out:
    3662           0 :         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
    3663           0 :         btrfs_free_path(path);
    3664             : 
    3665           0 :         if (ret < 0)
    3666           0 :                 return ret;
    3667             : 
    3668           0 :         if (whence == SEEK_DATA && start >= i_size)
    3669             :                 return -ENXIO;
    3670             : 
    3671           0 :         return min_t(loff_t, start, i_size);
    3672             : }
    3673             : /* llseek handler: SEEK_DATA/SEEK_HOLE go through find_desired_extent(), every other whence is handed to generic_file_llseek(). */
    3674           0 : static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
    3675             : {
    3676           0 :         struct inode *inode = file->f_mapping->host;
    3677             :         /* The default case returns directly; only SEEK_DATA/SEEK_HOLE fall out of the switch. */
    3678           0 :         switch (whence) {
    3679           0 :         default:
    3680           0 :                 return generic_file_llseek(file, offset, whence);
    3681             :         case SEEK_DATA:
    3682             :         case SEEK_HOLE:
    3683           0 :                 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3684           0 :                 offset = find_desired_extent(file, offset, whence);
    3685           0 :                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3686           0 :                 break;
    3687             :         }
    3688             :         /* A negative value from find_desired_extent() is an error code (e.g. -ENXIO); pass it through unchanged. */
    3689           0 :         if (offset < 0)
    3690             :                 return offset;
    3691             :         /* Otherwise the found offset is applied via vfs_setpos(), with the superblock's s_maxbytes as the limit argument. */
    3692           0 :         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    3693             : }
    3694             : /* open handler: sets capability bits in filp->f_mode, then returns fsverity_file_open()'s result if non-zero, else generic_file_open()'s. */
    3695           0 : static int btrfs_file_open(struct inode *inode, struct file *filp)
    3696             : {
    3697           0 :         int ret;
    3698             :         /* The f_mode bits are set before either open helper runs, so they stay set even if a helper fails. */
    3699           0 :         filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
    3700             :                         FMODE_CAN_ODIRECT;
    3701             :         /* A non-zero fsverity result ends the open without calling generic_file_open(). */
    3702           0 :         ret = fsverity_file_open(inode, filp);
    3703           0 :         if (ret)
    3704             :                 return ret;
    3705           0 :         return generic_file_open(inode, filp);
    3706             : }
    3707             : /* Validate an iov_iter for a direct IO read: returns check_direct_IO()'s negative result, -EINVAL if two iovec segments share an iov_base, else 0. */
    3708           0 : static int check_direct_read(struct btrfs_fs_info *fs_info,
    3709             :                              const struct iov_iter *iter, loff_t offset)
    3710             : {
    3711           0 :         int ret;
    3712           0 :         int i, seg;
    3713             :         /* Run the common direct IO check first; only negative results are propagated. */
    3714           0 :         ret = check_direct_IO(fs_info, iter, offset);
    3715           0 :         if (ret < 0)
    3716             :                 return ret;
    3717             :         /* The duplicate-base check below is applied to iovec-backed iterators only. */
    3718           0 :         if (!iter_is_iovec(iter))
    3719             :                 return 0;
    3720             :         /* Compare every pair of segments (seg < i); identical base addresses are rejected. */
    3721           0 :         for (seg = 0; seg < iter->nr_segs; seg++) {
    3722           0 :                 for (i = seg + 1; i < iter->nr_segs; i++) {
    3723           0 :                         const struct iovec *iov1 = iter_iov(iter) + seg;
    3724           0 :                         const struct iovec *iov2 = iter_iov(iter) + i;
    3725             : 
    3726           0 :                         if (iov1->iov_base == iov2->iov_base)
    3727             :                                 return -EINVAL;
    3728             :                 }
    3729             :         }
    3730             :         return 0;
    3731             : }
    3732             : /* Direct IO read with fault-in retries. Returns bytes read, a negative error, or 0 when nothing was read directly (the caller then uses filemap_read()). */
    3733           0 : static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
    3734             : {
    3735           0 :         struct inode *inode = file_inode(iocb->ki_filp);
    3736           0 :         size_t prev_left = 0;
    3737           0 :         ssize_t read = 0;
    3738           0 :         ssize_t ret;
    3739             :         /* No direct read is attempted when fsverity is active on the inode; 0 bytes are reported instead. */
    3740           0 :         if (fsverity_active(inode))
    3741             :                 return 0;
    3742             :         /* Any non-zero check result, including an error code, is turned into a 0-byte direct read rather than an error. */
    3743           0 :         if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
    3744             :                 return 0;
    3745             :         /* The shared inode lock is taken once, held across all retries, and dropped just before returning. */
    3746           0 :         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3747           0 : again:
    3748             :         /*
    3749             :          * This is similar to what we do for direct IO writes, see the comment
    3750             :          * at btrfs_direct_write(), but we also disable page faults in addition
    3751             :          * to disabling them only at the iov_iter level. This is because when
    3752             :          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
    3753             :          * which can still trigger page fault ins despite having set ->nofault
    3754             :          * to true of our 'to' iov_iter.
    3755             :          *
    3756             :          * The difference to direct IO writes is that we deadlock when trying
    3757             :          * to lock the extent range in the inode's tree during the page reads
    3758             :          * triggered by the fault in (while for writes it is due to waiting for
    3759             :          * our own ordered extent). This is because for direct IO reads,
    3760             :          * btrfs_dio_iomap_begin() returns with the extent range locked, which
    3761             :          * is only unlocked in the endio callback (end_bio_extent_readpage()).
    3762             :          */
    3763           0 :         pagefault_disable();
    3764           0 :         to->nofault = true;
    3765           0 :         ret = btrfs_dio_read(iocb, to, read);
    3766           0 :         to->nofault = false;
    3767           0 :         pagefault_enable();
    3768             : 
    3769             :         /* No increment (+=) because iomap returns a cumulative value. */
    3770           0 :         if (ret > 0)
    3771           0 :                 read = ret;
    3772             :         /* Retry only if data remains to be read and the attempt either hit -EFAULT or made partial progress. */
    3773           0 :         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
    3774           0 :                 const size_t left = iov_iter_count(to);
    3775             : 
    3776           0 :                 if (left == prev_left) {
    3777             :                         /*
    3778             :                          * We didn't make any progress since the last attempt,
    3779             :                          * fallback to a buffered read for the remainder of the
    3780             :                          * range. This is just to avoid any possibility of looping
    3781             :                          * for too long.
    3782             :                          */
    3783             :                         ret = read;
    3784             :                 } else {
    3785             :                         /*
    3786             :                          * We made some progress since the last retry or this is
    3787             :                          * the first time we are retrying. Fault in as many pages
    3788             :                          * as possible and retry.
    3789             :                          */
    3790           0 :                         fault_in_iov_iter_writeable(to, left);
    3791           0 :                         prev_left = left;
    3792           0 :                         goto again;
    3793             :                 }
    3794             :         }
    3795           0 :         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
    3796           0 :         return ret < 0 ? ret : read;
    3797             : }
    3798             : /* read_iter handler: for IOCB_DIRECT tries btrfs_direct_read() first, then hands anything left to filemap_read(). */
    3799           0 : static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
    3800             : {
    3801           0 :         ssize_t ret = 0;
    3802             :         /* Return right after the direct read on error, when 'to' is fully consumed, or when ki_pos is at or past i_size. */
    3803           0 :         if (iocb->ki_flags & IOCB_DIRECT) {
    3804           0 :                 ret = btrfs_direct_read(iocb, to);
    3805           0 :                 if (ret < 0 || !iov_iter_count(to) ||
    3806           0 :                     iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
    3807             :                         return ret;
    3808             :         }
    3809             :         /* 'ret' (bytes already read by the direct path, or 0) is passed as filemap_read()'s third argument. */
    3810           0 :         return filemap_read(iocb, to, ret);
    3811             : }
    3812             : /* struct file_operations table: btrfs handlers, plus the filemap/iter splice helpers and thp_get_unmapped_area. */
    3813             : const struct file_operations btrfs_file_operations = {
    3814             :         .llseek         = btrfs_file_llseek,
    3815             :         .read_iter      = btrfs_file_read_iter,
    3816             :         .splice_read    = filemap_splice_read,
    3817             :         .write_iter     = btrfs_file_write_iter,
    3818             :         .splice_write   = iter_file_splice_write,
    3819             :         .mmap           = btrfs_file_mmap,
    3820             :         .open           = btrfs_file_open,
    3821             :         .release        = btrfs_release_file,
    3822             :         .get_unmapped_area = thp_get_unmapped_area,
    3823             :         .fsync          = btrfs_sync_file,
    3824             :         .fallocate      = btrfs_fallocate,
    3825             :         .unlocked_ioctl = btrfs_ioctl,
    3826             : #ifdef CONFIG_COMPAT
    3827             :         .compat_ioctl   = btrfs_compat_ioctl,
    3828             : #endif
    3829             :         .remap_file_range = btrfs_remap_file_range,
    3830             : };
    3831             : /* Start writeback on the inode's mapping for the given start/end, repeating the call if the inode has async extents. Returns the last call's result. */
    3832           0 : int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
    3833             : {
    3834           0 :         int ret;
    3835             : 
    3836             :         /*
    3837             :          * So with compression we will find and lock a dirty page and clear the
    3838             :          * first one as dirty, setup an async extent, and immediately return
    3839             :          * with the entire range locked but with nobody actually marked with
    3840             :          * writeback.  So we can't just filemap_write_and_wait_range() and
    3841             :          * expect it to work since it will just kick off a thread to do the
    3842             :          * actual work.  So we need to call filemap_fdatawrite_range _again_
    3843             :          * since it will wait on the page lock, which won't be unlocked until
    3844             :          * after the pages have been marked as writeback and so we're good to go
    3845             :          * from there.  We have to do this otherwise we'll miss the ordered
    3846             :          * extents and that results in badness.  Please Josef, do not think you
    3847             :          * know better and pull this out at some point in the future, it is
    3848             :          * right and you are wrong.
    3849             :          */
    3850           0 :         ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
    3851           0 :         if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
    3852             :                              &BTRFS_I(inode)->runtime_flags))
    3853           0 :                 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
    3854             :         /* The second call runs only if the first returned 0 and BTRFS_INODE_HAS_ASYNC_EXTENT is set in runtime_flags. */
    3855           0 :         return ret;
    3856             : }

Generated by: LCOV version 1.14