Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 2007 Oracle. All rights reserved.
4 : */
5 :
6 : #include <linux/slab.h>
7 : #include <linux/blkdev.h>
8 : #include <linux/writeback.h>
9 : #include <linux/sched/mm.h>
10 : #include "messages.h"
11 : #include "misc.h"
12 : #include "ctree.h"
13 : #include "transaction.h"
14 : #include "btrfs_inode.h"
15 : #include "extent_io.h"
16 : #include "disk-io.h"
17 : #include "compression.h"
18 : #include "delalloc-space.h"
19 : #include "qgroup.h"
20 : #include "subpage.h"
21 : #include "file.h"
22 : #include "super.h"
23 :
24 : static struct kmem_cache *btrfs_ordered_extent_cache;
25 :
26 : static u64 entry_end(struct btrfs_ordered_extent *entry)
27 : {
28 7082313 : if (entry->file_offset + entry->num_bytes < entry->file_offset)
29 0 : return (u64)-1;
30 : return entry->file_offset + entry->num_bytes;
31 : }
32 :
33 : /* Returns NULL if the insertion worked, or the existing overlapping
34 : * node it found in the tree.
35 : */
36 3507033 : static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
37 : struct rb_node *node)
38 : {
39 3507033 : struct rb_node **p = &root->rb_node;
40 3507033 : struct rb_node *parent = NULL;
41 3507033 : struct btrfs_ordered_extent *entry;
42 :
43 9230367 : while (*p) {
44 5723334 : parent = *p;
45 5723334 : entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
46 :
47 5723334 : if (file_offset < entry->file_offset)
48 480639 : p = &(*p)->rb_left;
49 5242695 : else if (file_offset >= entry_end(entry))
50 5242695 : p = &(*p)->rb_right;
51 : else
52 0 : return parent;
53 : }
54 :
55 3507033 : rb_link_node(node, parent, p);
56 3507033 : rb_insert_color(node, root);
57 3507033 : return NULL;
58 : }
59 :
60 : /*
61 : * Look for a given offset in the tree; if no entry contains it, return
62 : * NULL and point *prev_ret at the nearest entry instead.
63 : */
64 102412390 : static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
65 : struct rb_node **prev_ret)
66 : {
67 102412390 : struct rb_node *n = root->rb_node;
68 102412390 : struct rb_node *prev = NULL;
69 102412390 : struct rb_node *test;
70 102412390 : struct btrfs_ordered_extent *entry;
71 102412390 : struct btrfs_ordered_extent *prev_entry = NULL;
72 :
73 110740198 : while (n) {
74 10820174 : entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
75 10820174 : prev = n;
76 10820174 : prev_entry = entry;
77 :
78 10820174 : if (file_offset < entry->file_offset)
79 2614441 : n = n->rb_left;
80 8205733 : else if (file_offset >= entry_end(entry))
81 5713367 : n = n->rb_right;
82 : else
83 2492366 : return n;
84 : }
85 99920024 : if (!prev_ret)
86 : return NULL;
87 :
88 103354692 : while (prev && file_offset >= entry_end(prev_entry)) {
89 1577993 : test = rb_next(prev);
90 1580121 : if (!test)
91 : break;
92 149613 : prev_entry = rb_entry(test, struct btrfs_ordered_extent,
93 : rb_node);
94 149613 : if (file_offset < entry_end(prev_entry))
95 : break;
96 :
97 : prev = test;
98 : }
99 99922152 : if (prev)
100 3438345 : prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
101 : rb_node);
102 103773274 : while (prev && file_offset < entry_end(prev_entry)) {
103 1863891 : test = rb_prev(prev);
104 1856661 : if (!test)
105 : break;
106 203472 : prev_entry = rb_entry(test, struct btrfs_ordered_extent,
107 : rb_node);
108 203472 : prev = test;
109 : }
110 99914922 : *prev_ret = prev;
111 99914922 : return NULL;
112 : }
113 :
114 : static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
115 : u64 len)
116 : {
117 9219634 : if (file_offset + len <= entry->file_offset ||
118 3023003 : entry->file_offset + entry->num_bytes <= file_offset)
119 2100748 : return 0;
120 : return 1;
121 : }
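/*
 * Worked example (illustrative, not part of the original file): with
 * entry->file_offset = 4096 and entry->num_bytes = 4096, the entry covers
 * the half-open range [4096, 8192). A query of file_offset = 8192,
 * len = 4096 does not overlap, since 4096 + 4096 <= 8192 triggers the
 * second condition; a query of file_offset = 8191, len = 1 does overlap,
 * since neither end-exclusive condition holds.
 */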
122 :
123 : /*
124 : * Look for the first ordered struct that contains this offset; otherwise
125 : * return the first one starting below this offset.
126 : */
127 103405767 : static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
128 : u64 file_offset)
129 : {
130 103405767 : struct rb_root *root = &tree->tree;
131 103405767 : struct rb_node *prev = NULL;
132 103405767 : struct rb_node *ret;
133 103405767 : struct btrfs_ordered_extent *entry;
134 :
135 103405767 : if (tree->last) {
136 4734295 : entry = rb_entry(tree->last, struct btrfs_ordered_extent,
137 : rb_node);
138 4734295 : if (in_range(file_offset, entry->file_offset, entry->num_bytes))
139 : return tree->last;
140 : }
141 102481728 : ret = __tree_search(root, file_offset, &prev);
142 102412683 : if (!ret)
143 99913756 : ret = prev;
144 102412683 : if (ret)
145 5928725 : tree->last = ret;
146 : return ret;
147 : }
148 :
149 3503918 : static struct btrfs_ordered_extent *alloc_ordered_extent(
150 : struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
151 : u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
152 : u64 offset, unsigned long flags, int compress_type)
153 : {
154 3503918 : struct btrfs_ordered_extent *entry;
155 3503918 : int ret;
156 :
157 3503918 : if (flags &
158 : ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
159 : /* For nocow write, we can release the qgroup rsv right now */
160 291453 : ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
161 291453 : if (ret < 0)
162 0 : return ERR_PTR(ret);
163 : } else {
164 : /*
165 : * The ordered extent has reserved qgroup space; release it now
166 : * and pass the reserved number on for qgroup_record to free.
167 : */
168 3212465 : ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
169 3212289 : if (ret < 0)
170 0 : return ERR_PTR(ret);
171 : }
172 3503742 : entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
173 3504110 : if (!entry)
174 : return ERR_PTR(-ENOMEM);
175 :
176 3504110 : entry->file_offset = file_offset;
177 3504110 : entry->num_bytes = num_bytes;
178 3504110 : entry->ram_bytes = ram_bytes;
179 3504110 : entry->disk_bytenr = disk_bytenr;
180 3504110 : entry->disk_num_bytes = disk_num_bytes;
181 3504110 : entry->offset = offset;
182 3504110 : entry->bytes_left = num_bytes;
183 3504110 : entry->inode = igrab(&inode->vfs_inode);
184 3504203 : entry->compress_type = compress_type;
185 3504203 : entry->truncated_len = (u64)-1;
186 3504203 : entry->qgroup_rsv = ret;
187 3504203 : entry->flags = flags;
188 3504203 : refcount_set(&entry->refs, 1);
189 3504203 : init_waitqueue_head(&entry->wait);
190 3504125 : INIT_LIST_HEAD(&entry->list);
191 3504125 : INIT_LIST_HEAD(&entry->log_list);
192 3504125 : INIT_LIST_HEAD(&entry->root_extent_list);
193 3504125 : INIT_LIST_HEAD(&entry->work_list);
194 3504125 : init_completion(&entry->completion);
195 :
196 : /*
197 : * We don't need the count_max_extents here; we can assume that all of
198 : * that work has been done at higher layers, so this is truly the
199 : * smallest the extent is going to get.
200 : */
201 3504106 : spin_lock(&inode->lock);
202 3504106 : btrfs_mod_outstanding_extents(inode, 1);
203 3503866 : spin_unlock(&inode->lock);
204 :
205 3503866 : return entry;
206 : }
207 :
208 3501119 : static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
209 : {
210 3501119 : struct btrfs_inode *inode = BTRFS_I(entry->inode);
211 3501119 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
212 3501119 : struct btrfs_root *root = inode->root;
213 3501119 : struct btrfs_fs_info *fs_info = root->fs_info;
214 3501119 : struct rb_node *node;
215 :
216 3501119 : trace_btrfs_ordered_extent_add(inode, entry);
217 :
218 3501044 : percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
219 : fs_info->delalloc_batch);
220 :
221 : /* One ref for the tree. */
222 3501126 : refcount_inc(&entry->refs);
223 :
224 3501189 : spin_lock_irq(&tree->lock);
225 3501211 : node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
226 3501063 : if (node)
227 0 : btrfs_panic(fs_info, -EEXIST,
228 : "inconsistency in ordered tree at offset %llu",
229 : entry->file_offset);
230 3501063 : spin_unlock_irq(&tree->lock);
231 :
232 3501193 : spin_lock(&root->ordered_extent_lock);
233 3501317 : list_add_tail(&entry->root_extent_list,
234 : &root->ordered_extents);
235 3501302 : root->nr_ordered_extents++;
236 3501302 : if (root->nr_ordered_extents == 1) {
237 1224777 : spin_lock(&fs_info->ordered_root_lock);
238 1224813 : BUG_ON(!list_empty(&root->ordered_root));
239 1224813 : list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
240 1224813 : spin_unlock(&fs_info->ordered_root_lock);
241 : }
242 3501337 : spin_unlock(&root->ordered_extent_lock);
243 3501351 : }
244 :
245 : /*
246 : * Add an ordered extent to the per-inode tree.
247 : *
248 : * @inode: Inode that this extent is for.
249 : * @file_offset: Logical offset in file where the extent starts.
250 : * @num_bytes: Logical length of extent in file.
251 : * @ram_bytes: Full length of unencoded data.
252 : * @disk_bytenr: Offset of extent on disk.
253 : * @disk_num_bytes: Size of extent on disk.
254 : * @offset: Offset into unencoded data where file data starts.
255 : * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
256 : * @compress_type: Compression algorithm used for data.
257 : *
258 : * Most of these parameters correspond to &struct btrfs_file_extent_item. The
259 : * tree is given a single reference on the ordered extent that was inserted, and
260 : * the returned pointer is given a second reference.
261 : *
262 : * Return: the new ordered extent or error pointer.
263 : */
264 3501148 : struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
265 : struct btrfs_inode *inode, u64 file_offset,
266 : u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
267 : u64 disk_num_bytes, u64 offset, unsigned long flags,
268 : int compress_type)
269 : {
270 3501148 : struct btrfs_ordered_extent *entry;
271 :
272 3501148 : ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
273 :
274 3501148 : entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
275 : disk_bytenr, disk_num_bytes, offset, flags,
276 : compress_type);
277 3501163 : if (!IS_ERR(entry))
278 3501163 : insert_ordered_extent(entry);
279 3501351 : return entry;
280 : }
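/*
 * Usage sketch (hypothetical helper, illustrative only): for a plain
 * uncompressed, non-prealloc write the extent maps 1:1 between file and
 * disk space, so num_bytes == ram_bytes == disk_num_bytes and offset == 0.
 */
static struct btrfs_ordered_extent *example_add_regular_extent(
		struct btrfs_inode *inode, u64 start, u64 len, u64 bytenr)
{
	/* Exactly one type flag is required; REGULAR marks a plain COW write. */
	return btrfs_alloc_ordered_extent(inode, start, len, len, bytenr,
					  len, 0,
					  (1 << BTRFS_ORDERED_REGULAR),
					  BTRFS_COMPRESS_NONE);
}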
281 :
282 : /*
283 : * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
284 : * when an ordered extent is finished. If the list covers more than one
285 : * ordered extent, it is split across multiple ordered extents.
286 : */
287 3555414 : void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
288 : struct btrfs_ordered_sum *sum)
289 : {
290 3555414 : struct btrfs_ordered_inode_tree *tree;
291 :
292 3555414 : tree = &BTRFS_I(entry->inode)->ordered_tree;
293 3555414 : spin_lock_irq(&tree->lock);
294 3555857 : list_add_tail(&sum->list, &entry->list);
295 3555558 : spin_unlock_irq(&tree->lock);
296 3555771 : }
297 :
298 3504053 : static void finish_ordered_fn(struct btrfs_work *work)
299 : {
300 3504053 : struct btrfs_ordered_extent *ordered_extent;
301 :
302 3504053 : ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
303 3504053 : btrfs_finish_ordered_io(ordered_extent);
304 3503628 : }
305 :
306 51253053 : static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
307 : struct page *page, u64 file_offset,
308 : u64 len, bool uptodate)
309 : {
310 51253053 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
311 51253053 : struct btrfs_fs_info *fs_info = inode->root->fs_info;
312 :
313 51253053 : lockdep_assert_held(&inode->ordered_tree.lock);
314 :
315 51253053 : if (page) {
316 50234756 : ASSERT(page->mapping);
317 50234756 : ASSERT(page_offset(page) <= file_offset);
318 50234756 : ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
319 :
320 : /*
321 : * The Ordered (Private2) bit indicates whether we still have
322 : * pending IO unfinished for the ordered extent.
323 : *
324 : * If there's no such bit, we need to skip to the next range.
325 : */
326 50234756 : if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
327 : return false;
328 50234756 : btrfs_page_clear_ordered(fs_info, page, file_offset, len);
329 : }
330 :
331 : /* Now we're fine to update the accounting. */
332 51253053 : if (WARN_ON_ONCE(len > ordered->bytes_left)) {
333 0 : btrfs_crit(fs_info,
334 : "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
335 : inode->root->root_key.objectid, btrfs_ino(inode),
336 : ordered->file_offset, ordered->num_bytes,
337 : len, ordered->bytes_left);
338 0 : ordered->bytes_left = 0;
339 : } else {
340 51253053 : ordered->bytes_left -= len;
341 : }
342 :
343 51253053 : if (!uptodate)
344 129553 : set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
345 :
346 51253053 : if (ordered->bytes_left)
347 : return false;
348 :
349 : /*
350 : * All the IO of the ordered extent is finished; we need to queue
351 : * finish_ordered_fn() to be executed.
352 : */
353 3504078 : set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
354 3504078 : cond_wake_up(&ordered->wait);
355 3504078 : refcount_inc(&ordered->refs);
356 3504078 : trace_btrfs_ordered_extent_mark_finished(inode, ordered);
357 3504078 : return true;
358 : }
359 :
360 3504078 : static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
361 : {
362 3504078 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
363 3504078 : struct btrfs_fs_info *fs_info = inode->root->fs_info;
364 3504078 : struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
365 3504078 : fs_info->endio_freespace_worker : fs_info->endio_write_workers;
366 :
367 3504078 : btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
368 3504070 : btrfs_queue_work(wq, &ordered->work);
369 3504078 : }
370 :
371 51253051 : bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
372 : struct page *page, u64 file_offset, u64 len,
373 : bool uptodate)
374 : {
375 51253051 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
376 51253051 : unsigned long flags;
377 51253051 : bool ret;
378 :
379 51253051 : trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
380 :
381 51253051 : spin_lock_irqsave(&inode->ordered_tree.lock, flags);
382 51253053 : ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
383 51253053 : spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
384 :
385 51253053 : if (ret)
386 3504078 : btrfs_queue_ordered_fn(ordered);
387 51253053 : return ret;
388 : }
389 :
390 : /*
391 : * Mark all ordered extents io inside the specified range finished.
392 : *
393 : * @page: The involved page for the operation.
394 : * For uncompressed buffered IO, the page status also needs to be
395 : * updated to indicate whether the pending ordered IO is finished.
396 : * Can be NULL for direct IO and compressed writes. For these
397 : * cases, the callers guarantee that the endio function won't be
398 : * executed twice for the same range.
399 : *
400 : * This function is called for endio, thus the range must have ordered
401 : * extent(s) covering it.
402 : */
403 0 : void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
404 : struct page *page, u64 file_offset,
405 : u64 num_bytes, bool uptodate)
406 : {
407 0 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
408 0 : struct rb_node *node;
409 0 : struct btrfs_ordered_extent *entry = NULL;
410 0 : unsigned long flags;
411 0 : u64 cur = file_offset;
412 :
413 0 : spin_lock_irqsave(&tree->lock, flags);
414 0 : while (cur < file_offset + num_bytes) {
415 0 : u64 entry_end;
416 0 : u64 end;
417 0 : u32 len;
418 :
419 0 : node = tree_search(tree, cur);
420 : /* No ordered extents at all */
421 0 : if (!node)
422 : break;
423 :
424 0 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
425 0 : entry_end = entry->file_offset + entry->num_bytes;
426 : /*
427 : * |<-- OE --->| |
428 : * cur
429 : * Go to next OE.
430 : */
431 0 : if (cur >= entry_end) {
432 0 : node = rb_next(node);
433 : /* No more ordered extents, exit */
434 0 : if (!node)
435 : break;
436 0 : entry = rb_entry(node, struct btrfs_ordered_extent,
437 : rb_node);
438 :
439 : /* Go to next ordered extent and continue */
440 0 : cur = entry->file_offset;
441 0 : continue;
442 : }
443 : /*
444 : * | |<--- OE --->|
445 : * cur
446 : * Go to the start of OE.
447 : */
448 0 : if (cur < entry->file_offset) {
449 0 : cur = entry->file_offset;
450 0 : continue;
451 : }
452 :
453 : /*
454 : * Now we are definitely inside one ordered extent.
455 : *
456 : * |<--- OE --->|
457 : * |
458 : * cur
459 : */
460 0 : end = min(entry->file_offset + entry->num_bytes,
461 : file_offset + num_bytes) - 1;
462 0 : ASSERT(end + 1 - cur < U32_MAX);
463 0 : len = end + 1 - cur;
464 :
465 0 : if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
466 0 : spin_unlock_irqrestore(&tree->lock, flags);
467 0 : btrfs_queue_ordered_fn(entry);
468 0 : spin_lock_irqsave(&tree->lock, flags);
469 : }
470 0 : cur += len;
471 : }
472 0 : spin_unlock_irqrestore(&tree->lock, flags);
473 0 : }
474 :
475 : /*
476 : * Finish IO for one ordered extent across a given range. The range can only
477 : * contain one ordered extent.
478 : *
479 : * @cached: The cached ordered extent. If not NULL, we can skip the tree
480 : * search and use the ordered extent directly.
481 : * Will be also used to store the finished ordered extent.
482 : * @file_offset: File offset for the finished IO
483 : * @io_size: Length of the finish IO range
484 : *
485 : * Return true if the ordered extent is finished in the range, and update
486 : * @cached.
487 : * Return false otherwise.
488 : *
489 : * NOTE: The range can NOT cross multiple ordered extents, so the
490 : * caller must ensure the range stays within a single ordered extent.
491 : */
492 12465 : bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
493 : struct btrfs_ordered_extent **cached,
494 : u64 file_offset, u64 io_size)
495 : {
496 12465 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
497 12465 : struct rb_node *node;
498 12465 : struct btrfs_ordered_extent *entry = NULL;
499 12465 : unsigned long flags;
500 12465 : bool finished = false;
501 :
502 12465 : spin_lock_irqsave(&tree->lock, flags);
503 12465 : if (cached && *cached) {
504 12465 : entry = *cached;
505 12465 : goto have_entry;
506 : }
507 :
508 0 : node = tree_search(tree, file_offset);
509 0 : if (!node)
510 0 : goto out;
511 :
512 0 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
513 12465 : have_entry:
514 12465 : if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
515 0 : goto out;
516 :
517 12465 : if (io_size > entry->bytes_left)
518 0 : btrfs_crit(inode->root->fs_info,
519 : "bad ordered accounting left %llu size %llu",
520 : entry->bytes_left, io_size);
521 :
522 12465 : entry->bytes_left -= io_size;
523 :
524 12465 : if (entry->bytes_left == 0) {
525 : /*
526 : * Ensure only one caller can set the flag and update 'finished'
527 : * accordingly.
528 : */
529 226 : finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
530 : /* test_and_set_bit implies a barrier */
531 226 : cond_wake_up_nomb(&entry->wait);
532 : }
533 12239 : out:
534 12465 : if (finished && cached && entry) {
535 226 : *cached = entry;
536 226 : refcount_inc(&entry->refs);
537 226 : trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
538 : }
539 12465 : spin_unlock_irqrestore(&tree->lock, flags);
540 12465 : return finished;
541 : }
542 :
543 : /*
544 : * Drop a reference on an ordered extent. This will free
545 : * the extent if the last reference is dropped.
546 : */
547 16731714 : void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
548 : {
549 16731714 : struct list_head *cur;
550 16731714 : struct btrfs_ordered_sum *sum;
551 :
552 16731714 : trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
553 :
554 16731196 : if (refcount_dec_and_test(&entry->refs)) {
555 3503930 : ASSERT(list_empty(&entry->root_extent_list));
556 3503930 : ASSERT(list_empty(&entry->log_list));
557 3503930 : ASSERT(RB_EMPTY_NODE(&entry->rb_node));
558 3503930 : if (entry->inode)
559 3503930 : btrfs_add_delayed_iput(BTRFS_I(entry->inode));
560 7059823 : while (!list_empty(&entry->list)) {
561 3556148 : cur = entry->list.next;
562 3556148 : sum = list_entry(cur, struct btrfs_ordered_sum, list);
563 3556148 : list_del(&sum->list);
564 3556005 : kvfree(sum);
565 : }
566 3503675 : kmem_cache_free(btrfs_ordered_extent_cache, entry);
567 : }
568 16732035 : }
569 :
570 : /*
571 : * Remove an ordered extent from the tree. No references are dropped,
572 : * but waiters are woken up.
573 : */
574 3503706 : void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
575 : struct btrfs_ordered_extent *entry)
576 : {
577 3503706 : struct btrfs_ordered_inode_tree *tree;
578 3503706 : struct btrfs_root *root = btrfs_inode->root;
579 3503706 : struct btrfs_fs_info *fs_info = root->fs_info;
580 3503706 : struct rb_node *node;
581 3503706 : bool pending;
582 3503706 : bool freespace_inode;
583 :
584 : /*
585 : * If this is a free space inode the thread has not acquired the ordered
586 : * extents lockdep map.
587 : */
588 3503706 : freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
589 :
590 3503706 : btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
591 : /* This is paired with btrfs_alloc_ordered_extent. */
592 3503706 : spin_lock(&btrfs_inode->lock);
593 3504117 : btrfs_mod_outstanding_extents(btrfs_inode, -1);
594 3503738 : spin_unlock(&btrfs_inode->lock);
595 3504018 : if (root != fs_info->tree_root) {
596 3503978 : u64 release;
597 :
598 7007956 : if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
599 0 : release = entry->disk_num_bytes;
600 : else
601 3503978 : release = entry->num_bytes;
602 3503978 : btrfs_delalloc_release_metadata(btrfs_inode, release, false);
603 : }
604 :
605 3503390 : percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
606 : fs_info->delalloc_batch);
607 :
608 3503532 : tree = &btrfs_inode->ordered_tree;
609 3503532 : spin_lock_irq(&tree->lock);
610 3503964 : node = &entry->rb_node;
611 3503964 : rb_erase(node, &tree->tree);
612 3503877 : RB_CLEAR_NODE(node);
613 3503877 : if (tree->last == node)
614 2118779 : tree->last = NULL;
615 3503877 : set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
616 3503998 : pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
617 3504030 : spin_unlock_irq(&tree->lock);
618 :
619 : /*
620 : * The current running transaction is waiting on us; we need to let it
621 : * know that we're complete and wake it up.
622 : */
623 3504058 : if (pending) {
624 7416 : struct btrfs_transaction *trans;
625 :
626 : /*
627 : * The checks for trans are just a formality: it should be set,
628 : * but if it isn't we don't want to deref/assert under the spin
629 : * lock, so be nice and check if trans is set, but ASSERT() so
630 : * that a developer will notice if it isn't.
631 : */
632 7416 : spin_lock(&fs_info->trans_lock);
633 7416 : trans = fs_info->running_transaction;
634 7416 : if (trans)
635 7416 : refcount_inc(&trans->use_count);
636 7416 : spin_unlock(&fs_info->trans_lock);
637 :
638 7416 : ASSERT(trans);
639 7416 : if (trans) {
640 7416 : if (atomic_dec_and_test(&trans->pending_ordered))
641 935 : wake_up(&trans->pending_wait);
642 7416 : btrfs_put_transaction(trans);
643 : }
644 : }
645 :
646 3504058 : btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
647 :
648 3504058 : spin_lock(&root->ordered_extent_lock);
649 3504297 : list_del_init(&entry->root_extent_list);
650 3504300 : root->nr_ordered_extents--;
651 :
652 3504300 : trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
653 :
654 3504298 : if (!root->nr_ordered_extents) {
655 1224811 : spin_lock(&fs_info->ordered_root_lock);
656 1224813 : BUG_ON(list_empty(&root->ordered_root));
657 1224813 : list_del_init(&root->ordered_root);
658 1224813 : spin_unlock(&fs_info->ordered_root_lock);
659 : }
660 3504300 : spin_unlock(&root->ordered_extent_lock);
661 3504293 : wake_up(&entry->wait);
662 3504227 : if (!freespace_inode)
663 3504227 : btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
664 3504227 : }
665 :
666 93756 : static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
667 : {
668 93756 : struct btrfs_ordered_extent *ordered;
669 :
670 93756 : ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
671 93756 : btrfs_start_ordered_extent(ordered);
672 93711 : complete(&ordered->completion);
673 93727 : }
674 :
675 : /*
676 : * Wait for all the ordered extents in a root. This is done when balancing
677 : * space between drives.
678 : */
679 21178 : u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
680 : const u64 range_start, const u64 range_len)
681 : {
682 21178 : struct btrfs_fs_info *fs_info = root->fs_info;
683 21178 : LIST_HEAD(splice);
684 21178 : LIST_HEAD(skipped);
685 21178 : LIST_HEAD(works);
686 21178 : struct btrfs_ordered_extent *ordered, *next;
687 21178 : u64 count = 0;
688 21178 : const u64 range_end = range_start + range_len;
689 :
690 21178 : mutex_lock(&root->ordered_extent_mutex);
691 21180 : spin_lock(&root->ordered_extent_lock);
692 21179 : list_splice_init(&root->ordered_extents, &splice);
693 115064 : while (!list_empty(&splice) && nr) {
694 93884 : ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
695 : root_extent_list);
696 :
697 93884 : if (range_end <= ordered->disk_bytenr ||
698 93805 : ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
699 129 : list_move_tail(&ordered->root_extent_list, &skipped);
700 129 : cond_resched_lock(&root->ordered_extent_lock);
701 129 : continue;
702 : }
703 :
704 93755 : list_move_tail(&ordered->root_extent_list,
705 : &root->ordered_extents);
706 93755 : refcount_inc(&ordered->refs);
707 93756 : spin_unlock(&root->ordered_extent_lock);
708 :
709 93756 : btrfs_init_work(&ordered->flush_work,
710 : btrfs_run_ordered_extent_work, NULL, NULL);
711 93756 : list_add_tail(&ordered->work_list, &works);
712 93756 : btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
713 :
714 93756 : cond_resched();
715 93756 : spin_lock(&root->ordered_extent_lock);
716 93756 : if (nr != U64_MAX)
717 28691 : nr--;
718 93756 : count++;
719 : }
720 21180 : list_splice_tail(&skipped, &root->ordered_extents);
721 21180 : list_splice_tail(&splice, &root->ordered_extents);
722 21180 : spin_unlock(&root->ordered_extent_lock);
723 :
724 114935 : list_for_each_entry_safe(ordered, next, &works, work_list) {
725 93756 : list_del_init(&ordered->work_list);
726 93756 : wait_for_completion(&ordered->completion);
727 93756 : btrfs_put_ordered_extent(ordered);
728 93756 : cond_resched();
729 : }
730 21179 : mutex_unlock(&root->ordered_extent_mutex);
731 :
732 21181 : return count;
733 : }
734 :
735 39752 : void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
736 : const u64 range_start, const u64 range_len)
737 : {
738 39752 : struct btrfs_root *root;
739 39752 : struct list_head splice;
740 39752 : u64 done;
741 :
742 39752 : INIT_LIST_HEAD(&splice);
743 :
744 39752 : mutex_lock(&fs_info->ordered_operations_mutex);
745 39759 : spin_lock(&fs_info->ordered_root_lock);
746 39758 : list_splice_init(&fs_info->ordered_roots, &splice);
747 53878 : while (!list_empty(&splice) && nr) {
748 14120 : root = list_first_entry(&splice, struct btrfs_root,
749 : ordered_root);
750 14120 : root = btrfs_grab_root(root);
751 14120 : BUG_ON(!root);
752 14120 : list_move_tail(&root->ordered_root,
753 : &fs_info->ordered_roots);
754 14120 : spin_unlock(&fs_info->ordered_root_lock);
755 :
756 14120 : done = btrfs_wait_ordered_extents(root, nr,
757 : range_start, range_len);
758 14120 : btrfs_put_root(root);
759 :
760 14120 : spin_lock(&fs_info->ordered_root_lock);
761 14120 : if (nr != U64_MAX) {
762 1335 : nr -= done;
763 : }
764 : }
765 39758 : list_splice_tail(&splice, &fs_info->ordered_roots);
766 39758 : spin_unlock(&fs_info->ordered_root_lock);
767 39759 : mutex_unlock(&fs_info->ordered_operations_mutex);
768 39759 : }
769 :
770 : /*
771 : * Start IO and wait for a given ordered extent to finish.
772 : *
773 : * Wait on page writeback for all the pages in the extent and the IO completion
774 : * code to insert metadata into the btree corresponding to the extent.
775 : */
776 314744 : void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
777 : {
778 314744 : u64 start = entry->file_offset;
779 314744 : u64 end = start + entry->num_bytes - 1;
780 314744 : struct btrfs_inode *inode = BTRFS_I(entry->inode);
781 314744 : bool freespace_inode;
782 :
783 314744 : trace_btrfs_ordered_extent_start(inode, entry);
784 :
785 : /*
786 : * If this is a free space inode do not take the ordered extents lockdep
787 : * map.
788 : */
789 314743 : freespace_inode = btrfs_is_free_space_inode(inode);
790 :
791 : /*
792 : * Pages in the range can be dirty, clean or under writeback. We
793 : * start IO on any dirty ones so the wait doesn't stall waiting
794 : * for the flusher thread to find them.
795 : */
796 314743 : if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
797 279382 : filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
798 :
799 314707 : if (!freespace_inode)
800 314707 : btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
801 1194313 : wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
802 314690 : }
803 :
804 : /*
805 : * Used to wait on ordered extents across a large range of bytes.
806 : */
807 8599330 : int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
808 : {
809 8599330 : int ret = 0;
810 8599330 : int ret_wb = 0;
811 8599330 : u64 end;
812 8599330 : u64 orig_end;
813 8599330 : struct btrfs_ordered_extent *ordered;
814 :
815 8599330 : if (start + len < start) {
816 : orig_end = OFFSET_MAX;
817 : } else {
818 8434613 : orig_end = start + len - 1;
819 8434613 : if (orig_end > OFFSET_MAX)
820 : orig_end = OFFSET_MAX;
821 : }
822 :
823 : /* Start IO across the range first to instantiate any delalloc
824 : * extents.
825 : */
826 8599330 : ret = btrfs_fdatawrite_range(inode, start, orig_end);
827 8599843 : if (ret)
828 : return ret;
829 :
830 : /*
831 : * If we have a writeback error don't return immediately. Wait first
832 : * for any ordered extents that haven't completed yet. This is to make
833 : * sure no one can dirty the same page ranges and call writepages()
834 : * before the ordered extents complete - to avoid failures (-EEXIST)
835 : * when adding the new ordered extents to the ordered tree.
836 : */
837 8599872 : ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
838 :
839 8599872 : end = orig_end;
840 8650679 : while (1) {
841 8625046 : ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
842 8625303 : if (!ordered)
843 : break;
844 1722859 : if (ordered->file_offset > orig_end) {
845 1204458 : btrfs_put_ordered_extent(ordered);
846 1204458 : break;
847 : }
848 518401 : if (ordered->file_offset + ordered->num_bytes <= start) {
849 322089 : btrfs_put_ordered_extent(ordered);
850 322089 : break;
851 : }
852 196312 : btrfs_start_ordered_extent(ordered);
853 196311 : end = ordered->file_offset;
854 : /*
855 : * If the ordered extent had an error, save the error but don't
856 : * exit without waiting first for all other ordered extents in
857 : * the range to complete.
858 : */
859 392622 : if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
860 1 : ret = -EIO;
861 196311 : btrfs_put_ordered_extent(ordered);
862 196312 : if (end == 0 || end == start)
863 : break;
864 25633 : end--;
865 : }
866 8599670 : return ret_wb ? ret_wb : ret;
867 : }
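/*
 * Usage sketch (illustrative): fsync-style callers typically flush and wait
 * on the whole file by passing len = (u64)-1; the orig_end > OFFSET_MAX
 * check above clamps the end of the range for them.
 */
static int example_wait_whole_file(struct inode *inode)
{
	return btrfs_wait_ordered_range(inode, 0, (u64)-1);
}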
868 :
869 : /*
870 : * Find an ordered extent corresponding to file_offset. Return NULL if
871 : * nothing is found; otherwise take a reference on the extent and return it.
872 : */
873 4490639 : struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
874 : u64 file_offset)
875 : {
876 4490639 : struct btrfs_ordered_inode_tree *tree;
877 4490639 : struct rb_node *node;
878 4490639 : struct btrfs_ordered_extent *entry = NULL;
879 4490639 : unsigned long flags;
880 :
881 4490639 : tree = &inode->ordered_tree;
882 4490639 : spin_lock_irqsave(&tree->lock, flags);
883 4501285 : node = tree_search(tree, file_offset);
884 4494555 : if (!node)
885 1901365 : goto out;
886 :
887 2593190 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
888 2593190 : if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
889 : entry = NULL;
890 2539873 : if (entry) {
891 2539873 : refcount_inc(&entry->refs);
892 2539980 : trace_btrfs_ordered_extent_lookup(inode, entry);
893 : }
894 0 : out:
895 4494629 : spin_unlock_irqrestore(&tree->lock, flags);
896 4503168 : return entry;
897 : }
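/*
 * Usage sketch (hypothetical caller): a successful lookup returns the entry
 * with an extra reference taken, so it must always be balanced with
 * btrfs_put_ordered_extent().
 */
static bool example_offset_has_ordered_io(struct btrfs_inode *inode, u64 offset)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return false;
	btrfs_put_ordered_extent(ordered);	/* drop the lookup reference */
	return true;
}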
898 :
899 : /* Since the DIO code tries to lock a wide area, we need to look for any
900 : * ordered extents that exist in the range, rather than just at its start.
901 : */
902 44480363 : struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
903 : struct btrfs_inode *inode, u64 file_offset, u64 len)
904 : {
905 44480363 : struct btrfs_ordered_inode_tree *tree;
906 44480363 : struct rb_node *node;
907 44480363 : struct btrfs_ordered_extent *entry = NULL;
908 :
909 44480363 : tree = &inode->ordered_tree;
910 44480363 : spin_lock_irq(&tree->lock);
911 44590934 : node = tree_search(tree, file_offset);
912 44505591 : if (!node) {
913 41941844 : node = tree_search(tree, file_offset + len);
914 41945778 : if (!node)
915 41972813 : goto out;
916 : }
917 :
918 2808423 : while (1) {
919 2808423 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
920 2808423 : if (range_overlaps(entry, file_offset, len))
921 : break;
922 :
923 2100748 : if (entry->file_offset >= file_offset + len) {
924 : entry = NULL;
925 : break;
926 : }
927 1401780 : entry = NULL;
928 1401780 : node = rb_next(node);
929 1401957 : if (!node)
930 : break;
931 : }
932 1130246 : out:
933 44509702 : if (entry) {
934 707742 : refcount_inc(&entry->refs);
935 707742 : trace_btrfs_ordered_extent_lookup_range(inode, entry);
936 : }
937 44509702 : spin_unlock_irq(&tree->lock);
938 44574917 : return entry;
939 : }
940 :
941 : /*
942 : * Adds all ordered extents to the given list. The list ends up sorted by the
943 : * file_offset of the ordered extents.
944 : */
945 140380 : void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
946 : struct list_head *list)
947 : {
948 140380 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
949 140380 : struct rb_node *n;
950 :
951 140380 : ASSERT(inode_is_locked(&inode->vfs_inode));
952 :
953 140380 : spin_lock_irq(&tree->lock);
954 410409 : for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
955 270028 : struct btrfs_ordered_extent *ordered;
956 :
957 270028 : ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
958 :
959 540056 : if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
960 64455 : continue;
961 :
962 205573 : ASSERT(list_empty(&ordered->log_list));
963 205573 : list_add_tail(&ordered->log_list, list);
964 205568 : refcount_inc(&ordered->refs);
965 205578 : trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
966 : }
967 140386 : spin_unlock_irq(&tree->lock);
968 140395 : }
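/*
 * Usage sketch (hypothetical consumer): each list entry holds a reference
 * taken above, so whoever drains the list must put every entry.
 */
static void example_drain_logging_list(struct list_head *list)
{
	struct btrfs_ordered_extent *ordered, *tmp;

	list_for_each_entry_safe(ordered, tmp, list, log_list) {
		list_del_init(&ordered->log_list);
		btrfs_put_ordered_extent(ordered);
	}
}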
969 :
970 : /*
971 : * Look up and return any ordered extent before 'file_offset'. NULL is
972 : * returned if none is found.
973 : */
974 : struct btrfs_ordered_extent *
975 12490288 : btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
976 : {
977 12490288 : struct btrfs_ordered_inode_tree *tree;
978 12490288 : struct rb_node *node;
979 12490288 : struct btrfs_ordered_extent *entry = NULL;
980 :
981 12490288 : tree = &inode->ordered_tree;
982 12490288 : spin_lock_irq(&tree->lock);
983 12490952 : node = tree_search(tree, file_offset);
984 12490199 : if (!node)
985 10767342 : goto out;
986 :
987 1722857 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
988 1722857 : refcount_inc(&entry->refs);
989 1722858 : trace_btrfs_ordered_extent_lookup_first(inode, entry);
990 12490200 : out:
991 12490200 : spin_unlock_irq(&tree->lock);
992 12490738 : return entry;
993 : }
994 :
995 : /*
996 : * Lookup the first ordered extent that overlaps the range
997 : * [@file_offset, @file_offset + @len).
998 : *
999 : * Unlike btrfs_lookup_first_ordered_extent(), this one won't return any
1000 : * ordered extent that does not overlap the range. And unlike
1001 : * btrfs_lookup_ordered_extent(), this function ensures the first
1002 : * overlapping ordered extent gets returned.
1003 : */
1004 85805559 : struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
1005 : struct btrfs_inode *inode, u64 file_offset, u64 len)
1006 : {
1007 85805559 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
1008 85805559 : struct rb_node *node;
1009 85805559 : struct rb_node *cur;
1010 85805559 : struct rb_node *prev;
1011 85805559 : struct rb_node *next;
1012 85805559 : struct btrfs_ordered_extent *entry = NULL;
1013 :
1014 85805559 : spin_lock_irq(&tree->lock);
1015 85810067 : node = tree->tree.rb_node;
1016 : /*
1017 : * Here we don't want to use tree_search(), which would use tree->last
1018 : * and screw up the search order.
1019 : * __tree_search() can't return the adjacent ordered extents either,
1020 : * thus we do our own search here.
1021 : */
1022 89808667 : while (node) {
1023 4092758 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1024 :
1025 4092758 : if (file_offset < entry->file_offset) {
1026 2816738 : node = node->rb_left;
1027 1276020 : } else if (file_offset >= entry_end(entry)) {
1028 1181862 : node = node->rb_right;
1029 : } else {
1030 : /*
1031 : * Direct hit, got an ordered extent that starts at
1032 : * @file_offset
1033 : */
1034 94158 : goto out;
1035 : }
1036 : }
1037 85715909 : if (!entry) {
1038 : /* Empty tree */
1039 82529551 : goto out;
1040 : }
1041 :
1042 3186358 : cur = &entry->rb_node;
1043 : /* We got an entry around @file_offset, check adjacent entries */
1044 3186358 : if (entry->file_offset < file_offset) {
1045 784418 : prev = cur;
1046 784418 : next = rb_next(cur);
1047 : } else {
1048 2401940 : prev = rb_prev(cur);
1049 2401940 : next = cur;
1050 : }
1051 3186358 : if (prev) {
1052 913375 : entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
1053 913375 : if (range_overlaps(entry, file_offset, len))
1054 0 : goto out;
1055 : }
1056 3186358 : if (next) {
1057 2474833 : entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
1058 2474833 : if (range_overlaps(entry, file_offset, len))
1059 100 : goto out;
1060 : }
1061 : /* No ordered extent in the range */
1062 : entry = NULL;
1063 82623809 : out:
1064 82623809 : if (entry) {
1065 95315 : refcount_inc(&entry->refs);
1066 94258 : trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
1067 : }
1068 :
1069 85809010 : spin_unlock_irq(&tree->lock);
1070 85808903 : return entry;
1071 : }
1072 :
1073 : /*
1074 : * Lock the passed range and ensures all pending ordered extents in it are run
1075 : * to completion.
1076 : *
1077 : * @inode: Inode whose ordered tree is to be searched
1078 : * @start: Beginning of range to flush
1079 : * @end: Last byte of range to lock
1080 : * @cached_state: If passed, will return the extent state responsible for the
1081 : * locked range. It's the caller's responsibility to free the
1082 : * cached state.
1083 : *
1084 : * Always return with the given range locked, ensuring that after it's
1085 : * called no ordered extent can be pending.
1086 : */
1087 19238090 : void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1088 : u64 end,
1089 : struct extent_state **cached_state)
1090 : {
1091 19238090 : struct btrfs_ordered_extent *ordered;
1092 19238090 : struct extent_state *cache = NULL;
1093 19238090 : struct extent_state **cachedp = &cache;
1094 :
1095 19238090 : if (cached_state)
1096 1658940 : cachedp = cached_state;
1097 :
1098 19258185 : while (1) {
1099 19248138 : lock_extent(&inode->io_tree, start, end, cachedp);
1100 19213850 : ordered = btrfs_lookup_ordered_range(inode, start,
1101 19213850 : end - start + 1);
1102 19215798 : if (!ordered) {
1103 : /*
1104 : * If no external cached_state has been passed then
1105 : * decrement the extra ref taken for cachedp since we
1106 : * aren't exposing it outside of this function
1107 : */
1108 19205749 : if (!cached_state)
1109 17569727 : refcount_dec(&cache->refs);
1110 19219058 : break;
1111 : }
1112 10049 : unlock_extent(&inode->io_tree, start, end, cachedp);
1113 10049 : btrfs_start_ordered_extent(ordered);
1114 10047 : btrfs_put_ordered_extent(ordered);
1115 : }
1116 19219058 : }
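/*
 * Usage sketch (hypothetical reader): lock the range so no new ordered
 * extent can appear in it, do the work, then unlock with the cached state.
 */
static void example_read_locked(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached);
	/* ... safely inspect the page cache or extent maps here ... */
	unlock_extent(&inode->io_tree, start, end, &cached);
}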
1117 :
1118 : /*
1119 : * Lock the passed range and ensure all pending ordered extents in it are run
1120 : * to completion in nowait mode.
1121 : *
1122 : * Return true if btrfs_lock_ordered_range does not return any extents,
1123 : * otherwise false.
1124 : */
1125 0 : bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
1126 : struct extent_state **cached_state)
1127 : {
1128 0 : struct btrfs_ordered_extent *ordered;
1129 :
1130 0 : if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
1131 : return false;
1132 :
1133 0 : ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
1134 0 : if (!ordered)
1135 : return true;
1136 :
1137 0 : btrfs_put_ordered_extent(ordered);
1138 0 : unlock_extent(&inode->io_tree, start, end, cached_state);
1139 :
1140 0 : return false;
1141 : }
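/*
 * Usage sketch (hypothetical nowait path): try the non-blocking variant
 * first and fall back to the blocking one when waiting is allowed.
 */
static int example_lock_range(struct btrfs_inode *inode, u64 start, u64 end,
			      bool nowait, struct extent_state **cached)
{
	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, start, end, cached))
			return -EAGAIN;
		return 0;
	}
	btrfs_lock_and_flush_ordered_range(inode, start, end, cached);
	return 0;
}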
1142 :
1143 : /* Split out a new ordered extent for the first @len bytes of @ordered. */
1144 2950 : struct btrfs_ordered_extent *btrfs_split_ordered_extent(
1145 : struct btrfs_ordered_extent *ordered, u64 len)
1146 : {
1147 2950 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1148 2950 : struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
1149 2950 : struct btrfs_root *root = inode->root;
1150 2950 : struct btrfs_fs_info *fs_info = root->fs_info;
1151 2950 : u64 file_offset = ordered->file_offset;
1152 2950 : u64 disk_bytenr = ordered->disk_bytenr;
1153 2950 : unsigned long flags = ordered->flags;
1154 2950 : struct btrfs_ordered_sum *sum, *tmpsum;
1155 2950 : struct btrfs_ordered_extent *new;
1156 2950 : struct rb_node *node;
1157 2950 : u64 offset = 0;
1158 :
1159 2950 : trace_btrfs_ordered_extent_split(inode, ordered);
1160 :
1161 2950 : ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
1162 :
1163 : /*
1164 : * The entire bio must be covered by the ordered extent, but we can't
1165 : * reduce the original extent to a zero length either.
1166 : */
1167 2950 : if (WARN_ON_ONCE(len >= ordered->num_bytes))
1168 : return ERR_PTR(-EINVAL);
1169 : /* We cannot split partially completed ordered extents. */
1170 2950 : if (ordered->bytes_left) {
1171 2950 : ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
1172 2950 : if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
1173 : return ERR_PTR(-EINVAL);
1174 : }
1175 : /* We cannot split a compressed ordered extent. */
1176 2950 : if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
1177 : return ERR_PTR(-EINVAL);
1178 :
1179 2950 : new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
1180 : len, 0, flags, ordered->compress_type);
1181 2950 : if (IS_ERR(new))
1182 : return new;
1183 :
1184 : /* One ref for the tree. */
1185 2950 : refcount_inc(&new->refs);
1186 :
1187 2950 : spin_lock_irq(&root->ordered_extent_lock);
1188 2950 : spin_lock(&tree->lock);
1189 : /* Remove from tree once */
1190 2950 : node = &ordered->rb_node;
1191 2950 : rb_erase(node, &tree->tree);
1192 2950 : RB_CLEAR_NODE(node);
1193 2950 : if (tree->last == node)
1194 0 : tree->last = NULL;
1195 :
1196 2950 : ordered->file_offset += len;
1197 2950 : ordered->disk_bytenr += len;
1198 2950 : ordered->num_bytes -= len;
1199 2950 : ordered->disk_num_bytes -= len;
1200 :
1201 5900 : if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
1202 0 : ASSERT(ordered->bytes_left == 0);
1203 0 : new->bytes_left = 0;
1204 : } else {
1205 2950 : ordered->bytes_left -= len;
1206 : }
1207 :
1208 5900 : if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
1209 0 : if (ordered->truncated_len > len) {
1210 0 : ordered->truncated_len -= len;
1211 : } else {
1212 0 : new->truncated_len = ordered->truncated_len;
1213 0 : ordered->truncated_len = 0;
1214 : }
1215 : }
1216 :
1217 2950 : list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
1218 0 : if (offset == len)
1219 : break;
1220 0 : list_move_tail(&sum->list, &new->list);
1221 0 : offset += sum->len;
1222 : }
1223 :
1224 : /* Re-insert the node */
1225 2950 : node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
1226 2950 : if (node)
1227 0 : btrfs_panic(fs_info, -EEXIST,
1228 : "zoned: inconsistency in ordered tree at offset %llu",
1229 : ordered->file_offset);
1230 :
1231 2950 : node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
1232 2950 : if (node)
1233 0 : btrfs_panic(fs_info, -EEXIST,
1234 : "zoned: inconsistency in ordered tree at offset %llu",
1235 : new->file_offset);
1236 2950 : spin_unlock(&tree->lock);
1237 :
1238 2950 : list_add_tail(&new->root_extent_list, &root->ordered_extents);
1239 2950 : root->nr_ordered_extents++;
1240 2950 : spin_unlock_irq(&root->ordered_extent_lock);
1241 2950 : return new;
1242 : }
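/*
 * Usage sketch (hypothetical, zoned-style writer): carve off the first len
 * bytes before submitting a partial bio. The returned extent covers
 * [file_offset, file_offset + len) and the caller owns one reference to it.
 */
static int example_split_front(struct btrfs_ordered_extent *ordered, u64 len)
{
	struct btrfs_ordered_extent *new;

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	/* ... attach 'new' to the bio being submitted ... */
	btrfs_put_ordered_extent(new);	/* drop the caller's reference when done */
	return 0;
}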
1243 :
1244 11 : int __init ordered_data_init(void)
1245 : {
1246 11 : btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1247 : sizeof(struct btrfs_ordered_extent), 0,
1248 : SLAB_MEM_SPREAD,
1249 : NULL);
1250 11 : if (!btrfs_ordered_extent_cache)
1251 0 : return -ENOMEM;
1252 :
1253 : return 0;
1254 : }
1255 :
1256 0 : void __cold ordered_data_exit(void)
1257 : {
1258 0 : kmem_cache_destroy(btrfs_ordered_extent_cache);
1259 0 : }